Use a Closed-Form Policy to Play BreakoutNoFrameskip-v4

In [1]:
import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import gym

# Route INFO-and-above log records to stdout with an HH:MM:SS timestamp and
# the level name, so every message appears in the cell output.
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
# Create the environment, then log every attribute of the environment object
# followed by every attribute of its spec, for reference.
env = gym.make('BreakoutNoFrameskip-v4')
for described in (env, env.spec):
    for key, value in vars(described).items():
        logging.info('%s: %s', key, value)
00:03:37 [INFO] env: <AtariEnv<BreakoutNoFrameskip-v4>>
00:03:37 [INFO] action_space: Discrete(4)
00:03:37 [INFO] observation_space: Box(0, 255, (210, 160, 3), uint8)
00:03:37 [INFO] reward_range: (-inf, inf)
00:03:37 [INFO] metadata: {'render.modes': ['human', 'rgb_array']}
00:03:37 [INFO] _max_episode_steps: 400000
00:03:37 [INFO] _elapsed_steps: None
00:03:37 [INFO] id: BreakoutNoFrameskip-v4
00:03:37 [INFO] entry_point: gym.envs.atari:AtariEnv
00:03:37 [INFO] reward_threshold: None
00:03:37 [INFO] nondeterministic: False
00:03:37 [INFO] max_episode_steps: 400000
00:03:37 [INFO] _kwargs: {'game': 'breakout', 'obs_type': 'image', 'frameskip': 1}
00:03:37 [INFO] _env_name: BreakoutNoFrameskip
In [3]:
def calc_mean(locs, value=float('nan')):
    """Return the mean position of the nonzero entries of ``locs``.

    Args:
        locs: array whose nonzero (truthy) entries mark positions of
            interest; indices are taken along the first axis.
        value: fallback returned unchanged when ``locs`` has no nonzero
            entry. Defaults to NaN.

    Returns:
        The average of the first-axis indices of the nonzero entries, or
        ``value`` when there are none.
    """
    hits = np.nonzero(locs)[0]
    return np.nanmean(hits) if hits.size else value


class ClosedFormAgent:
    """Hand-written (non-learning) controller for Breakout.

    Each step locates the paddle and the ball in the current frame, estimates
    their per-frame velocities from the previous frame, extrapolates the
    ball's straight-line path down to the paddle row and steers the paddle
    towards that intercept.

    The agent keeps the previously observed paddle/ball positions as state;
    ``reset`` re-initialises them at the start of an episode.
    """

    # Action indices of the Discrete(4) Breakout action space. Per the ALE
    # Breakout documentation, 1 is FIRE: it leaves the paddle in place and
    # (re)launches the ball, which is why it serves as the "no move" action.
    ACTION_FIRE = 1
    ACTION_RIGHT = 2
    ACTION_LEFT = 3

    def __init__(self, _):
        """Create the agent; the positional argument (the env) is unused."""
        # Initialise tracking state here as well, so that calling step()
        # before reset() does not fail with AttributeError.
        self.reset()

    def reset(self, mode=None):
        """Reset the remembered positions. ``mode`` is accepted but unused."""
        # Coordinates are relative to the cropped play field used in step():
        # 72 is the horizontal centre of its 144 columns, 95 its height.
        self.pad_x = 72.
        self.ball_x = 72.
        self.ball_y = 95.

    def step(self, observation, reward, terminated):
        """Choose an action for the current frame.

        Args:
            observation: frame indexed as ``[row, column, channel]``; only
                rows 95-189, columns 8-151 of channel 0 are inspected.
            reward: unused; accepted for interface compatibility.
            terminated: unused; accepted for interface compatibility.

        Returns:
            int: ``ACTION_RIGHT`` (2), ``ACTION_LEFT`` (3) or ``ACTION_FIRE``
            (1) when no movement is wanted or the positions are unknown.
        """
        # Crop the play field and flip it vertically so that row 0 is the
        # bottom row of the crop and y grows upwards. Pixels whose channel-0
        # value is exactly 200 are treated as paddle/ball
        # (presumably that is their colour on screen -- TODO confirm).
        pixels = np.flipud(observation[95:190, 8:152, 0]) == 200
        pad_x = calc_mean(pixels[0])  # row 0 is taken as the paddle row
        ball_x = calc_mean(pixels[1:].any(axis=0))
        ball_y = calc_mean(pixels[1:].any(axis=1)) + 1.  # +1: row 0 skipped

        # Per-frame velocities from the difference to the previous frame.
        pad_xv = pad_x - self.pad_x
        ball_xv = ball_x - self.ball_x
        ball_yv = ball_y - self.ball_y

        # Extrapolate the ball's line to y == 0; abs() mirrors a negative
        # intercept back into the field. When the ball's row did not change,
        # ball_yv is 0 and the division yields inf/NaN: those values are
        # handled by the comparisons below, so only the NumPy RuntimeWarning
        # is suppressed here -- the numeric result is unchanged.
        with np.errstate(divide='ignore', invalid='ignore'):
            target_x = abs(ball_x - ball_xv / ball_yv * ball_y)

        # Expected paddle position: half a frame of its current motion plus
        # a little Gaussian noise (standard deviation 1/3).
        pred_x = pad_x + pad_xv / 2. + np.random.randn() / 3.

        # Move only when more than one pixel off target and while staying at
        # least 5 columns inside the crop. Every comparison involving NaN
        # (paddle or ball not visible) is False, so that case falls through
        # to FIRE.
        if pred_x < target_x - 1 and pred_x + 5. < pixels.shape[1]:
            action = self.ACTION_RIGHT
        elif pred_x > target_x + 1 and pred_x - 5. >= 0:
            action = self.ACTION_LEFT
        else:
            action = self.ACTION_FIRE

        # Remember this frame's positions for the next velocity estimate.
        self.pad_x = pad_x
        self.ball_x = ball_x
        self.ball_y = ball_y
        return action

    def close(self):
        """Nothing to release; present for interface compatibility."""
        pass


# Instantiate the rule-based agent; the constructor ignores its argument.
agent = ClosedFormAgent(env)
In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env`` and report its outcome.

    The agent is consulted once per frame, including once more on the final
    (terminated or truncated) observation, before the loop exits; the action
    returned for that final observation is discarded.

    Args:
        env: environment whose ``reset(seed=...)`` returns
            ``(observation, info)`` and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, terminated)`` and ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called once per loop pass.

    Returns:
        tuple: ``(episode_reward, elapsed_steps)`` -- the sum of rewards and
        the number of ``env.step`` calls performed.
    """
    observation, _ = env.reset(seed=seed)
    reward = 0.
    terminated = truncated = False
    agent.reset(mode=mode)

    episode_reward = 0.
    # The counter equals the number of environment steps completed so far.
    for elapsed_steps in itertools.count():
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward

    agent.close()
    return episode_reward, elapsed_steps


# Evaluate the agent over 100 episodes and log the reward and length of each.
# No seed is passed to play_episode, so env.reset receives seed=None.
# NOTE: this cell is slow -- in the recorded output below the timestamps run
# from 00:03:37 to 06:18:26, and many episodes only stop at the
# 400000-step limit reported as max_episode_steps in the env spec.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
# Summary: mean and standard deviation (np.std default, i.e. ddof=0) of the
# per-episode rewards.
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
00:03:37 [INFO] ==== test ====
00:15:32 [INFO] test episode 0: reward = 397.00, steps = 400000
00:16:12 [INFO] test episode 1: reward = 860.00, steps = 22312
00:27:02 [INFO] test episode 2: reward = 864.00, steps = 400000
00:27:51 [INFO] test episode 3: reward = 864.00, steps = 27045
00:28:42 [INFO] test episode 4: reward = 864.00, steps = 28393
00:30:24 [INFO] test episode 5: reward = 801.00, steps = 57369
00:41:15 [INFO] test episode 6: reward = 428.00, steps = 400000
00:42:26 [INFO] test episode 7: reward = 864.00, steps = 39369
00:53:12 [INFO] test episode 8: reward = 864.00, steps = 400000
00:54:33 [INFO] test episode 9: reward = 848.00, steps = 46194
00:55:14 [INFO] test episode 10: reward = 824.00, steps = 23012
00:55:44 [INFO] test episode 11: reward = 669.00, steps = 16171
00:56:15 [INFO] test episode 12: reward = 745.00, steps = 17462
00:57:22 [INFO] test episode 13: reward = 629.00, steps = 37797
00:58:10 [INFO] test episode 14: reward = 864.00, steps = 27505
00:59:23 [INFO] test episode 15: reward = 864.00, steps = 42170
01:10:13 [INFO] test episode 16: reward = 856.00, steps = 400000
01:11:47 [INFO] test episode 17: reward = 453.00, steps = 53991
01:13:15 [INFO] test episode 18: reward = 843.00, steps = 50266
01:13:37 [INFO] test episode 19: reward = 432.00, steps = 12492
01:14:09 [INFO] test episode 20: reward = 810.00, steps = 18044
01:24:57 [INFO] test episode 21: reward = 863.00, steps = 400000
01:25:09 [INFO] test episode 22: reward = 215.00, steps = 6719
01:25:26 [INFO] test episode 23: reward = 386.00, steps = 9549
01:27:16 [INFO] test episode 24: reward = 864.00, steps = 63149
01:28:45 [INFO] test episode 25: reward = 864.00, steps = 51313
01:38:44 [INFO] test episode 26: reward = 864.00, steps = 400000
01:39:12 [INFO] test episode 27: reward = 431.00, steps = 16731
01:39:39 [INFO] test episode 28: reward = 548.00, steps = 16400
01:40:09 [INFO] test episode 29: reward = 843.00, steps = 18426
01:50:03 [INFO] test episode 30: reward = 864.00, steps = 400000
01:59:52 [INFO] test episode 31: reward = 864.00, steps = 400000
02:00:53 [INFO] test episode 32: reward = 848.00, steps = 37361
02:02:06 [INFO] test episode 33: reward = 859.00, steps = 45441
02:03:28 [INFO] test episode 34: reward = 864.00, steps = 50338
02:03:59 [INFO] test episode 35: reward = 476.00, steps = 19107
02:04:36 [INFO] test episode 36: reward = 425.00, steps = 22636
02:05:49 [INFO] test episode 37: reward = 847.00, steps = 44566
02:15:37 [INFO] test episode 38: reward = 864.00, steps = 400000
02:25:35 [INFO] test episode 39: reward = 856.00, steps = 400000
02:35:31 [INFO] test episode 40: reward = 856.00, steps = 400000
02:35:58 [INFO] test episode 41: reward = 406.00, steps = 16452
02:36:49 [INFO] test episode 42: reward = 728.00, steps = 31606
02:38:02 [INFO] test episode 43: reward = 864.00, steps = 45497
02:39:42 [INFO] test episode 44: reward = 424.00, steps = 61532
02:49:38 [INFO] test episode 45: reward = 850.00, steps = 400000
02:59:29 [INFO] test episode 46: reward = 864.00, steps = 400000
03:00:29 [INFO] test episode 47: reward = 438.00, steps = 37438
03:01:17 [INFO] test episode 48: reward = 853.00, steps = 29429
03:11:15 [INFO] test episode 49: reward = 424.00, steps = 400000
03:21:22 [INFO] test episode 50: reward = 425.00, steps = 400000
03:22:48 [INFO] test episode 51: reward = 864.00, steps = 53779
03:23:11 [INFO] test episode 52: reward = 453.00, steps = 14042
03:24:04 [INFO] test episode 53: reward = 450.00, steps = 33371
03:25:30 [INFO] test episode 54: reward = 856.00, steps = 53673
03:26:28 [INFO] test episode 55: reward = 864.00, steps = 35714
03:27:17 [INFO] test episode 56: reward = 425.00, steps = 29852
03:27:54 [INFO] test episode 57: reward = 864.00, steps = 22913
03:28:49 [INFO] test episode 58: reward = 835.00, steps = 34279
03:38:48 [INFO] test episode 59: reward = 831.00, steps = 400000
03:48:36 [INFO] test episode 60: reward = 864.00, steps = 400000
03:58:58 [INFO] test episode 61: reward = 615.00, steps = 400000
03:59:46 [INFO] test episode 62: reward = 864.00, steps = 29846
04:00:34 [INFO] test episode 63: reward = 619.00, steps = 29569
04:01:43 [INFO] test episode 64: reward = 864.00, steps = 43296
04:02:29 [INFO] test episode 65: reward = 771.00, steps = 28955
04:03:41 [INFO] test episode 66: reward = 532.00, steps = 45177
04:04:21 [INFO] test episode 67: reward = 864.00, steps = 24223
04:14:12 [INFO] test episode 68: reward = 428.00, steps = 400000
04:14:50 [INFO] test episode 69: reward = 856.00, steps = 24051
04:15:35 [INFO] test episode 70: reward = 442.00, steps = 28101
04:16:10 [INFO] test episode 71: reward = 782.00, steps = 21433
04:16:46 [INFO] test episode 72: reward = 807.00, steps = 22304
04:26:37 [INFO] test episode 73: reward = 431.00, steps = 400000
04:27:37 [INFO] test episode 74: reward = 857.00, steps = 37242
04:37:22 [INFO] test episode 75: reward = 864.00, steps = 400000
04:47:07 [INFO] test episode 76: reward = 864.00, steps = 400000
04:48:11 [INFO] test episode 77: reward = 551.00, steps = 40579
04:48:37 [INFO] test episode 78: reward = 504.00, steps = 15823
04:49:10 [INFO] test episode 79: reward = 840.00, steps = 20024
04:49:46 [INFO] test episode 80: reward = 738.00, steps = 22279
04:51:01 [INFO] test episode 81: reward = 864.00, steps = 46719
05:00:49 [INFO] test episode 82: reward = 428.00, steps = 400000
05:10:40 [INFO] test episode 83: reward = 860.00, steps = 400000
05:12:03 [INFO] test episode 84: reward = 864.00, steps = 52588
05:12:38 [INFO] test episode 85: reward = 857.00, steps = 21521
05:13:16 [INFO] test episode 86: reward = 436.00, steps = 23530
05:14:00 [INFO] test episode 87: reward = 860.00, steps = 26924
05:23:50 [INFO] test episode 88: reward = 428.00, steps = 400000
05:24:14 [INFO] test episode 89: reward = 415.00, steps = 14829
05:34:04 [INFO] test episode 90: reward = 864.00, steps = 400000
05:34:32 [INFO] test episode 91: reward = 455.00, steps = 17489
05:36:00 [INFO] test episode 92: reward = 863.00, steps = 53499
05:37:25 [INFO] test episode 93: reward = 864.00, steps = 53216
05:47:26 [INFO] test episode 94: reward = 836.00, steps = 400000
05:57:22 [INFO] test episode 95: reward = 852.00, steps = 400000
06:07:16 [INFO] test episode 96: reward = 428.00, steps = 400000
06:17:03 [INFO] test episode 97: reward = 864.00, steps = 400000
06:17:54 [INFO] test episode 98: reward = 853.00, steps = 31289
06:18:26 [INFO] test episode 99: reward = 809.00, steps = 19906
06:18:26 [INFO] average episode reward = 715.19 ± 191.04
In [5]:
# Release the environment's resources now that evaluation is finished.
env.close()