Training a Neural Network ATARI Pong agent with Policy Gradients from raw pixels
#! /usr/bin/env python2.7
# coding=utf-8
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import cPickle as pickle
import gym

# hyperparameters
H = 200  # number of hidden layer neurons
batch_size = 10  # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99  # discount factor for reward
decay_rate = 0.99  # decay factor for RMSProp leaky sum of grad^2
resume = False  # resume from previous checkpoint?
render = True

# model initialization
D = 80 * 80  # input dimensionality: 80x80 grid
if resume:
    model = pickle.load(open('save.p', 'rb'))
else:
    model = {}
    model['W1'] = np.random.randn(H, D) / np.sqrt(D)  # "Xavier" initialization
    model['W2'] = np.random.randn(H) / np.sqrt(H)

grad_buffer = {k: np.zeros_like(v) for k, v in model.iteritems()}  # update buffers that add up gradients over a batch
rmsprop_cache = {k: np.zeros_like(v) for k, v in model.iteritems()}  # rmsprop memory
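
# W1 has shape (H, D) = (200, 6400): it maps the flattened difference frame to the hidden units.
# W2 has shape (H,) = (200,): it maps the hidden units to a single logit for moving the paddle up.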

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))  # sigmoid "squashing" function to interval [0,1]

def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195]  # crop
    I = I[::2, ::2, 0]  # downsample by factor of 2
    I[I == 144] = 0  # erase background (background type 1)
    I[I == 109] = 0  # erase background (background type 2)
    I[I != 0] = 1  # everything else (paddles, ball) just set to 1
    return I.astype(np.float).ravel()
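
# the crop keeps rows 35..194 (the playing field), and [::2, ::2, 0] takes every second
# row and column of a single colour channel, giving the 80x80 = 6400-element input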

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        if r[t] != 0: running_add = 0  # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
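
# worked example with gamma = 0.99: the rewards [0, 0, 1] for one rally become
# [0.9801, 0.99, 1.0]; the reset at non-zero rewards keeps each Pong rally's
# credit assignment independent of the rallies that follow it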

def policy_forward(x):
    h = np.dot(model['W1'], x)
    h[h < 0] = 0  # ReLU nonlinearity
    logp = np.dot(model['W2'], h)
    p = sigmoid(logp)
    return p, h  # return probability of taking action 2, and hidden state
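
# the policy is a 2-layer fully connected net:
#   x (6400,) --W1/ReLU--> hidden (200,) --W2--> scalar logit --sigmoid--> P(action 2, i.e. move up)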

def policy_backward(epx, eph, epdlogp):
    """ backward pass. (epx is the array of stacked observations, eph of intermediate hidden states) """
    dW2 = np.dot(eph.T, epdlogp).ravel()
    dh = np.outer(epdlogp, model['W2'])
    dh[eph <= 0] = 0  # backprop through the ReLU
    dW1 = np.dot(dh.T, epx)
    return {'W1': dW1, 'W2': dW2}
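
# chain rule summed over the whole episode (one row per time step):
#   dW2[j]   = sum_t eph[t, j] * epdlogp[t]
#   dh[t, j] = epdlogp[t] * W2[j], zeroed wherever the ReLU was inactive
#   dW1      = dh.T dot epx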

env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None  # used in computing the difference frame
xs, hs, dlogps, drs = [], [], [], []
running_reward = None
reward_sum = 0
episode_number = 0

while True:
    if render: env.render()

    # preprocess the observation, set input to network to be difference image
    # crop to the region of interest
    cur_x = prepro(observation)
    # difference image: captures the motion between the current frame and the previous one
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    # the policy network is just a simple feed-forward neural network
    aprob, h = policy_forward(x)
    # sample to decide whether to move the paddle up or down
    action = 2 if np.random.uniform() < aprob else 3  # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x)  # observation
    hs.append(h)  # hidden state
    y = 1 if action == 2 else 0  # a "fake label"
    dlogps.append(y - aprob)  # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)
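    # for a Bernoulli policy with p = sigmoid(logit), d log P(y | x) / d logit = y - p,
    # so (y - aprob) is the gradient of the log-probability of the sampled action
    # with respect to the network's output logit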

    # step the environment and get new measurements
    # gym runs the environment: it returns the next observation, the reward, and whether the episode is done
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    drs.append(reward)  # record reward (has to be done after we call step() to get reward for previous action)
    # this builds the reward sequence: each rally produces exactly one non-zero reward,
    # and an episode usually spans around 20 rallies before done becomes True

    if done:  # an episode finished
        episode_number += 1

        # stack together all inputs, hidden states, action gradients, and rewards for this episode
        epx = np.vstack(xs)
        eph = np.vstack(hs)
        epdlogp = np.vstack(dlogps)
        # [[0.5]
        # [0.4600174]
        # [-0.58261271]
        # ...,
        # [-0.41131332]
        # [-0.61304171]
        # [0.40808607]]
        epr = np.vstack(drs)
        # [[0.]
        # [0.]
        # [0.]
        # ...,  (note there are also non-zero values in between)
        # [0.]
        # [0.]
        # [-1.]]
        xs, hs, dlogps, drs = [], [], [], []  # reset array memory

        # compute the gamma-discounted cumulative rewards backwards through time, then standardize them
        discounted_epr = discount_rewards(epr)
        # [[-0.40473197]
        # [-0.40882017]
        # [-0.41294967]
        # ...,
        # [-0.9801]
        # [-0.99]
        # [-1.]]
        # standardize the rewards to be unit normal (helps control the gradient estimator variance)
        discounted_epr -= np.mean(discounted_epr)
        discounted_epr /= np.std(discounted_epr)
        # [[0.08438717]
        # [0.07672671]
        # [0.06898887]
        # ...,
        # [-0.99373576]
        # [-1.01228635]
        # [-1.03102432]]
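        # subtracting the mean acts as a simple baseline and dividing by the standard
        # deviation rescales the advantages; both help reduce the variance of the gradient estimate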
        # use the computed rewards as advantages to weight the gradient that gets backpropagated (the policy gradient step)
        epdlogp *= discounted_epr  # modulate the gradient with advantage (PG magic happens right here.)
        # [[0.04219359]
        # [0.03529562]
        # [-0.04019379]
        # ...,
        # [0.40873675]
        # [0.62057375]
        # [-0.42074666]]
        # backward pass to compute the gradients
        grad = policy_backward(epx, eph, epdlogp)
        for k in model: grad_buffer[k] += grad[k]  # accumulate grad over batch

        # perform rmsprop parameter update every batch_size episodes
        # RMSProp optimization
        if episode_number % batch_size == 0:
            for k, v in model.iteritems():
                g = grad_buffer[k]  # gradient
                rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g ** 2
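                # note the plus sign: this is gradient ascent on the expected reward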
                model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
                grad_buffer[k] = np.zeros_like(v)  # reset batch gradient buffer

        # boring book-keeping
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)
        if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
        reward_sum = 0
        observation = env.reset()  # reset env
        prev_x = None

    # the number of frames in a rally varies: the reward is +1 when we win the rally,
    # -1 when we lose it, and 0 for every other frame. each rally therefore yields many
    # (state, action) pairs, of which only the last has a non-zero reward; the
    # gamma-discounted cumulative reward assigns credit to the earlier frames
    if reward != 0:  # Pong has either +1 or -1 reward exactly when game ends.
        print ('ep %d: game finished, reward: %f' % (episode_number, reward)) + ('' if reward == -1 else ' !!!!!!!!')