Solution to the Cartpole problem using Policy Gradients in TensorFlow
'''Solution to the Cartpole problem using Policy Gradients in TensorFlow.'''
# written October 2016 by Sam Greydanus
# inspired by gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
import numpy as np
import gym
import tensorflow as tf

# hyperparameters
n_obs = 4            # dimensionality of observations
h = 128              # hidden layer neurons
n_actions = 2        # number of available actions
learning_rate = 1e-2 # how rapidly to update parameters
gamma = .9           # reward discount factor
decay = 0.9          # decay rate for RMSProp gradients

# gamespace
env = gym.make("CartPole-v0")
observation = env.reset()
xs, rs, ys = [], [], []  # environment info
running_reward = 10      # worst case is ~10 for cartpole
reward_sum = 0
episode_number = 0
max_steps = 1000         # should converge around 300

# initialize model
tf_model = {}
with tf.variable_scope('layer_one', reuse=False):
    xavier_l1 = tf.truncated_normal_initializer(mean=0, stddev=1./np.sqrt(n_obs), dtype=tf.float32)
    tf_model['W1'] = tf.get_variable("W1", [n_obs, h], initializer=xavier_l1)
with tf.variable_scope('layer_two', reuse=False):
    xavier_l2 = tf.truncated_normal_initializer(mean=0, stddev=1./np.sqrt(h), dtype=tf.float32)
    tf_model['W2'] = tf.get_variable("W2", [h, n_actions], initializer=xavier_l2)
# tensorflow operations
def tf_discount_rewards(tf_r):  # tf_r ~ [game_steps,1]
    discount_f = lambda a, v: a*gamma + v
    tf_r_reverse = tf.scan(discount_f, tf.reverse(tf_r, [True, False]))
    tf_discounted_r = tf.reverse(tf_r_reverse, [True, False])
    return tf_discounted_r
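# Note (added illustration, not part of the original gist): tf.scan applies discount_f
# cumulatively over the time-reversed rewards, so after the second tf.reverse each entry
# holds the discounted return from that step onward, i.e. r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
# For example, rewards [1, 1, 1] with gamma = 0.9 become [2.71, 1.9, 1.0].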
def tf_policy_forward(x):  # x ~ [1,D]
    h = tf.matmul(x, tf_model['W1'])
    h = tf.nn.relu(h)
    logp = tf.matmul(h, tf_model['W2'])
    p = tf.nn.softmax(logp)
    return p

# tf placeholders
tf_x = tf.placeholder(dtype=tf.float32, shape=[None, n_obs], name="tf_x")
tf_y = tf.placeholder(dtype=tf.float32, shape=[None, n_actions], name="tf_y")
tf_epr = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="tf_epr")

# tf reward processing (need tf_discounted_epr for policy gradient wizardry)
tf_discounted_epr = tf_discount_rewards(tf_epr)
tf_mean, tf_variance = tf.nn.moments(tf_discounted_epr, [0], shift=None, name="reward_moments")
tf_discounted_epr -= tf_mean
tf_discounted_epr /= tf.sqrt(tf_variance + 1e-6)

# tf optimizer op
tf_aprob = tf_policy_forward(tf_x)
loss = tf.nn.l2_loss(tf_y - tf_aprob)  # this gradient encourages the actions taken
optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay)
tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=tf_discounted_epr)
train_op = optimizer.apply_gradients(tf_grads)

# tf graph initialization
sess = tf.InteractiveSession()
tf.initialize_all_variables().run()
# training loop
# stop when running reward exceeds 200 (task is considered solved)
while episode_number <= max_steps and running_reward < 200:
    # if episode_number % 50 == 0: env.render()

    # stochastically sample a policy from the network
    x = observation
    feed = {tf_x: np.reshape(x, (1, -1))}
    aprob = sess.run(tf_aprob, feed)
    aprob = aprob[0, :]  # we live in a batched world :/
    action = np.random.choice(n_actions, p=aprob)
    label = np.zeros_like(aprob); label[action] = 1  # make a training 'label'

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    # record game history
    xs.append(x)
    ys.append(label)
    rs.append(reward)

    if done:
        running_reward = running_reward * 0.99 + reward_sum * 0.01
        epx = np.vstack(xs)
        epr = np.vstack(rs)
        epy = np.vstack(ys)
        xs, rs, ys = [], [], []  # reset game history

        feed = {tf_x: epx, tf_epr: epr, tf_y: epy}
        _ = sess.run(train_op, feed)  # parameter update

        # print some updates
        if episode_number % 25 == 0:
            print('ep: {}, reward: {}, mean reward: {:.3f}'.format(
                episode_number, reward_sum, running_reward))

        # book-keeping
        episode_number += 1
        observation = env.reset()  # reset env
        reward_sum = 0

if running_reward > 200:
    print("ep: {}: SOLVED! (running reward hit {} which is greater than 200)".format(
        episode_number, running_reward))
else:
    print("ep: {}: model did not converge. Try changing the hyperparameters.".format(episode_number))
I did not get what is happening in line 82:
label = np.zeros_like(aprob) ; label[action] = 1 # make a training 'label'
Can you please add some explanation? That would help. I have gone through the PG derivation but cannot relate it to this line.
Thanks
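For anyone else puzzling over that line, here is a minimal standalone NumPy sketch of what it builds (my own illustration with made-up numbers, not part of the gist): a one-hot "fake label" that pretends the sampled action was the correct one.

import numpy as np

n_actions = 2
aprob = np.array([0.7, 0.3])                   # policy's action probabilities for one step
action = np.random.choice(n_actions, p=aprob)  # sampled action, e.g. 1

label = np.zeros_like(aprob)                   # this is the line in question
label[action] = 1                              # one-hot "fake label" for the sampled action

# With loss = 0.5 * ||label - aprob||^2, d(loss)/d(aprob) = aprob - label:
# it is negative at the sampled action and positive elsewhere, so a descent step
# raises the probability of the action that was actually taken.
grad_wrt_aprob = aprob - label
print(grad_wrt_aprob)

This plays the same role as the (softmax minus one-hot) term in the usual cross-entropy/REINFORCE gradient, though here it comes from a squared-error loss on the probabilities; the discounted return supplied through grad_loss then decides how strongly, and in which direction, each step's actions are reinforced.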
Thanks for the code. Very interesting.
I have a question regarding line 63:
tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=tf_discounted_epr)
What is the purpose of the parameter grad_loss=tf_discounted_epr?
The TF documentation says: "A Tensor holding the gradient computed for loss". May I ask what the difference is between the gradient for the loss and the gradient for the variables?
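My understanding (an interpretation on my part, not something stated in the gist): grad_loss is passed to backprop as the upstream gradient, i.e. d(final objective)/d(loss), whereas compute_gradients returns d(loss)/d(variables) scaled by that upstream seed. Because the L2 loss's backward pass multiplies the upstream gradient by (tf_aprob - tf_y) row-wise, feeding in tf_discounted_epr (shape [steps, 1]) broadcasts so that each timestep's error is weighted by its normalized discounted return, which is the policy-gradient weighting. A roughly equivalent way to write it, reusing the tensors defined in the gist above (a sketch under that broadcasting assumption, not a verified drop-in):

# weight each timestep's squared error by its normalized discounted return,
# then let the optimizer backpropagate from this scalar with the default seed of 1.0
weighted_loss = tf.reduce_sum(tf_discounted_epr * 0.5 * tf.square(tf_y - tf_aprob))
tf_grads = optimizer.compute_gradients(weighted_loss, var_list=tf.trainable_variables())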
For some reason I am getting "model did not converge. Try changing the hyperparameters" with the default script, on Python 3.5.2 on Windows. Could it be a difference in the MKL?