Policy Gradient Algorithm for Cartpole Environment
# Monte Carlo policy gradient (REINFORCE) algorithm
# a neural network parameterizes the policy
# observations and rewards collected from rollouts are used to update the network parameters and improve the policy
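# For reference (a sketch of the objective that the loss defined below implements):
# the parameters theta are updated along the Monte Carlo policy gradient estimate
#   grad J(theta) ~= (1/N) * sum_t grad log pi_theta(a_t | s_t) * A_t,
# where A_t is the discounted return at timestep t minus a time-dependent baseline;
# equivalently, the script minimizes L(theta) = -(1/N) * sum_t log pi_theta(a_t | s_t) * A_t.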
import numpy as np
import tensorflow as tf
import gym
from gym import wrappers
# initialize constants
batch_size = 5 # number of episodes after which a parameter update is done
train_episodes = 50000 # total number of training episodes
hidden_units = 50 # number of hidden units in the neural network
discount_factor = 0.99 # reward discounting factor
epsilon = 1e-9 # epsilon of the RMSProp update
learning_rate = 0.05 # step size of the RMSProp update
decay_rate = 0.9 # decay rate of RMSProp
# build environment
env = gym.make("CartPole-v0")
env = wrappers.Monitor(env, '/tmp/cartpole-policy-gradient', force=True) # record episode statistics to this directory
n_obv = env.observation_space.shape[0] # size of the observation vector (4 for CartPole-v0)
n_acts = env.action_space.n # number of possible actions (2 for CartPole-v0)
tf.logging.set_verbosity(tf.logging.ERROR) # silence non-error TF log output
tf.set_random_seed(7) # fix the graph-level random seed for reproducibility
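# (CartPole-v0 gives a reward of +1 per timestep and caps episodes at 200 steps;
#  the task is conventionally considered solved at an average reward of 195
#  over 100 consecutive episodes, which helps interpret the printed rewards below.)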
# create model
obv = tf.placeholder(tf.float64) # batch of observations
acts = tf.placeholder(tf.int32) # actions taken at each timestep
adv = tf.placeholder(tf.float64) # advantages (discounted return minus baseline)
W0 = tf.Variable(tf.random_uniform([n_obv, hidden_units], dtype=tf.float64))
b0 = tf.Variable(tf.zeros([hidden_units], dtype=tf.float64))
W1 = tf.Variable(tf.random_uniform([hidden_units, n_acts], dtype=tf.float64))
b1 = tf.Variable(tf.zeros(n_acts, dtype=tf.float64))
params = [W0, b0, W1, b1]
N = tf.shape(obv)[0] # number of timesteps in the batch
# one hidden tanh layer followed by a softmax over actions
y = tf.nn.softmax(tf.matmul(tf.tanh(tf.matmul(obv, W0) + b0[None, :]), W1) + b1[None, :])
# pick out the probability of the action actually taken at each timestep
idx_flattened = tf.range(0, tf.shape(y)[0]) * tf.shape(y)[1] + acts
yy = tf.gather(tf.reshape(y, [-1]), idx_flattened)
# surrogate loss: negative advantage-weighted log-probability, averaged over the batch
loss = -tf.reduce_sum(tf.multiply(tf.log(yy), adv)) / tf.cast(N, tf.float64)
train_step = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=decay_rate, epsilon=epsilon).minimize(loss, var_list=params)
# choose an action for a single observation: run the policy network and act greedily
def act(obvs, sess):
    obvs = obvs.reshape(1, -1)
    probs = sess.run(y, feed_dict={obv: obvs})
    probs = np.asarray(probs)
    return probs.argmax()
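# A stochastic variant of act() (an illustrative addition, not used anywhere in this
# script): REINFORCE is normally run with actions sampled from the softmax output,
# which preserves some exploration, rather than always taking the argmax.
def act_stochastic(obvs, sess):
    probs = sess.run(y, feed_dict={obv: obvs.reshape(1, -1)})
    return np.random.choice(n_acts, p=np.asarray(probs).ravel())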
# computes the discounted return R[i] = r[i] + discount_factor * r[i + 1] + discount_factor^2 * r[i + 2] + ...
def get_discounted_return(r, discount_factor):
    ret = np.zeros(len(r), 'float64')
    ret[-1] = r[-1]
    for i in reversed(range(len(r) - 1)):
        ret[i] = r[i] + discount_factor * ret[i + 1]
    return ret
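# example: get_discounted_return(np.array([1.0, 1.0, 1.0]), 0.99) gives
# [2.9701, 1.99, 1.0], since 1 + 0.99 * (1 + 0.99 * 1) = 2.9701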
observation = env.reset()
trajectories = [] # list of per-episode dictionaries of observations, actions and rewards
observations = [] # observations of the current episode
rewards = [] # rewards of the current episode
actions = [] # actions of the current episode
episode = 0 # episode counter
episode_time = 0 # number of timesteps in the current episode
net_reward = 0 # total (undiscounted) reward of the current episode
iteration = 0 # number of times the parameters have been updated
# start tf session
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
while True:
    # record the observation the action is chosen from, take the action and store the outcome
    observations.append(observation)
    action = act(observation, sess)
    (observation, reward, done, _) = env.step(action)
    actions.append(action)
    rewards.append(reward)
    net_reward += reward
    episode_time += 1
    if done:
        # store the trajectory of the finished episode
        episode += 1
        trajectory = {"rew": np.array(rewards), "obv": np.array(observations), "acts": np.array(actions)}
        trajectories.append(trajectory)
        if episode % batch_size == 0:
            iteration += 1
            # gather all (observations, discounted returns, actions) of the batch
            batch_obv = np.concatenate([trajectory["obv"] for trajectory in trajectories])
            batch_rets = [get_discounted_return(trajectory["rew"], discount_factor) for trajectory in trajectories]
            batch_acts = np.concatenate([trajectory["acts"] for trajectory in trajectories])
            max_episode_length = max(len(reward) for reward in batch_rets)
            batch_rew = [np.concatenate([reward, np.zeros(max_episode_length - len(reward))]) for reward in batch_rets]
            # compute advantages with a time-dependent baseline: the baseline at timestep t
            # is the mean return at t across the batch, with shorter episodes zero-padded
            baselines = np.mean(batch_rew, axis=0)
            batch_adv = np.concatenate([reward - baselines[:len(reward)] for reward in batch_rets])
            # compute the loss and take one policy gradient step
            sess.run(train_step, feed_dict={obv: batch_obv, acts: batch_acts, adv: batch_adv})
            trajectories = []
        print("Episode %d finished after %d timesteps, reward = %d" % (episode, episode_time, net_reward))
        observation = env.reset()
        observations = []
        actions = []
        rewards = []
        episode_time = 0
        net_reward = 0
        if episode > train_episodes:
            break
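# Cleanup (an addition, not part of the original gist): closing the monitored env
# flushes the recorded statistics under /tmp/cartpole-policy-gradient, and closing
# the session releases TensorFlow resources.
env.close()
sess.close()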