Run with defaults:

    python vpg.py
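Or override any of the flags defined in the argparse block at the bottom of the
script, for example:

    python vpg.py --env CartPole-v0 --n_iter 200 --learning_rate 0.005 --seed 1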
| """ | |
| Policy Gradients | |
| 1. Sample paths. | |
| 2. Process paths (compute advantage, baseline, rewards, etc) | |
| 3. Run the paths through the policy (function approximator) | |
| 4. Compute gradients/update policy model weights | |
| 5. Profit?!?! | |
| How we optimize the policy | |
| -------------------------- | |
| L(theta) = sum t=0 to T-1 log policy(action_t | state_t, theta) * A_t | |
| R_t = (sum u=t to T reward_u) | |
| B_t = E [ sum u=t to T lambda^(u-t) * reward_u | state_t] | |
| A_t = R_t - B_t | |
| R_t = reward | |
| A_t = advantage | |
| B_t = baseline | |
| theta = parameters of our policy, most like neural network weights. | |
| The baseline can be thought of as the value function (V). When we evaluate the baseline | |
| of a state we're predict how good our future returns will be given our current state. | |
| So, intuitively if A_t > 0 that means the path we sampled is better than the expectation of | |
| paths from the current state. Likewise, if A_t < 0, it's worse. Concretely, if A_t > 0 we want | |
| more paths like that, if A_t < 0 we want less paths like that. Theta will be updated during training | |
| to reflect this. | |
| Types of parameterized policies | |
| ------------------------------- | |
| Map s (state) to an output vector u | |
| 1. If the action is from a discrete set, the network maps s to a vector of probabilities (softmax) | |
| 2. If the action is continuous, then we map s to the mean/variance of a Gaussian distribution | |
| (diagonal covariance that does not depend on s) | |
| 3. If a is binary valued, we use a single output, the probability of outputting 1 (although | |
| we could also just use 1.) | |
| TODO: implement baseline | |
| TODO: implement generalized advantage estimation | |
| """ | |
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

from six.moves import range
from gym.spaces import Box, Discrete
from scipy.signal import lfilter
import gym
import tensorflow as tf
import numpy as np
import argparse
def flatten_space(space):
    if isinstance(space, Box):
        return np.prod(space.shape)
    elif isinstance(space, Discrete):
        return space.n
    else:
        raise ValueError("Env must be either Box or Discrete.")
def discount_cumsum(x, gamma):
    # y[t] = x[t] + gamma * y[t+1], i.e. the discounted sum of x from t onwards;
    # lfilter evaluates that recurrence on the time-reversed signal.
    return lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
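# A pure-Python reference for discount_cumsum, kept only as a worked illustration
# of the R_t = reward_t + gamma * R_{t+1} recurrence from the docstring. It is not
# called anywhere in this script, and the name is just for this example.
def discount_cumsum_reference(rewards, gamma):
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out
# e.g. both versions map rewards [1, 1, 1] with gamma = 0.5 to returns
# [1.75, 1.5, 1.0]; with the zero baseline used below (B_t = 0), the advantages
# A_t = R_t - B_t are exactly these returns.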
class CategoricalPolicy(object):

    def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
        # Placeholder inputs
        self._observations = tf.placeholder(tf.float32, shape=[None, in_dim], name="observations")
        self._actions = tf.placeholder(tf.int32, name="actions")
        self._advantages = tf.placeholder(tf.float32, name="advantages")

        self._opt = optimizer
        self._sess = session

        h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
        probs = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=tf.nn.softmax)

        # I believe this is faster if on the CPU
        with tf.device("/cpu:0"):
            # NOTE: Doesn't currently work because the gather_nd gradient isn't implemented yet
            # inds = tf.transpose(tf.pack([tf.range(tf.shape(probs)[0]), self._actions]))
            # log_lik = tf.log(tf.gather_nd(probs, inds))

            # Work around it by flattening probs and gathering with flat indices.
            idxs_flattened = tf.range(0, tf.shape(probs)[0]) * tf.shape(probs)[1] + self._actions
            probs_vec = tf.gather(tf.reshape(probs, [-1]), idxs_flattened)
            log_lik = tf.log(probs_vec + 1e-8)

        act_op = probs[0, :]
        surr_loss = -tf.reduce_mean(log_lik * self._advantages, name="loss_op")
        grads_and_vars = self._opt.compute_gradients(surr_loss)
        train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

        self._act_op = act_op
        self._loss_op = surr_loss
        self._train_op = train_op

    def act(self, observation):
        # expects observation to have shape (1, observation_space)
        a = self._sess.run(self._act_op, feed_dict={self._observations: observation})
        # sample an action index by inverting the CDF: a single uniform draw is
        # compared against the cumulative probabilities
        cs = np.cumsum(a)
        idx = int(np.sum(cs < np.random.rand()))
        return idx

    def train(self, observations, actions, advantages):
        loss, _ = self._sess.run(
            [self._loss_op, self._train_op],
            feed_dict={
                self._observations: observations,
                self._actions: actions,
                self._advantages: advantages,
            })
        return loss
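# The policy above implements option 1 from the docstring (discrete actions,
# softmax). Below is a minimal sketch of option 2 (continuous actions): the
# network maps the state to the mean of a Gaussian, and a state-independent
# log standard deviation gives the diagonal covariance. It mirrors the TF style
# used in CategoricalPolicy but is not wired into this script; the names
# (gaussian_policy_head, gaussian_log_lik) are illustrative only.
def gaussian_policy_head(observations, act_dim, hidden_dim):
    h1 = tf.contrib.layers.fully_connected(observations, hidden_dim, activation_fn=tf.tanh)
    mean = tf.contrib.layers.fully_connected(h1, act_dim, activation_fn=None)
    log_std = tf.Variable(np.zeros(act_dim, dtype=np.float32), name="log_std")
    # sample actions as mean + std * eps with eps ~ N(0, I)
    sample = mean + tf.exp(log_std) * tf.random_normal(tf.shape(mean))
    return mean, log_std, sample


def gaussian_log_lik(actions, mean, log_std):
    # log N(action | mean, diag(exp(log_std)^2)), summed over action dimensions;
    # this would play the role of log_lik in the surrogate loss above
    z = (actions - mean) / tf.exp(log_std)
    return (-0.5 * tf.reduce_sum(tf.square(z), reduction_indices=[1])
            - tf.reduce_sum(log_std)
            - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(actions)[1]))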
class PolicyOptimizer(object):

    def __init__(self, env, policy, baseline, n_iter, n_episode, path_length,
                 gamma=.99):
        self.policy = policy
        self.baseline = baseline
        self.env = env
        self.n_iter = n_iter
        self.n_episode = n_episode
        self.path_length = path_length
        self.gamma = gamma

    def sample_path(self):
        obs = []
        actions = []
        rewards = []
        ob = self.env.reset()

        for _ in range(self.path_length):
            a = self.policy.act(ob.reshape(1, -1))
            next_ob, r, done, _ = self.env.step(a)
            obs.append(ob)
            actions.append(a)
            rewards.append(r)
            ob = next_ob
            if done:
                break

        return dict(
            observations=np.array(obs),
            actions=np.array(actions),
            rewards=np.array(rewards),
        )

    def process_paths(self, paths):
        for p in paths:
            # TODO: compute baseline
            # b = self.baseline.predict(p)
            b = 0
            r = discount_cumsum(p["rewards"], self.gamma)
            a = r - b

            p["returns"] = r
            # p["advantages"] = (a - a.mean()) / (a.std() + 1e-8)  # normalize
            p["advantages"] = a
            p["baselines"] = b

        obs = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        rewards = np.concatenate([p["rewards"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        # TODO: fit baseline
        # self.baseline.fit(paths)

        return dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
        )

    def train(self):
        for i in range(1, self.n_iter + 1):
            paths = []
            for _ in range(self.n_episode):
                paths.append(self.sample_path())
            data = self.process_paths(paths)
            loss = self.policy.train(data["observations"], data["actions"], data["advantages"])
            avg_return = np.mean([sum(p["rewards"]) for p in paths])
            print("Iteration {}: Loss = {}, Average Return = {}".format(i, loss, avg_return))
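# Illustrative sketches for the two TODOs in the docstring. Neither is used by the
# script below; the class/function names and the feature choices are assumptions,
# roughly in the spirit of a linear-feature baseline and of generalized advantage
# estimation (Schulman et al.), not a definitive implementation.
class LinearFeatureBaseline(object):
    """Least-squares fit of discounted returns on simple state/time features."""

    def __init__(self, reg=1e-5):
        self._coef = None
        self._reg = reg

    def _features(self, path):
        obs = np.clip(path["observations"], -10, 10)
        t = np.arange(len(obs)).reshape(-1, 1) / 100.0
        return np.concatenate([obs, obs ** 2, t, t ** 2, t ** 3, np.ones_like(t)], axis=1)

    def fit(self, paths):
        # regularized least squares of features against the discounted returns
        feats = np.concatenate([self._features(p) for p in paths])
        returns = np.concatenate([p["returns"] for p in paths])
        reg = self._reg * np.eye(feats.shape[1])
        self._coef = np.linalg.solve(feats.T.dot(feats) + reg, feats.T.dot(returns))

    def predict(self, path):
        if self._coef is None:
            return np.zeros(len(path["rewards"]))
        return self._features(path).dot(self._coef)


def gae_advantages(rewards, baselines, gamma, lam):
    # Generalized advantage estimation: A_t = sum_l (gamma*lam)^l * delta_{t+l},
    # where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t). Assumes the episode has
    # terminated, so the value of the state after the last step is taken to be 0.
    values = np.append(baselines, 0.0)
    deltas = rewards + gamma * values[1:] - values[:-1]
    return discount_cumsum(deltas, gamma * lam)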
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_iter', default=100, type=int, help='number of iterations')
    parser.add_argument('--n_episode', default=100, type=int, help='number of episodes/iteration')
    parser.add_argument('--path_length', default=200, type=int, help='number of steps')
    parser.add_argument('--learning_rate', default=0.01, type=float, help='learning rate for Adam Optimizer')
    parser.add_argument('--env', default='CartPole-v0', help='gym environment for training')
    parser.add_argument('--algorithm', default='VPG', help='algorithm identifier')
    parser.add_argument('--outdir', default='vpg', type=str, help='output directory where results are saved (/tmp/{outdir}-{env})')
    parser.add_argument('--upload', action='store_true', help='upload results via OpenAI Gym API')
    parser.add_argument('--seed', default=0, type=int, help='random seed')
    args = parser.parse_args()

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    env = gym.make(args.env)
    outdir = '/tmp/' + args.outdir + '-' + args.env
    env.monitor.start(outdir, force=True)
    print("******* WILL SAVE RESULTS TO", outdir, " *******")

    sess = tf.Session()

    in_dim = flatten_space(env.observation_space)
    out_dim = flatten_space(env.action_space)
    hidden_dim = 8

    opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)
    po = PolicyOptimizer(env, policy, 0, args.n_iter, args.n_episode, args.path_length)

    sess.run(tf.initialize_all_variables())

    # train the policy optimizer
    po.train()

    env.monitor.close()

    # make sure to set up your OPENAI_GYM_API_KEY environment variable
    if args.upload:
        gym.upload(outdir, algorithm_id=args.algorithm)