reinforce_with_baseline.py
import gym
import tensorflow as tf
import numpy as np
import itertools
import tensorflow.contrib.layers as layers
from tqdm import trange
from gym.spaces import Discrete, Box
def get_traj(agent, env, max_episode_steps, render, deterministic_acts=False):
    '''
    Runs the agent-environment loop for one whole episode (trajectory).
    Returns a dictionary of results.
    '''
    steps = itertools.count() if max_episode_steps is None else xrange(max_episode_steps)
    obs = env.reset()
    actions = []
    rewards = []
    observations = []
    for step in steps:
        observations.append(obs)  # obs_i
        act = agent.act(obs, deterministic=deterministic_acts)
        obs, r, done, _ = env.step(act)
        actions.append(act)  # act_i
        rewards.append(r)  # r_i: reward received after taking act_i in obs_i
        if done: break
        if render: env.render()
    return {'rewards': np.array(rewards),
            'actions': np.array(actions),
            'observations': np.array(observations)}
def total_discounted_returns(rewards, gamma):
    '''
    Given episode rewards, computes a vector y such that
    y_i = r_i + gamma*r_{i+1} + gamma^2 * r_{i+2} + ...
    '''
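    # For example, rewards = [1., 1., 1.] with gamma = 0.9 yields
    # [1 + 0.9*1.9, 1 + 0.9*1.0, 1.0] = [2.71, 1.9, 1.0].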
    n = len(rewards)
    result = np.zeros_like(rewards)
    next_rew = 0.
    for i in reversed(xrange(n)):
        result[i] = rewards[i] + gamma*next_rew
        next_rew = result[i]
    return result
def baseline_total_return(returns_per_traj):
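    '''
    Computes a time-dependent baseline b_t: the mean return at timestep t
    across all trajectories in the batch. Trajectories of unequal length are
    padded with NaN and masked so that shorter episodes do not bias the mean.
    '''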
    N = len(returns_per_traj)
    maxlen = max(len(ret) for ret in returns_per_traj)
    masked = np.full((N, maxlen), float('nan'))
    for i in xrange(N):
        masked[i, 0:len(returns_per_traj[i])] = returns_per_traj[i]
    masked = np.ma.array(masked, mask=np.isnan(masked))
    return masked.mean(axis=0)
class REINFORCEAgent(object):
    '''
    REINFORCE with baselines.
    Based on John Schulman's lectures (https://youtu.be/aUrX-rP_ss4).
    '''
    def __init__(self, obs_space, act_space, **user_params):
        assert isinstance(act_space, Discrete), \
            'Agent works only with discrete action spaces'
        self.input_shape = (None, ) + obs_space.shape
        self.is_continious = isinstance(act_space, Box)
        self.num_actions = act_space.n
        self.curr_sess = None
        self.config = dict(
            max_episode_steps=500,  # if None, run each episode until a terminal state
            steps_per_batch=10000,
            n_iter=100,
            gamma=1.0,
            optim_config={'lr': 0.05, 'rho': 0.9, 'eps': 1e-9},
            num_hidden=20,
            dtype=tf.float32,
            scope_name='REINFORCE'
        )
        self._num_layers = 0
        self._check_config_args(user_params)
        self.config.update(user_params)
        dtype = self.config['dtype']
        with tf.variable_scope(self.config['scope_name']):
            # advs stands for advantage values: total discounted returns minus baseline
            self.advs_pl = tf.placeholder(shape=[None, ], dtype=dtype, name='rets')
            self.acts_pl = tf.placeholder(shape=[None, ], dtype=tf.int32, name='acts')
            self.obs_pl = tf.placeholder(shape=self.input_shape, dtype=dtype, name='obs')
            flatten_obs = layers.flatten(self.obs_pl)
            fc1 = self._add_fc(flatten_obs, self.config['num_hidden'], activation=tf.nn.relu)
            fc2 = self._add_fc(fc1, self.num_actions, activation=None)
            self.action_probs = tf.nn.softmax(fc2)
            one_hot_acts = layers.one_hot_encoding(self.acts_pl, self.num_actions)
            selected_probs = tf.reduce_sum(self.action_probs * one_hot_acts, reduction_indices=1)
            # tf can't compute gradients of gather_nd :(
            # selected_probs = tf.gather_nd(self.action_probs, indices=self.id2acts_pl)
            # Conventional SGD-style optimizers minimize a loss, so we negate the
            # objective: minimizing the negated objective moves the parameters in
            # the direction of gradient ascent on the expected return.
            neg_logprob = -tf.log(selected_probs)
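            # Surrogate loss (score-function / REINFORCE estimator):
            #   loss = -(1/N) * sum_i log pi(a_i | s_i) * A_i
            # so its gradient is the negated policy-gradient estimate.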
            N = tf.shape(self.advs_pl)[0]
            self.loss = tf.reduce_sum(tf.mul(neg_logprob, self.advs_pl)) / tf.to_float(N)
            # create optimizer:
            opt_cfg = self.config['optim_config']
            self.optimizer = tf.train.RMSPropOptimizer(opt_cfg['lr'],
                                                       opt_cfg['rho'],
                                                       0.0, opt_cfg['eps'])
            self.opt_step = self.optimizer.minimize(self.loss)
    def _check_config_args(self, user_args):
        unrecognized = set(user_args.keys()) - set(self.config.keys())
        if unrecognized:
            raise ValueError('Unrecognized config params: {0}'.format(unrecognized))
    def _add_fc(self, inputs, num_units, activation=None):
        return layers.fully_connected(
            inputs, num_units, activation_fn=activation,
            weights_initializer=self._xavier_init(),
            biases_initializer=self._const_init())
    def _xavier_init(self, factor=2.0):
        return layers.initializers.variance_scaling_initializer(
            factor=factor, mode='FAN_IN', dtype=self.config['dtype']
        )
    def _const_init(self, value=0.0):
        return tf.constant_initializer(value, dtype=self.config['dtype'])
    def act(self, observation, deterministic=False):
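        '''
        Samples an action from the current policy for a single observation;
        with deterministic=True returns the most probable action instead.
        '''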
        observation = observation[np.newaxis, :]  # prepend a batch dimension
        probs = self.curr_sess.run(self.action_probs, feed_dict={self.obs_pl: observation})
        if not deterministic:
            return np.random.choice(self.num_actions, p=probs[0])
        else:
            return np.argmax(probs[0])
    def update(self, obs, acts, advs):
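        '''
        Performs a single policy-gradient step on a batch of observations,
        actions and the corresponding advantage estimates.
        '''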
        feeds = {
            self.obs_pl: obs,
            self.acts_pl: acts,
            self.advs_pl: advs
        }
        self.curr_sess.run([self.opt_step], feed_dict=feeds)
    def learn(self, sess, env, verbose=1):
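        '''
        Batch-mode REINFORCE: each iteration collects roughly steps_per_batch
        timesteps of experience, computes advantages (discounted returns minus
        a per-timestep baseline) and performs one parameter update.
        '''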
        config = self.config
        self.curr_sess = sess
        try:
            for it in xrange(config['n_iter']):
                # Collect trajectories until we get at least steps_per_batch total timesteps:
                trajs = []
                total_steps = 0
                while total_steps < config['steps_per_batch']:
                    traj = get_traj(self, env,
                                    config['max_episode_steps'], render=False)
                    trajs.append(traj)
                    total_steps += len(traj['actions'])
                # compute advantages for all steps in all trajectories:
                gamma = config['gamma']
                ret_per_traj = [total_discounted_returns(tr['rewards'], gamma) for tr in trajs]
                baseline = np.array(baseline_total_return(ret_per_traj))
                all_advs = [ret - baseline[:len(ret)] for ret in ret_per_traj]
                all_advs = np.concatenate(all_advs)  # all_advs.shape = (sum(len(advs_for_traj_i)),)
                all_obs = np.concatenate([traj['observations'] for traj in trajs])
                all_acts = np.concatenate([traj['actions'] for traj in trajs])
                # update agent parameters:
                self.update(all_obs, all_acts, all_advs)
                # print iteration stats:
                ep_data = [(tr['rewards'].sum(), len(tr['rewards'])) for tr in trajs]
                reward_per_ep, len_per_ep = zip(*ep_data)
                if verbose > 0:
                    report_iteration_stats(it, reward_per_ep, len_per_ep)
                if verbose > 1:
                    get_traj(self, env, config['max_episode_steps'], render=True)
        finally:
            if verbose > 1:
                env.render(close=True)
            self.curr_sess = None
    def test(self, sess, env, num_episodes, max_episode_steps,
             render=False, deterministic_acts=False):
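        '''
        Runs the (optionally deterministic) policy for num_episodes episodes
        and returns per-episode rewards and episode lengths.
        '''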
        self.curr_sess = sess
        try:
            num_steps = []
            rewards = []
            episodes = trange(0, num_episodes, desc='Episodes completed')
            for ep in episodes:
                traj = get_traj(self, env, max_episode_steps,
                                render=render, deterministic_acts=deterministic_acts)
                rewards.append(sum(traj['rewards']))
                num_steps.append(len(traj['rewards']))
        finally:
            if render:
                env.render(close=True)
            self.curr_sess = None
        return {'num_steps': np.array(num_steps), 'rewards': np.array(rewards)}
def report_iteration_stats(it, reward_per_ep, len_per_ep):
    print '==' * 30
    print 'Iteration #{0}'.format(it)
    print 'Num episodes:', len(reward_per_ep)
    print 'Total timesteps:', sum(len_per_ep)
    print 'Max episode R:', max(reward_per_ep)
    print 'Mean ep reward:', np.mean(reward_per_ep), 'Std:', np.std(reward_per_ep)
    print 'Mean ep steps:', np.mean(len_per_ep), 'Std:', np.std(len_per_ep)
    print '==' * 30
reproduce_gym_results.py
import gym, os
import numpy as np
import tensorflow as tf
import reinforce_with_baseline as tf_reinforce
from gym import wrappers
def ensure_dir(d):
    """
    Checks whether the directory d exists and
    creates it (including parents) if it does not.
    """
    if not os.path.exists(d):
        os.makedirs(d)
if __name__ == '__main__':
    env = gym.make('Acrobot-v1')
    user_params = {
        'max_episode_steps': 500,  # alternatively: env.spec.timestep_limit
        'optim_config': {'lr': 0.005, 'rho': 0.9, 'eps': 1e-9},
        'num_hidden': 40,
    }
    agent = tf_reinforce.REINFORCEAgent(env.observation_space, env.action_space, **user_params)
    with tf.Session() as sess:
        print '================ LEARNING ================='
        sess.run(tf.initialize_all_variables())
        agent.learn(sess, env, verbose=2)
        print '================ TESTING ================='
        results = agent.test(sess, env, num_episodes=100,
                             max_episode_steps=agent.config['max_episode_steps'],
                             render=True, deterministic_acts=True)
        print 'Mean episode reward:', np.mean(results['rewards'])
        print 'Mean episode length:', np.mean(results['num_steps'])
    env.close()
Just run reproduce_gym_results.py