@BarclayII
Last active May 25, 2017 05:48
Berkeley CS 294 Homework 1: behavioral cloning from an expert policy. Two variants of run_expert.py follow: the first refits the student before every rollout and overwrites part of the dataset with expert-relabeled on-policy states (a DAgger-style loop); the second trains once up front with early stopping on a held-out validation set.
#!/usr/bin/env python
"""
Code to load an expert policy and generate roll-out data for behavioral cloning.
Example usage:
    python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \
            --num_rollouts 20
Author of this script and included expert policies: Jonathan Ho ([email protected])
"""
import pickle
import tensorflow as tf
import numpy as np
import numpy.random as RNG
import tf_util
import gym
import load_policy
import tensorflow.contrib.keras as K
import h5py

def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('input', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--max_timesteps', type=int)
    parser.add_argument('--datacap', type=int, default=100000)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building student policy')
    # Load the pre-collected expert dataset (HDF5 with 'observations' and
    # 'actions'), keeping at most --datacap samples in memory.
    data = h5py.File(args.input, 'r')
    x = np.copy(data['observations'][:args.datacap])
    a = np.copy(data['actions'][:args.datacap])
    data.close()

    input_dim = x.shape[1]
    output_dim = a.shape[2]
    # Student policy: a 3-hidden-layer tanh MLP regressing expert actions
    # from (roughly rescaled) observations.
    model = K.models.Sequential()
    model.add(K.layers.Lambda(lambda x: x / 10, input_shape=(input_dim,)))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=output_dim))
    model.compile(loss='mean_squared_error', optimizer='rmsprop')

    def policy_fn(obs):
        return model.predict(obs, batch_size=obs.shape[0])
    policy_fn_expert = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            current_obs = []
            current_acts = []
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            # Refit the student on the (partially relabeled) dataset before
            # every rollout, DAgger-style.
            model.fit(x, a[:, 0], batch_size=200, epochs=5, shuffle='batch')
            while not done:
                action = policy_fn(obs[None, :])
                current_obs.append(obs)
                current_acts.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

            # Aggregation step: overwrite randomly chosen rows of the
            # fixed-size dataset with the states the student just visited,
            # relabeled with the expert's actions. Indices are drawn from
            # len(x) rather than args.datacap, since the file may hold
            # fewer than --datacap samples.
            current_obs = np.array(current_obs)
            idx = RNG.choice(len(x), len(current_obs))
            x[idx] = current_obs
            a[idx, 0] = policy_fn_expert(current_obs)
            observations.extend(current_obs)
            actions.extend(current_acts)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        # Assembled for inspection; not written to disk here.
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}


if __name__ == '__main__':
    main()
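
The distinctive piece of this first variant is the step after each rollout: instead of growing the dataset as in standard DAgger, it keeps a fixed-size buffer and overwrites randomly chosen rows with on-policy states relabeled by the expert. A standalone sketch of that update (function and argument names are hypothetical; sampling is with replacement, as in the script):

    import numpy as np

    def replace_random_rows(x, a, new_obs, new_acts, rng=np.random):
        # Pick len(new_obs) row indices (duplicates possible, since
        # np.random.choice samples with replacement by default) and
        # overwrite them in place, keeping the buffer size constant.
        idx = rng.choice(len(x), len(new_obs))
        x[idx] = new_obs
        a[idx, 0] = new_acts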
#!/usr/bin/env python
"""
Code to load an expert policy and generate roll-out data for behavioral cloning.
Example usage:
    python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \
            --num_rollouts 20
Author of this script and included expert policies: Jonathan Ho ([email protected])
"""
import pickle
import tensorflow as tf
import numpy as np
import tf_util
import gym
import load_policy
import tensorflow.contrib.keras as K
import h5py

def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('input', type=str)
    parser.add_argument('valid', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--max_timesteps', type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building student policy')
    # Training set (capped at 100000 samples) plus a held-out validation
    # set. Slicing an h5py dataset returns an in-memory numpy array, so
    # both files can be closed right away.
    data = h5py.File(args.input, 'r')
    valid = h5py.File(args.valid, 'r')
    x = data['observations'][:100000]
    a = data['actions'][:100000]
    xv = valid['observations'][:]
    av = valid['actions'][:]
    data.close()
    valid.close()

    input_dim = x.shape[1]
    output_dim = a.shape[2]

    # Same student architecture as above: a 3-hidden-layer tanh MLP.
    model = K.models.Sequential()
    model.add(K.layers.Lambda(lambda x: x / 10, input_shape=(input_dim,)))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=output_dim))
    model.compile(loss='mean_squared_error', optimizer='rmsprop')

    def policy_fn(obs):
        return model.predict(obs, batch_size=obs.shape[0])

    print('loaded and built')

    with tf.Session():
        tf_util.initialize()
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        # Plain behavioral cloning: fit once up front, with early stopping
        # on the validation set.
        model.fit(x, a[:, 0], batch_size=200, epochs=50, shuffle='batch',
                  callbacks=[K.callbacks.EarlyStopping(patience=10)],
                  validation_data=(xv, av[:, 0]))
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        # Assembled for inspection; not written to disk here.
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}


if __name__ == '__main__':
    main()
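
Neither script actually writes expert_data to disk. A minimal sketch of persisting it in the HDF5 layout both scripts read back ('observations' of shape (N, obs_dim), 'actions' of shape (N, 1, act_dim)); the function name and path argument are hypothetical:

    import h5py
    import numpy as np

    def save_expert_data(path, expert_data):
        # Each stored action has shape (1, act_dim), as returned by
        # policy_fn, so np.asarray(actions) stacks to (N, 1, act_dim),
        # matching the a[:, 0] indexing used during training.
        with h5py.File(path, 'w') as f:
            f.create_dataset('observations',
                             data=np.asarray(expert_data['observations']))
            f.create_dataset('actions',
                             data=np.asarray(expert_data['actions']))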