Berkeley CS 294 Homework 1
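Two variants of the Homework 1 imitation-learning script follow. The first interleaves training with rollouts: before each rollout it refits the student policy on the current dataset, and after each rollout it relabels the visited states with expert actions (a DAgger-style loop). The second is plain behavioral cloning: it fits the student once on a fixed expert dataset, with early stopping against a held-out validation set, then evaluates it with rollouts.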
#!/usr/bin/env python
"""
Code to load an expert policy and train a student policy by behavioral
cloning with DAgger-style dataset aggregation.
Example usage:
    python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 expert_data.h5 \
        --num_rollouts 20
Author of the original script and included expert policies: Jonathan Ho ([email protected])
"""
import argparse
import pickle
import tensorflow as tf
import numpy as np
import numpy.random as RNG
import tf_util
import gym
import load_policy
import tensorflow.contrib.keras as K
import h5py


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('input', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--max_timesteps', type=int)
    parser.add_argument('--datacap', type=int, default=100000)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building student policy')

    # Load the pre-collected expert dataset, capped at --datacap samples.
    # Copy into memory so rows can be overwritten during aggregation.
    data = h5py.File(args.input, 'r')
    x = np.copy(data['observations'][:args.datacap])
    a = np.copy(data['actions'][:args.datacap])
    data.close()

    input_dim = x.shape[1]
    output_dim = a.shape[2]

    # Student policy: a 3-hidden-layer tanh MLP regressing actions from
    # (crudely rescaled) observations with a mean-squared-error loss.
    model = K.models.Sequential()
    model.add(K.layers.Lambda(lambda obs: obs / 10, input_shape=(input_dim,)))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=output_dim))
    model.compile(loss='mean_squared_error', optimizer='rmsprop')

    def policy_fn(obs):
        return model.predict(obs, batch_size=obs.shape[0])

    policy_fn_expert = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            current_obs = []
            current_acts = []
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            # Refit the student on the (partly relabeled) dataset before
            # each rollout.
            model.fit(x, a[:, 0], batch_size=200, epochs=5, shuffle='batch')
            while not done:
                action = policy_fn(obs[None, :])
                current_obs.append(obs)
                current_acts.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print('%i/%i' % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)
            # DAgger aggregation step: overwrite randomly chosen rows of the
            # dataset with the states the student just visited, labeled with
            # the expert's actions on those states. (Indices are drawn with
            # replacement, over the actual dataset size rather than the cap.)
            current_obs = np.array(current_obs)
            idx = RNG.choice(len(x), len(current_obs))
            x[idx] = current_obs
            a[idx, 0] = policy_fn_expert(current_obs)
            observations.extend(current_obs)
            actions.extend(current_acts)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        # Collected rollout data; note this dict is assembled but never
        # written to disk by the script itself.
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}


if __name__ == '__main__':
    main()
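Note that expert_data above is assembled but never written out, even though pickle is imported. A minimal sketch of how it could be persisted; the helper and the output path (dagger_rollouts.pkl) are illustrative, not part of the assignment code:

import pickle

def save_rollouts(expert_data, path='dagger_rollouts.pkl'):
    # expert_data maps 'observations'/'actions' to numpy arrays,
    # as built at the end of main() above.
    with open(path, 'wb') as f:
        pickle.dump(expert_data, f)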
#!/usr/bin/env python
"""
Code to load an expert dataset and train a student policy by plain
behavioral cloning, with early stopping on a held-out validation set.
Example usage:
    python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 expert_data.h5 \
        valid_data.h5 --num_rollouts 20
Author of the original script and included expert policies: Jonathan Ho ([email protected])
"""
import argparse
import pickle
import tensorflow as tf
import numpy as np
import tf_util
import gym
import load_policy
import tensorflow.contrib.keras as K
import h5py


def main():
    parser = argparse.ArgumentParser()
    # The expert policy file is accepted for interface parity with the
    # DAgger variant above, but this script never queries the expert.
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('input', type=str)
    parser.add_argument('valid', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--max_timesteps', type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building student policy')

    # Training set (capped at 100k samples) and held-out validation set;
    # slicing the h5py datasets materializes them as in-memory numpy arrays.
    data = h5py.File(args.input, 'r')
    valid = h5py.File(args.valid, 'r')
    x = data['observations'][:100000]
    a = data['actions'][:100000]
    xv = valid['observations'][:]
    av = valid['actions'][:]
    data.close()
    valid.close()

    input_dim = x.shape[1]
    output_dim = a.shape[2]

    # Same student architecture as the DAgger variant: a 3-hidden-layer
    # tanh MLP trained with a mean-squared-error loss.
    model = K.models.Sequential()
    model.add(K.layers.Lambda(lambda obs: obs / 10, input_shape=(input_dim,)))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=100))
    model.add(K.layers.Activation('tanh'))
    model.add(K.layers.Dense(units=output_dim))
    model.compile(loss='mean_squared_error', optimizer='rmsprop')

    def policy_fn(obs):
        return model.predict(obs, batch_size=obs.shape[0])

    print('loaded and built')

    with tf.Session():
        tf_util.initialize()
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []

        # Plain behavioral cloning: fit once on the fixed expert dataset,
        # stopping early when validation loss plateaus for 10 epochs.
        model.fit(x, a[:, 0], batch_size=200, epochs=50, shuffle='batch',
                  callbacks=[K.callbacks.EarlyStopping(patience=10)],
                  validation_data=(xv, av[:, 0]))

        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print('%i/%i' % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        # Collected rollout data; as above, assembled but never saved.
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}


if __name__ == '__main__':
    main()
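Both scripts expect HDF5 input with 'observations' of shape (N, obs_dim) and 'actions' of shape (N, 1, act_dim), which is why they index a[:, 0] before fitting. A minimal sketch of writing rollout data in that layout; the helper and file name are illustrative, not part of the starter code:

import h5py
import numpy as np

def save_dataset(path, observations, actions):
    # observations: list of obs_dim vectors; actions: list of (1, act_dim)
    # arrays, matching what the rollout loops above collect.
    with h5py.File(path, 'w') as f:
        f.create_dataset('observations', data=np.array(observations))
        f.create_dataset('actions', data=np.array(actions))

For example, save_dataset('expert_data.h5', observations, actions) after a set of expert rollouts would produce a file these scripts can consume.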