# # David Tensor Force Agent
# ## Proximal Policy Optimization (PPO)
# Contains an implementation of PPO as described in https://arxiv.org/abs/1707.06347.
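# For reference, the clipped surrogate objective from that paper, with probability
# ratio r_t(theta) = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t) and advantage A_t:
#   L_CLIP(theta) = E_t[ min(r_t(theta) * A_t, clip(r_t(theta), 1 - epsilon, 1 + epsilon) * A_t) ]
# The --epsilon and --beta options below set the clip range and the entropy bonus weight.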
import os
import shutil
import time

from docopt import docopt
from unityagents import UnityEnvironment

from ppo.models import *
from ppo.trainer import Trainer
_USAGE = '''
Usage:
  ppo (<env>) [options]

Options:
  --help                    Show this message.
  --max-steps=<n>           Maximum number of steps to run environment [default: 1e6].
  --run-path=<path>         The sub-directory name for model and summary statistics [default: ppo].
  --load                    Whether to load the model or randomly initialize [default: False].
  --train                   Whether to train model, or only run inference [default: True].
  --summary-freq=<n>        Frequency at which to save training statistics [default: 10000].
  --save-freq=<n>           Frequency at which to save model [default: 50000].
  --gamma=<n>               Reward discount rate [default: 0.99].
  --lambd=<n>               Lambda parameter for GAE [default: 0.95].
  --time-horizon=<n>        How many steps to collect per agent before adding to buffer [default: 2048].
  --beta=<n>                Strength of entropy regularization [default: 1e-3].
  --num-epoch=<n>           Number of gradient descent steps per batch of experiences [default: 5].
  --epsilon=<n>             Acceptable threshold around ratio of old and new policy probabilities [default: 0.2].
  --buffer-size=<n>         How large the experience buffer should be before gradient descent [default: 2048].
  --learning-rate=<rate>    Model learning rate [default: 3e-4].
  --hidden-units=<n>        Number of units in hidden layer [default: 64].
  --batch-size=<n>          How many experiences per gradient descent update step [default: 64].
  --keep-checkpoints=<n>    How many model checkpoints to keep [default: 5].
  --worker-id=<n>           Number to add to communication port (5005). Used for asynchronous agent scenarios [default: 0].
'''
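# Example invocation (the environment build name "3DBall" is only illustrative):
#   python ppo.py 3DBall --train --max-steps=5e5 --run-path=ball-ppo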
options = docopt(_USAGE)
print(options)
# General parameters
max_steps = float(options['--max-steps'])
model_path = './models/{}'.format(str(options['--run-path']))
summary_path = './summaries/{}'.format(str(options['--run-path']))
load_model = options['--load']
train_model = options['--train']
summary_freq = int(options['--summary-freq'])
save_freq = int(options['--save-freq'])
env_name = options['<env>']
keep_checkpoints = int(options['--keep-checkpoints'])
worker_id = int(options['--worker-id'])
# Algorithm-specific parameters for tuning
gamma = float(options['--gamma'])
lambd = float(options['--lambd'])
time_horizon = int(options['--time-horizon'])
beta = float(options['--beta'])
num_epoch = int(options['--num-epoch'])
epsilon = float(options['--epsilon'])
buffer_size = int(options['--buffer-size'])
learning_rate = float(options['--learning-rate'])
hidden_units = int(options['--hidden-units'])
batch_size = int(options['--batch-size'])
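# Note: gamma and lambd parameterize Generalized Advantage Estimation (GAE),
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t = sum_l (gamma * lambd)^l * delta_{t+l}
# where lambd trades bias (small values) against variance (large values) in the
# advantage estimates fed to the PPO update.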
env = UnityEnvironment(file_name=env_name, worker_id=worker_id)
print(str(env))
brain_name = env.brain_names[0]
tf.reset_default_graph()
# Create the Tensorflow model graph
ppo_model = create_agent_model(env, lr=learning_rate,
                               h_size=hidden_units, epsilon=epsilon,
                               beta=beta, max_step=max_steps)
is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)
use_states = (env.brains[brain_name].state_space_size > 0)
if not os.path.exists(model_path):
    os.makedirs(model_path)
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
def update_progress_bar(elapsed, remain, decimals=2, length=16, fill='█'):
    clear_progress_bar()
    percent = ("{0:." + str(decimals) + "f}").format(100 * (steps / float(max_steps)))
    filledLength = int(length * steps // max_steps)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % ('', bar, percent, f" elapsed: {elapsed}, remaining: {remain}"), end='\r')
def clear_progress_bar():
    w, h = shutil.get_terminal_size((80, 24))
    print('\r' + ' ' * w, end='\r')
init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=keep_checkpoints)
timer = time.time()
last_progress_time = -1
with tf.Session() as sess:
    # Instantiate model parameters
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)
    steps = sess.run(ppo_model.global_step)
    summary_writer = tf.summary.FileWriter(summary_path)
    if "steps" in env._resetParameters:
        config = {"steps": int(steps)}
    else:
        config = {}
    info = env.reset(train_mode=train_model, config=config)[brain_name]
    trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states)
    # Make space for progress bar
    print('\r')
    while steps <= max_steps or not train_model:
        if env.global_done:
            if "steps" in env._resetParameters:
                config = {"steps": int(steps)}
            else:
                config = {}
            info = env.reset(train_mode=train_model, config=config)[brain_name]
        # Decide and take an action
        new_info = trainer.take_action(info, env, brain_name)
        info = new_info
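        # process_experiences is expected to fold completed trajectories (episode end or
        # time_horizon reached) into the training buffer, computing discounted returns and
        # GAE advantages from gamma/lambd; update_model below then runs num_epoch passes of
        # minibatch updates of size batch_size over that buffer. (This is an assumption
        # based on the Trainer interface used here, not a guarantee from its source.)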
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        if len(trainer.training_buffer['actions']) > buffer_size and train_model:
            # Perform gradient descent with experience buffer
            trainer.update_model(batch_size, num_epoch)
        if steps % summary_freq == 0 and steps != 0 and train_model:
            # Clear progress bar
            clear_progress_bar()
            # Write training statistics to tensorboard.
            trainer.write_summary(summary_writer, steps)
        if steps % save_freq == 0 and steps != 0 and train_model:
            # Clear progress bar
            clear_progress_bar()
            # Save Tensorflow model
            save_model(sess, model_path=model_path, steps=steps, saver=saver)
            export_graph(model_path, env_name)
            print('model saved.')
        if time.time() - last_progress_time > 0.5 and steps != 0 and train_model:
            last_progress_time = time.time()
            m, s = divmod(time.time() - timer, 60)
            h, m = divmod(m, 60)
            el = f"{h:.0f}h{m:.0f}m{s:.0f}s"
            m, s = divmod((time.time() - timer) / steps * (max_steps - steps), 60)
            h, m = divmod(m, 60)
            re = f"{h:.0f}h{m:.0f}m{s:.0f}s"
            update_progress_bar(el, re)
        steps += 1
        sess.run(ppo_model.increment_step)
    # Final save Tensorflow model
    if steps != 0 and train_model:
        save_model(sess, model_path=model_path, steps=steps, saver=saver)
env.close()
export_graph(model_path, env_name)