"""
Pendulum-v0 submission using DDPG (Deep Deterministic Policy Gradient) without batch normalisation.
Forked from Anjum48/ddpg_gym.py.

Algorithm and hyperparameter details can be found in the paper: http://arxiv.org/pdf/1509.02971v2.pdf
Variance scaling paper: https://arxiv.org/pdf/1502.01852v1.pdf
Thanks to GitHub users yanpanlau, pemami4911, songrotek and JunhongXu for their DDPG examples.

Batch normalisation on the actor accelerates learning but has poor long-term stability. Applying it to the
critic breaks learning, particularly on the state branch. The cause is unclear; the issue may be specific to
this environment.
"""
import numpy as np
import tensorflow as tf
import tflearn
import random
import os
import pickle
import gym
from gym import wrappers
from time import time
from datetime import datetime
from collections import deque
try:
    from Forex.fxsettings import OUTPUT_RESULTS_DIR
except ImportError:
    OUTPUT_RESULTS_DIR = os.pardir
# ==========================
# Training Parameters
# ==========================
# Max number of training episodes
MAX_EPISODES = 1000
# Max episode length
MAX_EP_STEPS = 2000
# Base learning rate for the Actor network
ACTOR_LEARNING_RATE = 0.0001 # Paper uses 0.0001
# Base learning rate for the Critic Network
CRITIC_LEARNING_RATE = 0.001 # Paper uses 0.001
# L2 weight decay for Q
L2_DECAY = 0.01 # Paper uses 0.01
# Discount factor
GAMMA = 0.99 # Paper uses 0.99
# Soft target update param
TAU = 0.001 # Paper uses 0.001
# Exploration parameters
OU_MU = 0.0
OU_THETA = 0.15 # Paper uses 0.15
OU_SIGMA = 0.20 # Paper uses 0.20
# Time constant (in episodes) for the exploration epsilon decay: epsilon = exp(-episode / TAU2)
TAU2 = 25
RESTORE_DATE = None
# RESTORE_DATE = "20170415-213317"
# ===========================
# Utility Parameters
# ===========================
# Random seed for repeatability
RANDOM_SEED = 1234
# Size of replay buffer
BUFFER_SIZE = 1000000
MINIBATCH_SIZE = 64
# ENVIRONMENT = 'Pendulum-v0'
# ENVIRONMENT = 'MountainCarContinuous-v0'
# ENVIRONMENT = 'SemisuperPendulumNoise-v0'
ENVIRONMENT = 'SemisuperPendulumRandom-v0'
# Directory for storing TensorBoard summary results
if RESTORE_DATE is not None:
    SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, 'tf_ddpg', "gym", ENVIRONMENT, RESTORE_DATE)
else:
    TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
    SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, "tf_ddpg", "gym", ENVIRONMENT, TIMESTAMP)
# ===========================
# Replay Buffer
# ===========================
class ReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=None):
        """
        The right side of the deque contains the most recent experiences.
        The buffer stores a number of past experiences to stochastically sample from.
        (See the illustrative usage sketch below this class.)
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque(maxlen=self.buffer_size)
        self.seed = random_seed
        if self.seed is not None:
            random.seed(self.seed)

    def add(self, state, action, reward, t, s2):
        experience = (state, action, reward, t, s2)
        self.buffer.append(experience)
        self.count += 1

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch]).reshape(-1, 1)
        t_batch = np.array([_[3] for _ in batch]).reshape(-1, 1)
        s2_batch = np.array([_[4] for _ in batch])
        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def clear(self):
        self.buffer.clear()
        self.count = 0
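
# Illustrative only: a minimal, hedged sketch of how the replay buffer above is exercised during training.
# It is not called anywhere in this script; `_replay_buffer_demo` is a placeholder name and the transition
# values are made up.
def _replay_buffer_demo():
    buffer = ReplayBuffer(buffer_size=100, random_seed=RANDOM_SEED)
    for step in range(5):
        s = np.random.randn(3)                     # fake 3-dimensional state (Pendulum-v0 sized)
        a = np.random.uniform(-2, 2, size=(1,))    # fake 1-dimensional action
        r = float(-np.sum(s ** 2))                 # fake reward
        terminal = step == 4                       # mark the last transition as terminal
        s2 = np.random.randn(3)                    # fake next state
        buffer.add(s, a, r, terminal, s2)
    # Sample a small batch; rewards and terminal flags come back as column vectors
    s_b, a_b, r_b, t_b, s2_b = buffer.sample_batch(batch_size=4)
    return s_b.shape, a_b.shape, r_b.shape  # -> (4, 3), (4, 1), (4, 1)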
# ===========================
# Ornstein-Uhlenbeck noise
# ===========================
class OUNoise:
    """
    Ornstein-Uhlenbeck process for temporally correlated exploration noise:
    dx = theta * (mu - x) + sigma * N(0, 1), applied once per environment step.
    (See the illustrative sketch below this class.)
    """
    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.3, seed=123):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu
        self.reset()
        np.random.seed(seed)

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
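
# Illustrative only: a hedged sketch of how OU noise is combined with the epsilon schedule used in train()
# below (epsilon = exp(-episode / TAU2)). Not called anywhere; `_exploration_demo` is a placeholder name.
def _exploration_demo(episode=25, action_bound=2.0):
    ou = OUNoise(action_dimension=1, mu=OU_MU, theta=OU_THETA, sigma=OU_SIGMA, seed=RANDOM_SEED)
    epsilon = np.exp(-episode / TAU2)     # episode 0 -> 1.0, episode 25 -> ~0.37, episode 75 -> ~0.05
    greedy_action = np.array([[1.5]])     # pretend this came from actor.predict(...)
    noisy_action = greedy_action + epsilon * ou.noise() / action_bound
    return np.clip(noisy_action, -action_bound, action_bound)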
# ===========================
# Actor and Critic DNNs
# ===========================
class ActorNetwork(object):
    """
    Input to the network is the state, output is the action
    under a deterministic policy.
    The output layer activation is a tanh to keep the action
    between -action_bound and action_bound
    """
    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau, restore=False):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau
        self.is_training = tf.placeholder(tf.bool, name='Actor_is_training')

        if not restore:
            # Actor network
            self.inputs, self.outputs, self.scaled_outputs = self.create_actor_network()
            self.net_params = tf.trainable_variables()  # Returns a list of Variables where trainable=True

            # Target network
            self.target_inputs, self.target_outputs, self.target_scaled_outputs = self.create_actor_network("_target")
            self.target_net_params = tf.trainable_variables()[len(self.net_params):]

            # Temporary placeholder action gradient - this gradient will be provided by the critic network
            self.action_gradients = tf.placeholder(tf.float32, [None, self.action_dim], name="actor_action_gradient")

            # Combine dnetScaledOut/dnetParams with criticToActionGradient to get actorGradient
            self.actor_gradients = tf.gradients(self.scaled_outputs, self.net_params, -self.action_gradients,
                                                name="actor_gradient")
            self.optimize = tf.train.AdamOptimizer(self.learning_rate, name='Adam_Actor'). \
                apply_gradients(zip(self.actor_gradients, self.net_params))

            tf.add_to_collection('Actor_action_gradients', self.action_gradients)
            tf.add_to_collection('Actor_optimize', self.optimize)
        else:
            # Load Actor network
            self.inputs, self.outputs, self.scaled_outputs = self.load_actor_network()
            # Filter the loaded trainable variables for those belonging only to the actor network
            self.net_params = [v for v in tf.trainable_variables() if "actor" in v.name and "target" not in v.name]

            # Load Target network
            self.target_inputs, self.target_outputs, self.target_scaled_outputs = self.load_actor_network(True)
            # Filter the loaded trainable variables for those belonging only to the target actor network
            self.target_net_params = [v for v in tf.trainable_variables() if "actor" in v.name and "target" in v.name]

            self.action_gradients = tf.get_collection('Actor_action_gradients')[0]
            self.optimize = tf.get_collection('Actor_optimize')[0]

        # Op for periodically updating target network with online network weights
        self.update_target_net_params = \
            [self.target_net_params[i].assign(tf.multiply(self.net_params[i], self.tau) +
                                              tf.multiply(self.target_net_params[i], 1. - self.tau))
             for i in range(len(self.target_net_params))]

        self.num_trainable_vars = len(self.net_params) + len(self.target_net_params)

    def create_actor_network(self, suffix=""):
        state = tflearn.input_data(shape=[None, self.state_dim], name='actor_input'+suffix)
        # state_bn = tf.layers.batch_normalization(state, training=self.is_training, scale=False,
        #                                          name='actor_BN_input'+suffix)
        net = tflearn.fully_connected(state, 400, activation='relu', name='actor_L1'+suffix,
                                      weights_init=tflearn.initializations.variance_scaling(seed=RANDOM_SEED))
        if suffix == "":
            tf.summary.histogram("Actor/Layer1", net.W)
        # net = tf.layers.batch_normalization(net, training=self.is_training, scale=False,
        #                                     name='actor_BN1'+suffix)
        net = tflearn.fully_connected(net, 300, activation='relu', name='actor_L2'+suffix,
                                      weights_init=tflearn.initializations.variance_scaling(seed=RANDOM_SEED))
        if suffix == "":
            tf.summary.histogram("Actor/Layer2", net.W)
        # net = tf.layers.batch_normalization(net, training=self.is_training, scale=True,
        #                                     name='actor_BN2'+suffix)

        # Final layer weights are initialized to Uniform[-3e-3, 3e-3]
        weight_init_final = tflearn.initializations.uniform(minval=-0.003, maxval=0.003, seed=RANDOM_SEED)
        action = tflearn.fully_connected(net, self.action_dim, activation='tanh', weights_init=weight_init_final,
                                         name='actor_output'+suffix)
        # Scale output to [-action_bound, action_bound]
        scaled_action = tf.multiply(action, self.action_bound, name='actor_output_scaled'+suffix)
        return state, action, scaled_action

    @staticmethod
    def load_actor_network(target=False):
        suffix = "_target" if target else ""
        inputs = tf.get_default_graph().get_tensor_by_name("actor_input"+suffix+"/X:0")
        out = tf.get_default_graph().get_tensor_by_name("actor_output"+suffix+"/Tanh:0")
        scaled_out = tf.get_default_graph().get_tensor_by_name("actor_output_scaled"+suffix+":0")
        return inputs, out, scaled_out

    def train(self, inputs, action_gradients):
        # Extra ops for BN. Parameters associated with the target network are ignored
        extra_update_ops = [v for v in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if
                            "actor" in v.name and "target" not in v.name]
        return self.sess.run([self.optimize, extra_update_ops],
                             feed_dict={self.inputs: inputs, self.action_gradients: action_gradients,
                                        self.is_training: True})

    def predict(self, inputs):
        return self.sess.run(self.scaled_outputs, feed_dict={self.inputs: inputs,
                                                             self.is_training: False})

    def predict_target(self, inputs):
        return self.sess.run(self.target_scaled_outputs, feed_dict={self.target_inputs: inputs,
                                                                    self.is_training: False})

    def update_target_network(self):
        self.sess.run(self.update_target_net_params, feed_dict={self.is_training: False})

    def get_num_trainable_vars(self):
        return self.num_trainable_vars
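
# Illustrative only: the soft target update built for the actor above (and repeated for the critic below)
# implements
#     theta_target <- tau * theta_online + (1 - tau) * theta_target
# while the actor update follows the deterministic policy gradient, chaining dQ/da from the critic through
# the actor: grad_theta(J) ~ mean_over_batch( dQ(s, a)/da * d mu(s)/d theta ).
# A tiny numpy sketch of the soft update (plain arrays standing in for TF variables; `_soft_update_demo`
# is a placeholder name and is never called):
def _soft_update_demo(tau=TAU):
    theta_online = np.array([1.0, -2.0, 0.5])
    theta_target = np.array([0.0, 0.0, 0.0])
    for _ in range(3):  # after many updates the target slowly tracks the online weights
        theta_target = tau * theta_online + (1. - tau) * theta_target
    return theta_target  # ~ [0.003, -0.006, 0.0015] for tau = 0.001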
class CriticNetwork(object):
    """
    Input to the network is the state and action, output is Q(s,a).
    The action must be obtained from the output of the Actor network.
    """
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, restore=False):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.is_training = tf.placeholder(tf.bool, name='Critic_is_training')

        if not restore:
            # Create the Critic network
            self.inputs, self.action, self.outputs = self.create_critic_network()
            self.network_params = [v for v in tf.trainable_variables() if "critic" in v.name]

            # Create the Target Network
            self.target_inputs, self.target_action, self.target_outputs = self.create_critic_network("_target")
            self.target_network_params = [v for v in tf.trainable_variables() if
                                          "critic" in v.name and "target" in v.name]

            # Network target (y_i) - Obtained from the target networks
            self.q_value = tf.placeholder(tf.float32, [None, self.action_dim], name="critic_q_value")
            self.L2 = tf.add_n([L2_DECAY * tf.nn.l2_loss(v) for v in self.network_params if "/W" in v.name])
            self.loss = tf.losses.mean_squared_error(self.q_value, self.outputs) + self.L2
            self.optimize = tf.train.AdamOptimizer(self.learning_rate, name='Adam_Critic').minimize(self.loss)

            tf.add_to_collection('Critic_q_value', self.q_value)
            tf.add_to_collection('Critic_loss', self.loss)
            tf.add_to_collection('Critic_optimize', self.optimize)
        else:
            # Load the Critic network
            self.inputs, self.action, self.outputs = self.load_critic_network()
            # Filter the loaded trainable variables for those belonging only to the critic network
            self.network_params = [v for v in tf.trainable_variables() if
                                   "critic" in v.name and "target" not in v.name]

            # Load the Target Network
            self.target_inputs, self.target_action, self.target_outputs = self.load_critic_network(True)
            # Filter the loaded trainable variables for those belonging only to the target critic network
            self.target_network_params = [v for v in tf.trainable_variables() if
                                          "critic" in v.name and "target" in v.name]

            self.q_value = tf.get_collection('Critic_q_value')[0]
            self.L2 = tf.add_n([L2_DECAY * tf.nn.l2_loss(v) for v in self.network_params if "/W" in v.name])
            self.loss = tf.get_collection('Critic_loss')[0] + self.L2
            self.optimize = tf.get_collection('Critic_optimize')[0]

        # Op for periodically updating target network with online network weights
        self.update_target_net_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                                  tf.multiply(self.target_network_params[i], 1. - self.tau))
             for i in range(len(self.target_network_params))]

        # Get the gradient of the critic w.r.t. the action
        self.action_grads = tf.gradients(self.outputs, self.action, name="critic_action_gradient")

        tf.summary.scalar("L2", self.L2)

    def create_critic_network(self, suffix=""):
        # Critic breaks when BN is added to the state in Pendulum-v0. Not sure why :(
        state = tflearn.input_data(shape=[None, self.state_dim], name="critic_input_state"+suffix)
        # state_bn = tf.layers.batch_normalization(state, training=self.is_training, scale=False,
        #                                          name='critic_BN_input'+suffix)
        action = tflearn.input_data(shape=[None, self.action_dim], name="critic_input_action"+suffix)
        # action_bn = tf.layers.batch_normalization(action, training=self.is_training, scale=False,
        #                                           name='critic_BN_action'+suffix)
        net = tflearn.fully_connected(state, 400, activation='relu', name='critic_L1'+suffix,
                                      weights_init=tflearn.initializations.variance_scaling(seed=RANDOM_SEED))
        if suffix == "":
            tf.summary.histogram("Critic/Layer1", net.W)
        # net = tf.layers.batch_normalization(net, training=self.is_training, scale=False,
        #                                     name='critic_BN1'+suffix)

        # Add the action tensor in the 2nd hidden layer and create variables for W's and b
        s_union = tflearn.fully_connected(net, 300, name="critic_L2_state" + suffix,
                                          weights_init=tflearn.initializations.variance_scaling(seed=RANDOM_SEED))
        a_union = tflearn.fully_connected(action, 300, name="critic_L2_action" + suffix,
                                          weights_init=tflearn.initializations.variance_scaling(seed=RANDOM_SEED))
        net = tf.nn.relu(tf.matmul(net, s_union.W) + tf.matmul(action, a_union.W) + s_union.b,
                         name='critic_L2' + suffix)
        if suffix == "":
            tf.summary.histogram("Critic/Layer2/state", s_union.W)
            tf.summary.histogram("Critic/Layer2/action", a_union.W)

        # Linear layer connected to action_dim outputs representing Q(s,a). Weights are init to Uniform[-3e-3, 3e-3]
        weight_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003, seed=RANDOM_SEED)
        q_value = tflearn.fully_connected(net, self.action_dim, activation="linear",
                                          weights_init=weight_init, name='critic_output'+suffix)
        return state, action, q_value

    @staticmethod
    def load_critic_network(target=False):
        suffix = "_target" if target else ""
        inputs = tf.get_default_graph().get_tensor_by_name("critic_input_state"+suffix+"/X:0")
        action = tf.get_default_graph().get_tensor_by_name("critic_input_action"+suffix+"/X:0")
        out = tf.get_default_graph().get_tensor_by_name("critic_output"+suffix+"/BiasAdd:0")
        return inputs, action, out

    def train(self, inputs, action, target_q_value):
        # Extra ops for BN. Parameters associated with the target network are ignored
        extra_update_ops = [v for v in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if
                            "critic" in v.name and "target" not in v.name]
        return self.sess.run([self.optimize, self.loss, extra_update_ops], feed_dict={
            self.inputs: inputs,
            self.action: action,
            self.q_value: target_q_value,
            self.is_training: True
        })[:2]

    def predict(self, inputs, action):
        return self.sess.run(self.outputs, feed_dict={self.inputs: inputs, self.action: action,
                                                      self.is_training: False})

    def predict_target(self, inputs, action):
        return self.sess.run(self.target_outputs, feed_dict={self.target_inputs: inputs, self.target_action: action,
                                                             self.is_training: False})

    def action_gradients(self, inputs, action):
        return self.sess.run(self.action_grads, feed_dict={self.inputs: inputs, self.action: action,
                                                           self.is_training: False})

    def update_target_network(self):
        self.sess.run(self.update_target_net_params, feed_dict={self.is_training: False})
# ===========================
# TensorFlow Summary Ops
# ===========================
def add_histogram(writer, tag, values, step, bins=1000):
    """
    Logs the histogram of a list/vector of values.
    From: https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514
    """
    # Create histogram using numpy
    counts, bin_edges = np.histogram(values, bins=bins)

    # Fill fields of histogram proto
    hist = tf.HistogramProto()
    hist.min = float(np.min(values))
    hist.max = float(np.max(values))
    hist.num = int(np.prod(values.shape))
    hist.sum = float(np.sum(values))
    hist.sum_squares = float(np.sum(values ** 2))

    # Requires equal number as bins, where the first goes from -DBL_MAX to bin_edges[1]
    # See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto#L30
    # Thus, we drop the start of the first bin
    bin_edges = bin_edges[1:]

    # Add bin edges and counts
    for edge in bin_edges:
        hist.bucket_limit.append(edge)
    for c in counts:
        hist.bucket.append(c)

    # Create and write Summary
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
    writer.add_summary(summary, step)
# ===========================
# Agent Training
# ===========================
def train(sess, env, actor, critic, saver, replay_buffer):
    """
    Train Actor and Critic networks and save checkpoints
    :param sess: TensorFlow session
    :param env: environment to be used for training
    :param actor: Actor network
    :param critic: Critic network
    :param saver: TensorFlow saver object
    :param replay_buffer: Replay buffer to store experience
    :return:
    """
    # Initialise variables
    if RESTORE_DATE is None:
        sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights & noise function
    actor.update_target_network()
    critic.update_target_network()

    for i in range(MAX_EPISODES):
        start = time()
        ep_rewards = []
        ep_q_rmse = []
        ep_action_dist = []
        ep_loss = []

        env.seed(RANDOM_SEED + i)
        s = env.reset()
        exploration_noise = OUNoise(actor.action_dim, OU_MU, OU_THETA, OU_SIGMA, RANDOM_SEED + i)

        for j in range(MAX_EP_STEPS):
            # if i > 100:
            env.render()
            a = actor.predict(np.reshape(s, (1, actor.state_dim)))  # Reshape state into a single-row batch

            # Add exploration noise
            if RESTORE_DATE is None:
                epsilon = np.exp(-i/TAU2)
                a += epsilon * exploration_noise.noise() / env.action_space.high
            else:
                epsilon = 0

            # Step forward in the environment
            a = np.clip(a, env.action_space.low, env.action_space.high)
            s2, r, terminal, info = env.step(a[0])
            ep_action_dist.append(a[0])

            replay_buffer.add(np.reshape(s, (actor.state_dim,)),   # Previous state
                              np.reshape(a, (actor.action_dim,)),  # Action
                              r,                                   # Reward
                              terminal,                            # Terminal state (bool)
                              np.reshape(s2, (actor.state_dim,)))  # New state

            # Keep adding experience to the memory until there are at least MINIBATCH_SIZE samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate target q
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))
                ep_q_rmse.append(np.sqrt(np.mean((target_q - r_batch) ** 2, axis=0)))
                y = r_batch + GAMMA * target_q * ~t_batch  # No bootstrapping past terminal states

                # Update the critic given the targets
                _, loss = critic.train(s_batch, a_batch, y)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                # grads = np.clip(grads, -1, 1)  # Gradient clipping to prevent exploding gradients
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()
            else:
                loss = 0

            s = s2  # new state is the output from this step
            ep_rewards.append(r)
            ep_loss.append(loss)

            if terminal or j == MAX_EP_STEPS - 1:
                # Add results to summaries
                episode_summary = tf.Summary()
                episode_summary.value.add(tag="Reward", simple_value=np.sum(ep_rewards))
                episode_summary.value.add(tag="Q_RMSE", simple_value=np.mean(ep_q_rmse))
                episode_summary.value.add(tag="Epsilon", simple_value=epsilon)
                episode_summary.value.add(tag="Loss", simple_value=loss)

                # Hack to add histograms
                add_histogram(writer, "Actions", np.ravel(ep_action_dist), i)
                add_histogram(writer, "Rewards", np.array(ep_rewards), i)

                summary_str = sess.run(tf.summary.merge_all())
                writer.add_summary(episode_summary, i)
                writer.add_summary(summary_str, i)
                writer.flush()

                print('Reward: %.2f' % np.sum(ep_rewards), '\t Episode', i,
                      '\tQ RMSE: %.2f' % np.mean(ep_q_rmse),
                      '\tTime: %.1f' % (time() - start),
                      '\tEpsilon: %.3f' % epsilon,
                      '\tLoss: %.3f' % np.mean(ep_loss))
                exploration_noise.reset()
                break

        # Save model every 50 episodes
        if i % 50 == 0 and i != 0:
            save_start = time()
            save_path = saver.save(sess, os.path.join(SUMMARY_DIR, "ddpg_model"))
            pickle.dump(replay_buffer, open(os.path.join(SUMMARY_DIR, "replay_buffer.pkl"), "wb"))
            print("Model saved in %.1f" % (time() - save_start), "seconds. Path: %s" % save_path)
def main(_):
    # Need to split actor & critic into different graphs/sessions to prevent serialisation errors
    # See https://github.com/tflearn/tflearn/issues/381
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.05

    with tf.Session(config=config) as sess:
        env = gym.make(ENVIRONMENT)
        env = wrappers.Monitor(env, os.path.join(SUMMARY_DIR, ENVIRONMENT+'-experiment'), force=True)
        tf.set_random_seed(RANDOM_SEED)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        assert np.all(env.action_space.high == -env.action_space.low)

        # Restore networks and replay buffer from disk, otherwise create new ones
        if RESTORE_DATE is not None:
            saver = tf.train.import_meta_graph(os.path.join(SUMMARY_DIR, "ddpg_model.meta"))
            saver.restore(sess, os.path.join(SUMMARY_DIR, "ddpg_model"))
            actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU, restore=True)
            critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, restore=True)

            # Initialise the uninitialised variables
            uninitialized_vars = []
            for var in tf.global_variables():
                try:
                    sess.run(var)
                except tf.errors.FailedPreconditionError:
                    uninitialized_vars.append(var)
            sess.run(tf.variables_initializer(uninitialized_vars))

            replay_buffer = pickle.load(open(os.path.join(SUMMARY_DIR, "replay_buffer.pkl"), "rb"))
            print("Model restored from %s" % os.path.join(SUMMARY_DIR, "ddpg_model"))
        else:
            actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU)
            critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU)
            saver = tf.train.Saver()
            replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        # Start training given a session, environment, actor & critic
        train(sess, env, actor, critic, saver, replay_buffer)
        env.close()
        gym.upload(os.path.join(SUMMARY_DIR, ENVIRONMENT+'-experiment'), api_key='sk_fjbPP4esQeGGTPYRSmKnPA')
if __name__ == '__main__':
    # Quick wrapper that handles flag parsing and then dispatches to your own main
    # http://stackoverflow.com/questions/33703624/how-does-tf-app-run-work
    tf.app.run()