DDPG algorithm according to https://arxiv.org/abs/1509.02971. Implementation based on https://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html, with added Ornstein-Uhlenbeck process noise and exponential and tanh noise decay.
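For reference, this is roughly how the noise pipeline is used in the training loop further below (a minimal sketch; it assumes the explorationnoise.py module from this gist is importable and reuses the same hyperparameter values as the training script):

from explorationnoise import ExplorationNoise

# Hyperparameters as used in the training script below
OU_THETA, OU_MU, OU_SIGMA = 0.15, 0., 0.3
MAX_STEPS_EPISODE, EXPLORATION_TIME = 500, 200

# One Ornstein-Uhlenbeck sequence per episode, decayed to zero over the
# first EXPLORATION_TIME steps of the episode
noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA, MAX_STEPS_EPISODE)
noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME)

print(noise.shape)        # (500,) -- one noise value per environment step
print(noise[200:].any())  # False  -- no exploration noise after the decay ends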
""" | |
Data structure for implementing actor network for DDPG algorithm | |
Algorithm and hyperparameter details can be found here: | |
http://arxiv.org/pdf/1509.02971v2.pdf | |
Original author: Patrick Emami | |
Author: Bart Keulen | |
""" | |
import tensorflow as tf | |
import tflearn | |
class ActorNetwork(object): | |
def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau): | |
self.sess = sess | |
self.state_dim = state_dim | |
self.action_dim = action_dim | |
self.action_bound = action_bound | |
self.learning_rate = learning_rate | |
self.tau = tau | |
# Actor network | |
self.inputs, self.outputs, self.scaled_outputs = self.create_actor_network() | |
self.net_params = tf.trainable_variables() | |
# Target network | |
self.target_inputs, self.target_outputs, self.target_scaled_outputs = self.create_actor_network() | |
self.target_net_params = tf.trainable_variables()[len(self.net_params):] | |
# Op for periodically updating target network with online network weights | |
self.update_target_net_params = \ | |
[self.target_net_params[i].assign(tf.mul(self.net_params[i], self.tau) + | |
tf.mul(self.target_net_params[i], 1. - self.tau)) | |
for i in range(len(self.target_net_params))] | |
# Temporary placeholder action gradient | |
self.action_gradients = tf.placeholder(tf.float32, [None, self.action_dim]) | |
# Combine dnetScaledOut/dnetParams with criticToActionGradient to get actorGradient | |
self.actor_gradients = tf.gradients(self.scaled_outputs, self.net_params, -self.action_gradients) | |
# Optimization Op | |
self.optimize = tf.train.AdamOptimizer(self.learning_rate).\ | |
apply_gradients(zip(self.actor_gradients, self.net_params)) | |
self.num_trainable_vars = len(self.net_params) + len(self.target_net_params) | |
def create_actor_network(self): | |
inputs = tflearn.input_data(shape=[None, self.state_dim]) | |
net = tflearn.fully_connected(inputs, 400, activation='relu') | |
net = tflearn.fully_connected(net, 300, activation='relu') | |
# Final layer weight are initialized to Uniform[-3e-3, 3e-3] | |
weight_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003) | |
outputs = tflearn.fully_connected(net, self.action_dim, activation='tanh', weights_init=weight_init) | |
scaled_outputs = tf.mul(outputs, self.action_bound) # Scale output to [-action_bound, action_bound] | |
return inputs, outputs, scaled_outputs | |
def train(self, inputs, action_gradients): | |
return self.sess.run(self.optimize, feed_dict={ | |
self.inputs: inputs, | |
self.action_gradients: action_gradients | |
}) | |
def predict(self, inputs): | |
return self.sess.run(self.scaled_outputs, feed_dict={ | |
self.inputs: inputs | |
}) | |
def predict_target(self, inputs): | |
return self.sess.run(self.target_scaled_outputs, feed_dict={ | |
self.target_inputs: inputs | |
}) | |
def update_target_network(self): | |
self.sess.run(self.update_target_net_params) | |
def get_num_trainable_vars(self): | |
return self.num_trainable_vars |
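The self.actor_gradients line above implements the chain rule behind the deterministic policy gradient: tf.gradients(scaled_outputs, net_params, -action_gradients) back-propagates the critic's gradient dQ/da through the policy, giving dQ/dtheta (negated so that Adam's minimization performs ascent on Q). A toy NumPy sketch of that chain rule, with a one-parameter linear policy and a made-up dQ/da value:

import numpy as np

# Toy policy mu(s) = w * s with a single weight, so d mu / d w = s
w = 0.7
s = np.array([2.0])
a = w * s

# Gradient of the critic w.r.t. the action, as CriticNetwork.action_gradients
# would return it (a made-up value here)
dq_da = np.array([0.5])

# Chain rule: dQ/dw = (dQ/da) * (da/dw)
dq_dw = dq_da * s
print(dq_dw)  # [1.]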
""" | |
Data structure for implementing critic network for DDPG algorithm | |
Algorithm and hyperparameter details can be found here: | |
http://arxiv.org/pdf/1509.02971v2.pdf | |
Original author: Patrick Emami | |
Author: Bart Keulen | |
""" | |
import tensorflow as tf | |
import tflearn | |
class CriticNetwork(object): | |
def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau, num_actor_vars): | |
self.sess = sess | |
self.state_dim = state_dim | |
self.action_dim = action_dim | |
self.action_bound = action_bound | |
self.learning_rate = learning_rate | |
self.tau = tau | |
# Critic network | |
self.inputs, self.action, self.outputs = self.create_critic_network() | |
self.net_params = tf.trainable_variables()[num_actor_vars:] | |
# Target network | |
self.target_inputs, self.target_action, self.target_outputs = self.create_critic_network() | |
self.target_net_params = tf.trainable_variables()[len(self.net_params) + num_actor_vars:] | |
# Op for periodically updating target network with online network weights | |
self.update_target_net_params = \ | |
[self.target_net_params[i].assign(tf.mul(self.net_params[i], self.tau) + | |
tf.mul(self.target_net_params[i], 1. - self.tau)) | |
for i in range(len(self.target_net_params))] | |
# Network target (y_i) | |
# Obtained from the target networks | |
self.predicted_q_value = tf.placeholder(tf.float32, [None, 1]) | |
# Define loss and optimization Op | |
self.loss = tflearn.mean_square(self.predicted_q_value, self.outputs) | |
self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss) | |
# Get the gradient of the critic w.r.t. the action | |
self.action_grads = tf.gradients(self.outputs, self.action) | |
def create_critic_network(self): | |
inputs = tflearn.input_data(shape=[None, self.state_dim]) | |
action = tflearn.input_data(shape=[None, self.action_dim]) | |
net = tflearn.fully_connected(inputs, 400, activation='relu') | |
# Add the action tensor in the 2nd hidden layer | |
# Use two temp layers to get the corresponding weights and biases | |
t1 = tflearn.fully_connected(net, 300) | |
t2 = tflearn.fully_connected(action, 300) | |
net = tflearn.activation(tf.matmul(net, t1.W) + tf.matmul(action, t2.W) + t2.b, activation='relu') | |
# Linear layer connected to 1 output representing Q(s,a) | |
# Weights are init to Uniform[-3e-3, 3e-3] | |
weight_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003) | |
outputs = tflearn.fully_connected(net, 1, weights_init=weight_init) | |
return inputs, action, outputs | |
def train(self, inputs, action, predicted_q_value): | |
return self.sess.run([self.outputs, self.optimize], feed_dict={ | |
self.inputs: inputs, | |
self.action: action, | |
self.predicted_q_value: predicted_q_value | |
}) | |
def predict(self, inputs, action): | |
return self.sess.run(self.outputs, feed_dict={ | |
self.inputs: inputs, | |
self.action: action | |
}) | |
def predict_target(self, inputs, action): | |
return self.sess.run(self.target_outputs, feed_dict={ | |
self.target_inputs: inputs, | |
self.target_action: action | |
}) | |
def action_gradients(self, inputs, action): | |
return self.sess.run(self.action_grads, feed_dict={ | |
self.inputs: inputs, | |
self.action: action | |
}) | |
def update_target_network(self): | |
self.sess.run(self.update_target_net_params) |
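The critic above is trained against targets y_i = r + GAMMA * target_Q(s2, target_mu(s2)) computed from the target networks, with the bootstrap term dropped on terminal transitions; the training loop below builds these targets in a plain Python loop. A minimal NumPy sketch of that computation, using made-up rewards and target Q-values:

import numpy as np

GAMMA = 0.99  # discount factor, as in the training script below

# Illustrative minibatch: rewards, terminal flags and target-network Q-values
r_batch = np.array([-1.0, 0.5, -0.2])
t_batch = np.array([False, False, True])
target_q = np.array([[10.0], [12.0], [3.0]])  # Q'(s2, mu'(s2)) from the target critic

y_i = []
for k in range(len(r_batch)):
    if t_batch[k]:
        y_i.append(r_batch[k])                        # terminal: reward only
    else:
        y_i.append(r_batch[k] + GAMMA * target_q[k])  # bootstrap from target networks

print(np.reshape(y_i, (len(r_batch), 1)))  # approximately [[8.9], [12.38], [-0.2]]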
""" | |
Implementation of DDPG - Deep Deterministic Policy Gradient | |
Algorithm and hyperparameter details can be found here: | |
http://arxiv.org/pdf/1509.02971v2.pdf | |
The algorithm is tested on the Pendulum-v0 OpenAI gym task | |
and developed with tflearn + Tensorflow | |
Original author: Patrick Emami | |
Author: Bart Keulen | |
""" | |
import numpy as np | |
import datetime | |
import gym | |
from gym.wrappers import Monitor | |
import tensorflow as tf | |
from actor import ActorNetwork | |
from critic import CriticNetwork | |
from replaybuffer import ReplayBuffer | |
from explorationnoise import ExplorationNoise | |
# ================================ | |
# TRAINING PARAMETERS | |
# ================================ | |
# Learning rates actor and critic | |
ACTOR_LEARNING_RATE = 0.0001 | |
CRITIC_LEARNING_RATE = 0.001 | |
# Maximum number of episodes | |
MAX_EPISODES = 1000 | |
# Maximum number of steps per episode | |
MAX_STEPS_EPISODE = 500 | |
# Discount factor | |
GAMMA = 0.99 | |
# Soft target update parameter | |
TAU = 0.001 | |
# Size of replay buffer | |
BUFFER_SIZE = 1000000 | |
MINIBATCH_SIZE = 64 | |
# Exploration noise variables | |
NOISE_MEAN = 0 | |
NOISE_VAR = 1 | |
# Ornstein-Uhlenbeck variables | |
OU_THETA = 0.15 | |
OU_MU = 0. | |
OU_SIGMA = 0.3 | |
# Exploration duration | |
EXPLORATION_TIME = 200 | |
# ================================ | |
# UTILITY PARAMETERS | |
# ================================ | |
# Gym environment name | |
ENV_NAME = 'Pendulum-v0' | |
# ENV_NAME = 'MountainCarContinuous-v0' | |
# Render gym env during training | |
RENDER_ENV = False | |
# Use Gym Monitor | |
GYM_MONITOR_EN = True | |
# Upload results to openAI | |
UPLOAD_GYM_RESULTS = False | |
GYM_API_KEY = '..............' | |
# Directory for storing gym results | |
DATETIME = datetime.datetime.now().strftime('%Y%m%d%H%M%S') | |
MONITOR_DIR = './results/{}/{}/gym_ddpg'.format(ENV_NAME, DATETIME) | |
# Directory for storing tensorboard summary results | |
SUMMARY_DIR = './results/{}/{}/tf_ddpg'.format(ENV_NAME, DATETIME) | |
RANDOM_SEED = 1234 | |
# ================================ | |
# TENSORFLOW SUMMARY OPS | |
# ================================ | |
def build_summaries(): | |
episode_reward = tf.Variable(0.) | |
tf.summary.scalar('Reward', episode_reward) | |
episode_ave_max_q = tf.Variable(0.) | |
tf.summary.scalar('Qmax Value', episode_ave_max_q) | |
summary_vars = [episode_reward, episode_ave_max_q] | |
summary_ops = tf.summary.merge_all() | |
return summary_ops, summary_vars | |
# ================================ | |
# TRAIN AGENT | |
# ================================ | |
def train(sess, env, actor, critic): | |
# Set up summary ops | |
summary_ops, summary_vars = build_summaries() | |
# Initialize Tensorflow variables | |
sess.run(tf.global_variables_initializer()) | |
writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) | |
# Initialize target network weights | |
actor.update_target_network() | |
critic.update_target_network() | |
# Initialize replay memory | |
replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) | |
for i in xrange(MAX_EPISODES): | |
s = env.reset() | |
episode_reward = 0 | |
episode_ave_max_q = 0 | |
noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA, MAX_STEPS_EPISODE) | |
noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME) | |
for j in xrange(MAX_STEPS_EPISODE): | |
if RENDER_ENV: | |
env.render() | |
# Add exploratory noise according to Ornstein-Uhlenbeck process to action | |
# Decay exploration exponentially from 1 to 0 in EXPLORATION_TIME steps | |
if i < EXPLORATION_TIME: | |
a = actor.predict(np.reshape(s, (1, env.observation_space.shape[0]))) + noise[j] | |
else: | |
a = actor.predict(np.reshape(s, (1, env.observation_space.shape[0]))) | |
s2, r, terminal, info = env.step(a[0]) | |
replay_buffer.add(np.reshape(s, actor.state_dim), | |
np.reshape(a, actor.action_dim), r, terminal, | |
np.reshape(s2, actor.state_dim)) | |
# Keep adding experience to the memory until | |
# there are at least minibatch size samples | |
if replay_buffer.size() > MINIBATCH_SIZE: | |
s_batch, a_batch, r_batch, t_batch, s2_batch = \ | |
replay_buffer.sample_batch(MINIBATCH_SIZE) | |
# Calculate targets | |
target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch)) | |
y_i = [] | |
for k in xrange(MINIBATCH_SIZE): | |
# If state is terminal assign reward only | |
if t_batch[k]: | |
y_i.append(r_batch[k]) | |
# Else assgin reward + net target Q | |
else: | |
y_i.append(r_batch[k] + GAMMA * target_q[k]) | |
# Update the critic given the targets | |
predicted_q_value, _ = \ | |
critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) | |
episode_ave_max_q += np.amax(predicted_q_value) | |
# Update the actor policy using the sampled gradient | |
a_outs = actor.predict(s_batch) | |
a_grads = critic.action_gradients(s_batch, a_outs) | |
actor.train(s_batch, a_grads[0]) | |
# Update target networks | |
actor.update_target_network() | |
critic.update_target_network() | |
s = s2 | |
episode_reward += r | |
if terminal or j == MAX_STEPS_EPISODE-1: | |
summary_str = sess.run(summary_ops, feed_dict={ | |
summary_vars[0]: episode_reward, | |
summary_vars[1]: episode_ave_max_q | |
}) | |
writer.add_summary(summary_str, i) | |
writer.flush() | |
print 'Reward: %.2i' % int(episode_reward), ' | Episode', i, \ | |
'| Qmax: %.4f' % (episode_ave_max_q / float(j)) | |
break | |
# ================================ | |
# MAIN | |
# ================================ | |
def main(_): | |
with tf.Session() as sess: | |
env = gym.make(ENV_NAME) | |
# np.random.seed(RANDOM_SEED) | |
tf.set_random_seed(RANDOM_SEED) | |
env.seed(RANDOM_SEED) | |
state_dim = env.observation_space.shape[0] | |
action_dim = env.action_space.shape[0] | |
action_bound = env.action_space.high | |
# Ensure action bound is symmetric | |
assert(env.action_space.high == -env.action_space.low) | |
actor = ActorNetwork(sess, state_dim, action_dim, action_bound, | |
ACTOR_LEARNING_RATE, TAU) | |
critic = CriticNetwork(sess, state_dim, action_dim, action_bound, | |
CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars()) | |
if GYM_MONITOR_EN: | |
if not RENDER_ENV: | |
env = Monitor(env, MONITOR_DIR, video_callable=False, force=True) | |
else: | |
env = Monitor(env, MONITOR_DIR, force=True) | |
train(sess, env, actor, critic) | |
if UPLOAD_GYM_RESULTS: | |
#gym.upload(MONITOR_DIR, api_key=GYM_API_KEY) | |
if __name__ == '__main__': | |
tf.app.run() | |
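Both update_target_network calls in the training loop apply the soft update theta_target <- TAU * theta_online + (1 - TAU) * theta_target defined in the actor and critic classes. A minimal NumPy sketch with made-up weight values, just to show how slowly the target tracks the online network at TAU = 0.001:

import numpy as np

TAU = 0.001  # soft target update parameter from the script above

online_w = np.array([0.50, -0.25])  # illustrative online-network weights
target_w = np.array([0.10, 0.30])   # illustrative target-network weights

# One soft update step: the target moves a tiny fraction toward the online weights
target_w = TAU * online_w + (1. - TAU) * target_w
print(target_w)  # approximately [0.1004, 0.29945]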
""" | |
Package containing different types of exploration noise: | |
- White noise | |
- Ornstein-Uhlenbeck process | |
- Noise decay | |
Author: Bart Keulen | |
""" | |
import numpy as np | |
class ExplorationNoise(object): | |
# ================================ | |
# WHITE NOISE PROCESS | |
# ================================ | |
@staticmethod | |
def white_noise(mu, sigma, num_steps): | |
# Generate random noise with mean 0 and variance 1 | |
return np.random.normal(mu, sigma, num_steps) | |
# ================================ | |
# ORNSTEIN-UHLENBECK PROCESS | |
# ================================ | |
@staticmethod | |
def ou_noise(theta, mu, sigma, num_steps, dt=1.): | |
noise = np.zeros(num_steps) | |
# Generate random noise with mean 0 and variance 1 | |
white_noise = np.random.normal(0, 1, num_steps) | |
# Solve using Euler-Maruyama method | |
for i in xrange(1, num_steps): | |
noise[i] = noise[i - 1] + theta * (mu - noise[i - 1]) * \ | |
dt + sigma * np.sqrt(dt) * white_noise[i] | |
return noise | |
# ================================ | |
# EXPONENTIAL NOISE DECAY | |
# ================================ | |
@staticmethod | |
def exp_decay(noise, decay_end): | |
num_steps = noise.shape[0] | |
# Check if decay ends before end of noise sequence | |
assert(decay_end <= num_steps) | |
scaling = np.zeros(num_steps) | |
scaling[:decay_end] = 2. - np.exp(np.divide(np.linspace(1., decay_end, num=decay_end) * np.log(2.), decay_end)) | |
return np.multiply(noise, scaling) | |
# ================================ | |
# TANH NOISE DECAY | |
# ================================ | |
@staticmethod | |
def tanh_decay(noise, decay_start, decay_length): | |
num_steps = noise.shape[0] | |
# Check if decay ends before end of noise sequence | |
assert(decay_start + decay_length <= num_steps) | |
scaling = 0.5*(1. - np.tanh(4. / decay_length * np.subtract(np.linspace(1., num_steps, num_steps), | |
decay_start + decay_length/2.))) | |
return np.multiply(noise, scaling) |
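A quick sanity check of the exponential schedule above (values approximate): exp_decay scales the sequence from just below 1 at the first step down to 0 at decay_end, and leaves everything after decay_end at exactly zero. Applying it to a constant sequence of ones makes the scaling factors themselves visible:

import numpy as np
from explorationnoise import ExplorationNoise

ones = np.ones(500)
scaled = ExplorationNoise.exp_decay(ones, 200)

print(round(scaled[0], 3))    # ~0.997 -- scaling starts just below 1
print(round(scaled[199], 3))  # 0.0    -- scaling reaches 0 at decay_end
print(scaled[200:].any())     # False  -- zero after decay_end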
""" | |
Data structure for implementing experience replay | |
Author: Patrick Emami | |
""" | |
from collections import deque | |
import random | |
import numpy as np | |
class ReplayBuffer(object): | |
def __init__(self, buffer_size, random_seed=1234): | |
self.buffer_size = buffer_size | |
self.count = 0 | |
# Right side of deque contains newest experience | |
self.buffer = deque() | |
random.seed(random_seed) | |
def add(self, s, a, r, t, s2): | |
experience = (s, a, r, t, s2) | |
if self.count < self.buffer_size: | |
self.buffer.append(experience) | |
self.count += 1 | |
else: | |
self.buffer.popleft() | |
self.buffer.append(experience) | |
def size(self): | |
return self.count | |
def sample_batch(self, batch_size): | |
batch = [] | |
if self.count < batch_size: | |
batch = random.sample(self.buffer, self.count) | |
else: | |
batch = random.sample(self.buffer, batch_size) | |
s_batch = np.array([_[0] for _ in batch]) | |
a_batch = np.array([_[1] for _ in batch]) | |
r_batch = np.array([_[2] for _ in batch]) | |
t_batch = np.array([_[3] for _ in batch]) | |
s2_batch = np.array([_[4] for _ in batch]) | |
return s_batch, a_batch, r_batch, t_batch, s2_batch | |
def clear(self): | |
self.buffer.clear() | |
self.count = 0 |
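A minimal usage sketch of the buffer above, with made-up transitions (the real training loop stores reshaped states and actions coming from the gym environment; the dimensions below match Pendulum-v0):

import numpy as np
from replaybuffer import ReplayBuffer

buf = ReplayBuffer(buffer_size=100, random_seed=1234)

# Store a handful of fake (s, a, r, terminal, s2) transitions
for step in range(10):
    s = np.random.randn(3)   # Pendulum-v0 observations are 3-dimensional
    a = np.random.randn(1)   # ... and actions are 1-dimensional
    r = float(-step)
    s2 = np.random.randn(3)
    buf.add(s, a, r, False, s2)

s_batch, a_batch, r_batch, t_batch, s2_batch = buf.sample_batch(4)
print(s_batch.shape)  # (4, 3)
print(a_batch.shape)  # (4, 1)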