@avalcarce · Created February 24, 2017
Solving CartPole-v0 with DQN

Synopsis

This is a Deep Reinforcement Learning solution to the CartPole-v0 environment in OpenAI's Gym. The code uses TensorFlow to model the action-value function of a Reinforcement Learning agent. I have run it with TensorFlow 1.0 on Python 3.5 under Windows 7.
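
The training loop follows the standard DQN recipe: an epsilon-greedy policy over the predicted action values, an experience-replay buffer, and a target network whose parameters are copied periodically. As a quick orientation before the code, here is a minimal sketch (with illustrative names, not taken from the files below) of the Bellman target that the replay-training step computes:

import numpy as np

def dqn_targets(rewards, dones, q_next, discount=0.99):
    # r + discount * max_a Q_target(s', a), with the bootstrap term dropped
    # on terminal transitions (dones == 1).
    return rewards + (1.0 - dones.astype(float)) * discount * np.max(q_next, axis=1)

# Example with a batch of two transitions and two actions:
print(dqn_targets(np.array([1.0, 1.0]),
                  np.array([0, 1]),
                  np.array([[0.5, 2.0], [1.5, 0.3]])))  # -> [2.98, 1.0]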

Some of the hyperparameters used in the main.py script have been obtained via Bayesian optimization with Scikit-Optimize (a sketch of such a search follows the list below). The optimized hyperparameters and their values are:

  • Size of 1st fully connected layer: 208
  • Size of 2nd fully connected layer: 71
  • Learning rate: 1.09E-3
  • Period (in steps) for the update of the target network parameters as per the DQN algorithm: 800
  • Discount factor: 0.99
  • Whether to use Double DQN: False
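
The search script itself is not included in this Gist. A minimal, self-contained sketch of how such a search could be set up with Scikit-Optimize's gp_minimize is given below; the search ranges, the call budget and the stand-in objective are illustrative assumptions, not the values used for the actual optimization:

from skopt import gp_minimize
from skopt.space import Categorical, Integer, Real

search_space = [
    Integer(32, 512, name="fc1_size"),
    Integer(32, 512, name="fc2_size"),
    Real(1e-4, 1e-2, prior="log-uniform", name="learning_rate"),
    Integer(100, 2000, name="target_update_period"),
    Categorical([False, True], name="double_dqn"),
]

def objective(params):
    fc1_size, fc2_size, learning_rate, target_update_period, double_dqn = params
    # Stand-in score. In practice this would build an ExperimentsManager with the
    # sampled hyperparameters and return, e.g., the mean number of episodes needed
    # to reach the minimum average reward (second value returned by run_experiments).
    return float(fc1_size + fc2_size) * learning_rate

result = gp_minimize(objective, search_space, n_calls=20, random_state=0)
print("Best hyperparameters:", result.x, "Best score:", result.fun)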

References

  1. Deep Learning tutorial, David Silver, Google DeepMind.
  2. My code on GitHub

# agents.py
import numpy as np


class AgentEpsGreedy:
    def __init__(self, n_actions, value_function_model, eps=1., summaries_path_current=None):
        self.n_actions = n_actions
        self.value_func = value_function_model
        self.eps = eps
        self.summaries_path_current = summaries_path_current
        self.current_value = None  # Current value of the value function (i.e. expected discounted return)

    def act(self, state):
        # Epsilon-greedy policy: mostly the greedy action, occasionally a uniformly random one.
        action_values = self.value_func.predict([state])[0]
        policy = np.ones(self.n_actions) * self.eps / self.n_actions
        a_max = np.argmax(action_values)
        policy[a_max] += 1. - self.eps
        a = np.random.choice(self.n_actions, p=policy)
        self.current_value = action_values[a]
        return a

    def train(self, states, targets):
        return self.value_func.train(states, targets)

    def predict_q_values(self, states, use_old_params=False):
        return self.value_func.predict(states, use_old_params)

# gymhelpers.py
import os
import copy
import time
from sys import platform
from textwrap import wrap

import numpy as np
import gym
from gym import wrappers

if platform == "linux" or platform == "linux2":
    import matplotlib
    matplotlib.use('Agg')  # This is to generate images without having a window appear.
import matplotlib.pyplot as plt

# os and numpy are imported explicitly above; they also come in via the utils wildcard import.
from .utils import *
from .agents import AgentEpsGreedy
from .valuefunctions import ValueFunctionDQN
from .ReplayMemory import ReplayMemory


class ExperimentsManager:
    def __init__(self, env_name, agent_value_function_hidden_layers_size, results_dir_prefix=None, summaries_path=None,
                 figures_dir=None, discount=0.99, decay_eps=0.995, eps_min=0.0001, learning_rate=1E-4, decay_lr=False,
                 max_step=10000, replay_memory_max_size=100000, ep_verbose=False, exp_verbose=True, batch_size=64,
                 upload_last_exp=False, double_dqn=False, target_params_update_period_steps=1, gym_api_key="",
                 gym_algorithm_id=None, checkpoints_dir=None, min_avg_rwd=-110):
        self.env_name = env_name
        self.results_dir_prefix = results_dir_prefix
        self.gym_stats_dir = None
        self.summaries_path = summaries_path
        self.summaries_path_current = summaries_path
        self.figures_dir = figures_dir
        self.discount = discount
        self.decay_eps = decay_eps
        self.eps_min = eps_min
        self.learning_rate = learning_rate
        self.decay_lr = decay_lr
        self.max_step = max_step
        self.replay_memory_max_size = replay_memory_max_size
        self.ep_verbose = ep_verbose  # Whether or not to print progress during episodes
        self.exp_verbose = exp_verbose  # Whether or not to print progress during experiments
        self.upload_last_exp = upload_last_exp
        assert target_params_update_period_steps > 0, "The period for updating the target parameters must be positive."
        self.target_params_update_period_steps = target_params_update_period_steps
        self.gym_api_key = gym_api_key
        self.gym_algorithm_id = gym_algorithm_id
        self.checkpoints_dir = checkpoints_dir
        self.checkpoints_dir_current = checkpoints_dir

        self.agent = None
        self.memory = None  # Experience replay memory
        self.batch_size = batch_size
        self.agent_value_function_hidden_layers_size = agent_value_function_hidden_layers_size
        self.double_dqn = double_dqn

        self.global_step = 0  # Current step over all episodes
        self.step = 0  # Current step per episode
        self.ep = 0
        self.exp = 0
        self.step_durations_s = np.zeros(shape=self.max_step, dtype=float)

        self.min_avg_rwd = min_avg_rwd  # Minimum average reward to consider the problem as solved
        self.n_avg_ep = 100  # Number of consecutive episodes to calculate the average reward

        self.conf_msg = "\nEXECUTING EXPERIMENT {} OF {} IN ENVIRONMENT {}."
        self.episode_progress_msg = "Step {:5d}/{:5d}. Avg step duration: {:3.6f} ms." + \
                                    " Loss = {:3.2e}."
        self.exp_progress_msg = "Exp {:3d}. Ep {:5d}, Rwd={:4.0f} (mean={:4.0f} over {:3d} episodes)." + \
                                " {} exceeded in {:4d} eps. Loss={:1.2e} (avg={:1.2e}). Agent epsilon={:3.2f} %." + \
                                " Average step duration: {:2.6f} ms."
        self.exps_conf_str = ""

        # Memory pre-allocation
        self.Rwd_per_ep_v = np.zeros((1, 5000))
        self.Loss_per_ep_v = np.zeros((1, 5000))
        self.Avg_Rwd_per_ep = np.zeros((1, 5000))
        self.Avg_Loss_per_ep = np.zeros((1, 5000))
        self.n_eps_to_reach_min_avg_rwd = np.zeros(1, dtype=float)
        self.Agent_Epsilon_per_ep = np.zeros((1, 5000))
        self.agent_value_function = np.zeros((1, 1, self.max_step))
        self.rwd_exps_avg = np.mean(self.Rwd_per_ep_v, axis=0)  # Rwd averaged over all experiments
        self.rwd_exps_avg_ma = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_movstd = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile5 = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile95 = np.zeros(self.rwd_exps_avg.shape[0])
    def __print_episode_progress(self, loss_v):
        if self.ep_verbose:
            if self.step > 0 and (self.step+1) % 20 == 0:
                print(self.episode_progress_msg.format(self.step, self.max_step,
                                                       np.mean(self.step_durations_s[self.ep, 0:self.step]) * 1000,
                                                       loss_v))

    def __double_dqn_train(self):
        # DQN Experience Replay
        loss_v = 0
        if len(self.memory.memory) > self.batch_size:
            # Extract a batch of random transitions from the replay memory
            states_b, actions_b, rewards_b, states_n_b, done_b = zip(*self.memory.sample(self.batch_size))
            states_b = np.array(states_b)
            actions_b = np.array(actions_b)
            rewards_b = np.array(rewards_b)
            states_n_b = np.array(states_n_b)
            done_b = np.array(done_b).astype(int)

            q_n_b = self.agent.predict_q_values(states_n_b)  # Action values on the arriving state
            best_a = np.argmax(q_n_b, axis=1)
            q_n_target_b = self.agent.predict_q_values(states_n_b, use_old_params=True)
            targets_b = rewards_b + (1. - done_b) * self.discount * q_n_target_b[np.arange(self.batch_size), best_a]

            targets = self.agent.predict_q_values(states_b)
            for j, action in enumerate(actions_b):
                targets[j, action] = targets_b[j]
            loss_v = self.agent.train(states_b, targets)
        return loss_v
    def __train_on_experience(self):
        # DQN Experience Replay
        loss_v = 0
        if len(self.memory.memory) > self.batch_size:
            # Extract a batch of random transitions from the replay memory
            states_b, actions_b, rewards_b, states_n_b, done_b = zip(*self.memory.sample(self.batch_size))
            states_b = np.array(states_b)
            actions_b = np.array(actions_b)
            rewards_b = np.array(rewards_b)
            states_n_b = np.array(states_n_b)
            done_b = np.array(done_b).astype(int)

            if self.target_params_update_period_steps == 1:  # This is to avoid having to copy the old params every step
                q_n_b = self.agent.predict_q_values(states_n_b)  # Action values on the next state
            else:
                q_n_b = self.agent.predict_q_values(states_n_b, use_old_params=True)  # Action values on the next state
            targets_b = rewards_b + (1. - done_b) * self.discount * np.amax(q_n_b, axis=1)

            targets = self.agent.predict_q_values(states_b)
            for j, action in enumerate(actions_b):
                targets[j, action] = targets_b[j]
            loss_v = self.agent.train(states_b, targets)
        return loss_v

    def __print_experiment_progress(self):
        if self.exp_verbose:
            rwd = self.Rwd_per_ep_v[self.exp, self.ep]
            avg_rwd = self.Avg_Rwd_per_ep[self.exp, self.ep]
            loss = self.Loss_per_ep_v[self.exp, self.ep]
            avg_loss = self.Avg_Loss_per_ep[self.exp, self.ep]

            avg_rwds = self.Avg_Rwd_per_ep[self.exp, 0:self.ep+1]
            i_last_low_rwd = np.max(np.where(avg_rwds < self.min_avg_rwd))
            n_solved_eps = self.ep - i_last_low_rwd

            duration_ms = 0
            if self.ep > 0:
                duration_ms = np.mean(self.step_durations_s[0:self.ep, :]) * 1000
            print(
                self.exp_progress_msg.format(self.exp, self.ep, rwd, avg_rwd, self.n_avg_ep, self.min_avg_rwd,
                                             n_solved_eps, loss, avg_loss, self.agent.eps*100, duration_ms))
    def run_episode(self, env, train=True):
        state = env.reset()
        done = False
        total_reward = 0
        loss_v = 0
        for self.step in range(self.max_step):
            # Maybe update the target estimator
            if self.target_params_update_period_steps > 1:
                if self.global_step % self.target_params_update_period_steps == 0:
                    self.agent.value_func.update_old_params()
                    if self.ep_verbose:
                        print("Copied model parameters to target network.")
            t = time.time()

            self.__print_episode_progress(loss_v)

            if done:
                break
            action = self.agent.act(state)
            self.agent_value_function[self.exp, self.ep, self.step] = self.agent.current_value
            self.global_step += 1
            state_next, reward, done, info = env.step(action)
            total_reward += reward
            if self.memory is not None:
                self.memory.add((state, action, reward, state_next, done))
                if train:
                    if self.double_dqn:
                        loss_v = self.__double_dqn_train()
                    else:
                        loss_v = self.__train_on_experience()
            else:
                raise NotImplementedError("Please provide an Experience Replay memory")
            state = copy.copy(state_next)
            self.step_durations_s[self.ep, self.step] = time.time() - t  # Time elapsed during this step
        return loss_v, total_reward
    def run_experiment(self, env, n_ep, stop_training_min_avg_rwd=None):
        self.global_step = 0
        train = True
        # One experiment is composed of n_ep sequential episodes
        for self.ep in range(n_ep):
            loss_v, total_reward = self.run_episode(env, train)

            # Collect episode results
            self.Rwd_per_ep_v[self.exp, self.ep] = total_reward
            self.Loss_per_ep_v[self.exp, self.ep] = loss_v

            # Calculate episode statistics
            last_rwds = self.Rwd_per_ep_v[self.exp, np.maximum(self.ep - (self.n_avg_ep - 1), 0):self.ep+1]
            last_losses = self.Loss_per_ep_v[self.exp, np.maximum(self.ep - (self.n_avg_ep - 1), 0):self.ep+1]
            self.Avg_Rwd_per_ep[self.exp, self.ep] = np.mean(last_rwds)
            self.Avg_Loss_per_ep[self.exp, self.ep] = np.mean(last_losses)
            self.Agent_Epsilon_per_ep[self.exp, self.ep] = self.agent.eps

            if stop_training_min_avg_rwd is not None:
                if self.Avg_Rwd_per_ep[self.exp, self.ep] >= stop_training_min_avg_rwd:
                    train = False
                    print("Minimum average reward reached. Stopping training.")

            if self.Avg_Rwd_per_ep[self.exp, self.ep] >= self.min_avg_rwd:
                self.n_eps_to_reach_min_avg_rwd[self.exp] = np.minimum(self.ep,
                                                                       self.n_eps_to_reach_min_avg_rwd[self.exp])

            if self.agent.eps > self.eps_min:
                self.agent.eps *= self.decay_eps

            self.__print_experiment_progress()
    def __create_gym_stats_directory(self, env):
        if self.results_dir_prefix is None:
            raise ValueError("A prefix for the Gym results directory must be provided.")
        if not os.path.exists(self.results_dir_prefix):
            os.makedirs(self.results_dir_prefix)
        t = get_last_folder_id(self.results_dir_prefix) + 1  # Calculate next test id
        self.gym_stats_dir = os.path.join(self.results_dir_prefix, str(t).zfill(4))
        if not os.path.exists(self.gym_stats_dir):
            os.makedirs(self.gym_stats_dir)
        else:
            raise FileExistsError(self.gym_stats_dir)
        return wrappers.Monitor(env, self.gym_stats_dir)

    def __build_experiments_conf_str(self, n_exps, n_ep, n_actions, state_dim):
        layers_size = str(state_dim)
        for s in self.agent_value_function_hidden_layers_size:
            layers_size += "-" + str(s)
        layers_size += "-" + str(n_actions)

        exp_conf_str = "{}_{}_Disc{:1.3e}_DecE{:1.2e}_EMin{:1.2e}_LR{:1.2e}_DecLR{}_MaxStp{}_" +\
                       "DDQN{}_RepMm{:1.1e}_BS{}_NEx{}_NEp{}_PmsUp{}"
        self.exps_conf_str = exp_conf_str.format(time.strftime("%Y_%m_%d__%H_%M_%S"), layers_size, self.discount,
                                                 self.decay_eps, self.eps_min, self.learning_rate,
                                                 1 if self.decay_lr else 0, self.max_step, 1 if self.double_dqn else 0,
                                                 self.replay_memory_max_size, self.batch_size, n_exps, n_ep,
                                                 self.target_params_update_period_steps)

    def __create_figures_directory(self):
        if self.figures_dir is not None:
            self.figures_dir = os.path.join(self.figures_dir, self.env_name, self.exps_conf_str)
            if not os.path.exists(self.figures_dir):
                os.makedirs(self.figures_dir)
            else:
                for dirpath, dirnames, files in os.walk(self.figures_dir):
                    if files:
                        raise FileExistsError("The figures directory exists and has files: {}".format(self.figures_dir))
                    else:
                        break
    def run_experiments(self, n_exps, n_ep, stop_training_min_avg_rwd=None, plot_results=True):
        self.Rwd_per_ep_v = np.zeros((n_exps, n_ep))
        self.Loss_per_ep_v = np.zeros((n_exps, n_ep))
        self.Avg_Rwd_per_ep = np.zeros((n_exps, n_ep))
        self.n_eps_to_reach_min_avg_rwd = np.zeros(n_exps, dtype=float)
        self.n_eps_to_reach_min_avg_rwd.fill(n_ep)
        self.Avg_Loss_per_ep = np.zeros((n_exps, n_ep))
        self.Agent_Epsilon_per_ep = np.zeros((n_exps, n_ep))
        self.agent_value_function = np.zeros((n_exps, n_ep, self.max_step))
        self.step_durations_s = np.zeros(shape=(n_ep, self.max_step), dtype=float)

        # Create environment
        env = gym.make(self.env_name)
        n_actions = env.action_space.n
        state_dim = env.observation_space.high.shape[0]

        self.__build_experiments_conf_str(n_exps, n_ep, n_actions, state_dim)
        self.__create_figures_directory()

        for self.exp in range(n_exps):
            print(self.conf_msg.format(self.exp, n_exps, self.env_name))
            print(self.exps_conf_str)

            env = gym.make(self.env_name)  # Create new environment
            assert n_actions == env.action_space.n
            assert state_dim == env.observation_space.high.shape[0]
            if self.upload_last_exp and self.exp == n_exps-1:
                env = self.__create_gym_stats_directory(env)

            if self.summaries_path is not None:
                self.summaries_path_current = os.path.join(self.summaries_path,
                                                           self.env_name,
                                                           self.exps_conf_str + "_Exp" + str(self.exp))
            if self.checkpoints_dir is not None:
                self.checkpoints_dir_current = os.path.join(self.checkpoints_dir,
                                                            self.env_name,
                                                            self.exps_conf_str + "_Exp" + str(self.exp))
                if not os.path.exists(self.checkpoints_dir_current):
                    os.makedirs(self.checkpoints_dir_current)

            # Create agent
            value_function = ValueFunctionDQN(scope="q", state_dim=state_dim, n_actions=n_actions,
                                              train_batch_size=self.batch_size, learning_rate=self.learning_rate,
                                              hidden_layers_size=self.agent_value_function_hidden_layers_size,
                                              decay_lr=self.decay_lr, huber_loss=False,
                                              summaries_path=self.summaries_path_current,
                                              reset_default_graph=True,
                                              checkpoints_dir=self.checkpoints_dir_current)
            self.agent = AgentEpsGreedy(n_actions=n_actions, value_function_model=value_function, eps=0.9,
                                        summaries_path_current=self.summaries_path_current)
            self.memory = ReplayMemory(max_size=self.replay_memory_max_size)

            self.run_experiment(env, n_ep, stop_training_min_avg_rwd)  # This is where the action happens

            value_function.close_summary_file()
            env.close()
            if self.upload_last_exp and self.exp == n_exps - 1:
                print("Trying to upload results to the scoreboard.")
                gym.upload(self.gym_stats_dir, api_key=self.gym_api_key, algorithm_id=self.gym_algorithm_id)

            # Plot results
            self.plot_rwd_loss()
            self.plot_value_function()

        self.print_experiment_summary()

        self.calculate_avg_rwd()
        self.plot_rwd_averages(n_exps)
        if plot_results:
            plt.show()

        # Return the final Rwd averaged over all experiments AND the mean number of episodes needed to reach the min Rwd
        return self.rwd_exps_avg_ma[-1], np.mean(self.n_eps_to_reach_min_avg_rwd)
    def print_experiment_summary(self):
        duration_ms = np.mean(self.step_durations_s) * 1000
        print("Average step duration: {:2.6f} ms".format(duration_ms))

    def calculate_avg_rwd(self):
        self.rwd_exps_avg = np.mean(self.Rwd_per_ep_v, axis=0)  # Rwd averaged over all experiments
        self.rwd_exps_avg_ma = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_movstd = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile5 = np.zeros(self.rwd_exps_avg.shape[0])
        self.rwd_exps_avg_percentile95 = np.zeros(self.rwd_exps_avg.shape[0])
        for s in range(self.rwd_exps_avg.shape[0]):
            self.rwd_exps_avg_ma[s] = np.mean(self.rwd_exps_avg[max(0, s - 99):s + 1])
            self.rwd_exps_avg_movstd[s] = np.std(self.rwd_exps_avg[max(0, s - 99):s + 1])
            self.rwd_exps_avg_percentile5[s] = np.percentile(self.rwd_exps_avg[max(0, s - 99):s + 1], 5)
            self.rwd_exps_avg_percentile95[s] = np.percentile(self.rwd_exps_avg[max(0, s - 99):s + 1], 95)
    def plot_rwd_averages(self, n_exps):
        n_ep = self.Rwd_per_ep_v.shape[1]
        eps = range(n_ep)

        if self.figures_dir is not None:
            # PLOT ALL EXPERIMENTS
            fig = plt.figure()
            for i in range(n_exps):
                plt.plot(eps, self.Avg_Rwd_per_ep[i, :], label="Exp {}".format(i))
            # plt.ylim([-self.max_step - 10, -70])
            plt.xlabel("Episode number")
            plt.ylabel("Reward")
            plt.grid(True)
            plt.legend(loc='upper left')
            ttl = "Average reward. " + self.exps_conf_str
            plt.title("\n".join(wrap(ttl, 60)))
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "RwdsComparisonsAcrossExps.png")
                plt.savefig(fig_savepath)
            plt.close(fig)

            # PLOT AVERAGE OVER ALL EXPERIMENTS
            fig = plt.figure()
            plt.subplot(211)
            plt.plot(eps, self.rwd_exps_avg, label="Average over {:3d} experiments".format(n_exps))
            # plt.ylim([-self.max_step - 10, -70])
            plt.ylabel("Reward per episode")
            plt.grid(True)
            plt.plot(eps, self.rwd_exps_avg_percentile95, label="95th percentile over 100 episodes")
            plt.plot(eps, self.rwd_exps_avg_ma, label="100-episode moving average")
            plt.plot(eps, self.rwd_exps_avg_percentile5, label="5th percentile over 100 episodes")
            plt.legend(loc='lower right')
            print("Average final reward: {:3.2f} (std={:3.2f}).\n".format(self.rwd_exps_avg_ma[-1],
                                                                          self.rwd_exps_avg_movstd[-1]))
            plt.title("Final average reward: {:3.2f} (std={:3.2f})".format(self.rwd_exps_avg_ma[-1],
                                                                           self.rwd_exps_avg_movstd[-1]))

            loss_exps_avg = np.mean(self.Loss_per_ep_v, axis=0)
            plt.subplot(212)
            plt.semilogy(eps, loss_exps_avg, label="Average over {:3d} experiments".format(n_exps))
            plt.xlabel("Episode number")
            plt.ylabel("Loss per episode")
            plt.grid(True)
            loss_exps_avg_ma = np.zeros(loss_exps_avg.shape[0])
            for s in range(loss_exps_avg.shape[0]):
                loss_exps_avg_ma[s] = np.mean(loss_exps_avg[max(0, s - 100):s + 1])
            plt.plot(eps, loss_exps_avg_ma, label="100-episode moving average")
            plt.legend(loc='lower right')

            plt.suptitle("\n".join(wrap(self.exps_conf_str, 60)))
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "ExpsAverage.png")
                plt.savefig(fig_savepath)
            plt.close(fig)
    def plot_value_function(self):
        if self.figures_dir is not None:
            n_ep = self.Rwd_per_ep_v.shape[1]
            fig = plt.figure()
            for ep in draw_equispaced_items_from_sequence(7, n_ep):
                plt.plot(self.agent_value_function[self.exp, ep, :], label="Episode {:4d}".format(ep))
            plt.xlabel("Steps")
            plt.ylabel("Value")
            plt.grid(True)
            plt.legend(loc='lower right')
            plt.title("Value functions for experiment {:2d}".format(self.exp))
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "Experiment{}_ValueFunctions.png".format(self.exp))
                plt.savefig(fig_savepath)
            plt.close(fig)

    def plot_rwd_loss(self):
        if self.figures_dir is not None:
            n_ep = self.Rwd_per_ep_v.shape[1]
            eps = range(n_ep)
            fig = plt.figure()
            ax1 = plt.subplot(211)
            plt.plot(eps, self.Rwd_per_ep_v[self.exp, :], label="Instantaneous")
            plt.plot(eps, self.Avg_Rwd_per_ep[self.exp, :], label="Mean over {} eps".format(self.n_avg_ep))
            # plt.ylim([-self.max_step - 10, -70])
            plt.xlabel("Episode number")
            plt.ylabel("Reward per episode")
            ax2 = ax1.twinx()
            plt.plot(eps, self.Agent_Epsilon_per_ep[self.exp, :], label="Agent epsilon", color='r')
            ax2.set_ylabel(r'Agent $\varepsilon$', color='r')
            ax2.tick_params('y', colors='r')
            plt.grid(True)
            ttl = "Final average reward: {:3.2f} (SD={:3.2f})"
            plt.title(ttl.format(self.Avg_Rwd_per_ep[self.exp, -1], np.std(self.Rwd_per_ep_v[self.exp, n_ep-100:n_ep-1])))
            plt.legend(loc='lower right')

            rwd_per_ep_exp_avg = np.mean(self.Rwd_per_ep_v[0:self.exp+1, n_ep-100:n_ep-1], axis=1)
            print("Final mean reward, averaged over {} experiment{}: {} (std = {}).".format(self.exp+1,
                                                                                            's' if self.exp > 0 else '',
                                                                                            np.mean(rwd_per_ep_exp_avg),
                                                                                            np.std(rwd_per_ep_exp_avg)))

            plt.subplot(212)
            plt.semilogy(eps, self.Loss_per_ep_v[self.exp, :], label="Instantaneous")
            plt.semilogy(eps, self.Avg_Loss_per_ep[self.exp, :], label="Mean over {} eps".format(self.n_avg_ep))
            plt.xlabel("Episode number")
            plt.ylabel("Loss per episode")
            plt.grid(True)
            plt.title("Value function loss")
            plt.legend(loc='lower right')

            sttl = self.exps_conf_str + ". Experiment {}".format(self.exp)
            plt.suptitle("\n".join(wrap(sttl, 60)))
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            if self.figures_dir is not None:
                fig_savepath = os.path.join(self.figures_dir, "Experiment{}_Rwd_Loss.png".format(self.exp))
                plt.savefig(fig_savepath)
            plt.close(fig)

# main.py
import os

from openai_playground.gymhelpers import ExperimentsManager

env_name = "CartPole-v0"
gym_stats_dir_prefix = os.path.join('Gym_stats', env_name)
figures_dir = 'Figures'
api_key = '###'
alg_id = '###'

n_ep = 1500
n_exps = 1
hidden_layers_size = [208, 771]

expsman = ExperimentsManager(env_name=env_name, results_dir_prefix=gym_stats_dir_prefix,
                             agent_value_function_hidden_layers_size=hidden_layers_size, figures_dir=figures_dir,
                             discount=0.99, decay_eps=0.99, eps_min=7.40e-05,
                             learning_rate=1.09e-03, decay_lr=False, max_step=200, replay_memory_max_size=100000,
                             ep_verbose=False, exp_verbose=True, batch_size=64, upload_last_exp=False, double_dqn=False,
                             target_params_update_period_steps=800, gym_api_key=api_key, gym_algorithm_id=alg_id,
                             min_avg_rwd=195)
expsman.run_experiments(n_exps=n_exps, n_ep=n_ep, stop_training_min_avg_rwd=215, plot_results=False)

input("Press Enter to terminate.")

# ReplayMemory.py
from collections import deque

import numpy as np


class ReplayMemory:
    def __init__(self, max_size=128):
        self.memory = deque(maxlen=max_size)

    def sample(self, batch_size):
        # Sample (with replacement) a batch of stored transitions.
        batch_size = min(len(self.memory), batch_size)
        idxs = np.random.choice(len(self.memory), batch_size)
        return [self.memory[idx] for idx in idxs]

    def add(self, item):
        self.memory.append(item)

# utils.py
import os

import numpy as np


def get_last_folder_id(folder_path):
    t = 0
    for fn in os.listdir(folder_path):
        t = max(t, int(fn))
    return t


def movingaverage(values, window):
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(values, weights, 'valid')
    return sma


def draw_equispaced_items_from_sequence(m, n):
    """
    draw_equispaced_items_from_sequence(m, n)
    Args:
        m (int): How many items to draw.
        n (int): Length of sequence to draw from.
    """
    return [i * n // m + n // (2 * m) for i in range(m)]

# valuefunctions.py
import os

import tensorflow as tf


class ValueFunctionDQN:
    def __init__(self, scope="MyValueFunctionEstimator", state_dim=2, n_actions=3, train_batch_size=64,
                 learning_rate=1e-4, hidden_layers_size=None, decay_lr=False, huber_loss=False, summaries_path=None,
                 reset_default_graph=False, checkpoints_dir=None):
        # Input check
        if hidden_layers_size is None:
            hidden_layers_size = [128, 64]  # Default ANN architecture
        assert len(hidden_layers_size) >= 1, "At least one hidden layer must be specified."

        # Support variables
        self.scope = scope
        self.layers_size = [state_dim] + hidden_layers_size + [n_actions]  # Size of all layers (including in & out)
        self.weights = []
        self.biases = []
        self.weights_old = []
        self.biases_old = []
        self.learning_rate = learning_rate
        self.train_batch_size = train_batch_size
        self.n_train_epochs = 0
        self.summaries_path = summaries_path
        self.train_writer = None
        self.checkpoints_dir = checkpoints_dir

        if reset_default_graph:
            tf.reset_default_graph()

        # Build Tensorflow graph
        with tf.variable_scope(self.scope):
            # Inputs, weights, biases and targets of the ANN
            self.x = tf.placeholder(tf.float32, shape=(None, state_dim), name="x")
            self.train_targets = tf.placeholder(tf.float32, shape=(None, n_actions), name="train_targets")
            for l in range(len(self.layers_size) - 1):
                self.weights.append(tf.get_variable(name="w" + str(l), shape=[self.layers_size[l],
                                                                              self.layers_size[l + 1]],
                                                    initializer=tf.contrib.layers.xavier_initializer()))
                self.biases.append(tf.get_variable(name="b" + str(l), shape=[self.layers_size[l + 1]],
                                                   initializer=tf.constant_initializer(0.0)))
                self.weights_old.append(tf.get_variable(name="w-" + str(l), shape=[self.layers_size[l],
                                                                                   self.layers_size[l + 1]],
                                                        initializer=tf.contrib.layers.xavier_initializer()))
                self.biases_old.append(tf.get_variable(name="b-" + str(l), shape=[self.layers_size[l + 1]],
                                                       initializer=tf.constant_initializer(0.0)))
            if summaries_path is not None:
                with tf.name_scope('params_summaries'):
                    for l in range(len(self.layers_size) - 1):
                        self.variable_summaries(self.weights[l], "w" + str(l), histogram=True)
                        self.variable_summaries(self.biases[l], "b" + str(l), histogram=True)

            # Interconnection of the various ANN nodes
            self.prediction = self.model(self.x)
            self.prediction_with_old_params = self.model(self.x, use_old_params=True)

            # Training calculations
            if huber_loss:
                self.loss = self.huber_loss(self.train_targets, self.prediction)
            else:
                self.SE = tf.squared_difference(self.train_targets, self.prediction, name="SquaredError")
                self.loss = tf.reduce_mean(self.SE, name="loss")

            self.global_step = tf.Variable(0, trainable=False)
            if decay_lr:
                self.learning_rate = tf.train.exponential_decay(1e-4, self.global_step, 3000 * 200, 1e-5 / 1e-4)

            self.opt_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            self.train_op = self.opt_op.minimize(self.loss, global_step=self.global_step)
            self.init_op = tf.global_variables_initializer()

            if self.summaries_path is not None:
                self.variable_summaries(self.loss, "loss", scalar_only=True)
                self.variable_summaries(self.learning_rate, "learning_rate", scalar_only=True)

            if self.checkpoints_dir is not None:
                var_list = []
                for l in range(len(self.layers_size) - 1):
                    var_list.append(self.weights[l])
                    var_list.append(self.biases[l])
                self.saver = tf.train.Saver(var_list, pad_step_number=True)

        if self.summaries_path is not None:
            self.merged_summaries = tf.summary.merge_all()
            self.summaries_path += "_{}".format(self.scope)
            if not os.path.exists(self.summaries_path):
                os.makedirs(self.summaries_path)
            self.train_writer = tf.summary.FileWriter(self.summaries_path, graph=tf.get_default_graph())
        else:
            self.merged_summaries = None

        self.session = None
    def model(self, x, use_old_params=False):
        z = []
        hidden = [x]
        for l in range(len(self.layers_size) - 2):
            if use_old_params:
                z.append(tf.matmul(hidden[l], self.weights_old[l]) + self.biases_old[l])
            else:
                z.append(tf.matmul(hidden[l], self.weights[l]) + self.biases[l])
            hidden.append(tf.nn.relu(z[l], name="hidden_" + str(l + 1)))
        if use_old_params:
            z.append(tf.matmul(hidden[-1], self.weights_old[-1]) + self.biases_old[-1])
        else:
            z.append(tf.matmul(hidden[-1], self.weights[-1]) + self.biases[-1])

        if not use_old_params:
            if self.summaries_path is not None:
                with tf.name_scope('layers_summaries'):
                    for l in range(len(self.layers_size) - 1):
                        self.variable_summaries(z[l], "z" + str(l))
                        self.variable_summaries(hidden[l], "hidden" + str(l))
        return z[-1]  # Output layer has Identity units.

    @staticmethod
    def huber_loss(targets, predictions):
        error = targets - predictions
        fn_choice_maker1 = (tf.to_int32(tf.sign(error + 1)) + 1) / 2
        fn_choice_maker2 = (tf.to_int32(tf.sign(-error + 1)) + 1) / 2
        choice_maker_sqr = tf.to_float(tf.multiply(fn_choice_maker1, fn_choice_maker2))
        sqr_contrib = tf.multiply(choice_maker_sqr, tf.square(error) * 0.5)
        abs_contrib = tf.abs(error) - 0.5 - tf.multiply(choice_maker_sqr, tf.abs(error) - 0.5)
        loss = tf.reduce_mean(sqr_contrib + abs_contrib)
        return loss
    def init_tf_session(self):
        if self.session is None:
            self.session = tf.Session()
            self.session.run(self.init_op)  # Global Variables Initializer (init op)

    def predict(self, states, use_old_params=False):
        self.init_tf_session()  # Make sure the Tensorflow session exists
        feed_dict = {self.x: states}
        if use_old_params:
            q = self.session.run(self.prediction_with_old_params, feed_dict=feed_dict)
        else:
            q = self.session.run(self.prediction, feed_dict=feed_dict)
        return q

    def train(self, states, targets):
        self.init_tf_session()  # Make sure the Tensorflow session exists
        feed_dict = {self.x: states, self.train_targets: targets}
        if self.summaries_path is not None and self.n_train_epochs % 2000 == 0:
            fetches = [self.loss, self.train_op, self.merged_summaries]
        else:
            fetches = [self.loss, self.train_op]
        values = self.session.run(fetches, feed_dict=feed_dict)
        if self.summaries_path is not None and self.n_train_epochs % 2000 == 0:
            self.train_writer.add_summary(values[2], global_step=self.n_train_epochs)
        if self.checkpoints_dir is not None and self.n_train_epochs % 40000 == 0:
            self.saver.save(self.session, self.checkpoints_dir, global_step=self.global_step)
        self.n_train_epochs += 1
        return values[0]
    @staticmethod
    def variable_summaries(var, name, histogram=False, scalar_only=False):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        if scalar_only:
            tf.summary.scalar(name, var)
        else:
            mean = tf.reduce_mean(var)
            tf.summary.scalar(name + '_mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar(name + '_stddev', stddev)
            tf.summary.scalar(name + '_max', tf.reduce_max(var))
            tf.summary.scalar(name + '_min', tf.reduce_min(var))
            if histogram:
                tf.summary.histogram(name + '_histogram', var)

    def update_old_params(self):
        self.init_tf_session()  # Make sure the Tensorflow session exists
        update_ops = []
        for l in range(len(self.layers_size) - 1):
            update_ops.append(self.weights_old[l].assign(self.weights[l]))
            update_ops.append(self.biases_old[l].assign(self.biases[l]))
        self.session.run(update_ops)

    def close_summary_file(self):
        if self.summaries_path is not None:
            self.train_writer.close()