# MountainCar-v0 Double Deep Q Net
# Two Q nets are used. Each uses itself to estimate Q(t,a) but uses the other net to estimate argmax Q(t+1,a).
# A one-step target is still used when training the Q nets (a sketch of the Double DQN target follows the imports below).
# Based on arXiv:1509.06461 [cs.LG]
# https://lirnli.wordpress.com/2017/08/17/debugging-reinforcement-neural-network-deep-q-net/
#
# Hyperparameter summary:
# reward decay rate = 0.999
# replay memory (with 10000 observations)
# AdamOptimizer, with learning rate decay
# tanh activation function
# input(2) -> layer1(64) -> layer2(128) -> output(3)
# l2_regularization
#
# Some observations:
# Use a large reward decay rate so that final-state rewards can propagate back, in theory. E.g. 0.999^200 ≈ 0.82.
# The net needs to be much larger than I thought at first.
# Learning rate decay helps finalize the net. (It feels kind of like cheating, but it is still better than early stopping.) This is
# because the final training stage is very sensitive, and performance can drop dramatically within a few iterations.
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import gym
from gym import wrappers
import random as rnd
from collections import deque
from IPython import display
from datetime import datetime
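# A minimal sketch of the Double DQN target from arXiv:1509.06461: pick argmax_a from the online net's
# next-state values, but evaluate that action with the other net. This helper is only illustrative and is
# not called by the training loop below, which uses a simpler variant that takes np.amax of the other
# net's values directly.
def double_dqn_target(reward, done, q_online_next, q_other_next, gamma=0.999):
    """One-step Double DQN target for a single transition (illustrative helper)."""
    best_action = np.argmax(q_online_next)  # action selected by the online net
    return reward + gamma * q_other_next[best_action] * (1.0 - done)  # value estimated by the other net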
GYM_NAME = 'MountainCar-v0'
# GYM_NAME = 'CartPole-v0'
env = gym.make(GYM_NAME)
n_obs, = env.observation_space.shape
n_action = env.action_space.n
env.close()
### Hyperparameters
tf.reset_default_graph()
start_learning_rate = 0.001
gamma = 0.999 # decay_rate
momentum = 0.9
memory_cap = 10000
max_episode = 3000
batch_size = 256
memory_warmup = 2*batch_size
# merge_net_freq = 4000
reg_scale = 0.01
save_path = './Double_DQN.ckpt'
he_init = tf.contrib.layers.variance_scaling_initializer()
xavier_init = tf.contrib.layers.xavier_initializer()
l1_reg = tf.contrib.layers.l1_regularizer(reg_scale)
l2_reg = tf.contrib.layers.l2_regularizer(reg_scale)
### Set up net and cost function
training = tf.placeholder_with_default(False, shape=())
def create_q_model(X, name='None'):
    with tf.variable_scope(name) as scope:
        hid1 = tf.layers.dense(X,64,activation=tf.nn.tanh,kernel_initializer=he_init,kernel_regularizer=l2_reg)
        # dropout1 = tf.layers.dropout(hid1,rate=0.5,training=training)
        hid2 = tf.layers.dense(hid1,128,activation=tf.nn.tanh,kernel_initializer=he_init,kernel_regularizer=l2_reg)
        # dropout2 = tf.layers.dropout(hid2,rate=0.5,training=training)
        # hid3 = tf.layers.dense(hid2,128,activation=tf.nn.tanh,kernel_initializer=he_init)#,kernel_regularizer=l2_reg)
        q = tf.layers.dense(hid2, n_action, kernel_initializer=he_init)
        q_vars = {var.name[len(scope.name):]: var
                  for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=scope.name)}
    return q, q_vars
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(start_learning_rate, global_step,350000,0.01)
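# With these arguments, tf.train.exponential_decay yields
#   learning_rate = 0.001 * 0.01**(global_step / 350000),
# i.e. roughly 1e-4 after 175k training steps and 1e-5 after 350k.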
X = tf.placeholder(tf.float32, shape=[None,n_obs])
q_net, _ = create_q_model(X,name='q_net')
target_net, _ = create_q_model(X,name='target_net')
action_ph = tf.placeholder(tf.int32, shape=(None,))
q1_ph = tf.placeholder(tf.float32, shape=(None,))
q_net_0 = tf.reduce_sum(q_net*tf.one_hot(action_ph,n_action),axis=1)
target_net_0 = tf.reduce_sum(target_net*tf.one_hot(action_ph,n_action),axis=1)
cost_q_net = tf.square(q1_ph-q_net_0)
cost_target_net = tf.square(q1_ph-target_net_0)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
train_op_q_net = optimizer.minimize(cost_q_net,global_step=global_step)
train_op_target_net = optimizer.minimize(cost_target_net)
### Exploration policy
def epsilon_greedy(q_values, step = 0, eps_min = 0.05, eps_max = 1.0, eps_decay_steps = 100000):
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if rnd.random() < epsilon:
        return rnd.randint(0,n_action-1) # random action
    else:
        return np.argmax(q_values) # optimal action
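# Worked example with the defaults above and eps_decay_steps=100 (as used in the
# training loop below, where step is the episode index):
#   step = 0    -> epsilon = 1.0   (fully random)
#   step = 50   -> epsilon = 0.525
#   step >= 100 -> epsilon = 0.05  (mostly greedy)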
### Warmup memory and start training
init = tf.global_variables_initializer()
env = gym.make(GYM_NAME)
env = wrappers.Monitor(env,'./tmp/',force=True)
obs = env.reset()
action = env.action_space.sample()
memory = deque(maxlen=memory_cap)
mem_prob = deque(maxlen=memory_cap)
saver = tf.train.Saver()
with tf.Session() as sess:
    init.run()
    episode = 0
    iteration = 0
    episode_reward = 0
    while episode < max_episode:
        print('\riteration {}, episode {}, learning_rate {:8g}'.format(iteration, episode, learning_rate.eval()), end='')
        prev_obs, prev_action = obs, action
        obs, reward, done, _ = env.step(action)
        q_net_val = q_net.eval(feed_dict={X:np.expand_dims(obs,0)})
        action1 = epsilon_greedy(q_net_val,step=episode,eps_decay_steps=100)
        # target_net_val = target_net.eval(feed_dict={X:np.expand_dims(obs,0)})
        # action2 = epsilon_greedy(target_net_val,step=episode,eps_min=0.5,eps_decay_steps=50)
        # action = np.random.choice([action1,action2])
        action = action1
        memory.append([prev_obs, obs, reward, done, prev_action])
        mem_prob.append(10)
        episode_reward += reward
        if iteration >= memory_warmup:
            # Sample a random minibatch of transitions from the replay memory.
            # idx = np.random.choice(np.arange(len(mem_prob)),
            #                        size=batch_size,
            #                        replace=False,
            #                        p=np.array(mem_prob)/np.sum(mem_prob))
            idx = np.random.permutation(len(memory))[:batch_size]
            extract_mem = lambda k: np.array([memory[i][k] for i in idx])
            prev_obs_batch = extract_mem(0)
            obs_batch = extract_mem(1)
            reward_batch = extract_mem(2)
            done_batch = extract_mem(3)
            action_batch = extract_mem(4)
            # Each net is trained toward a one-step target built from the other net's next-state values.
            q_net_val, target_net_val = sess.run([q_net,target_net],feed_dict={X:obs_batch})
            q_batch = reward_batch + gamma*np.amax(q_net_val,axis=1)*(1-done_batch)
            target_batch = reward_batch + gamma*np.amax(target_net_val,axis=1)*(1-done_batch)
            train_op_q_net.run(feed_dict={X:prev_obs_batch,action_ph:action_batch,q1_ph:target_batch})
            # cost_val, _ = sess.run([cost_q_net,train_op_q_net],
            #                        feed_dict={X:prev_obs_batch,action_ph:action_batch,q1_ph:target_batch})
            train_op_target_net.run(feed_dict={X:prev_obs_batch,action_ph:action_batch,q1_ph:q_batch})
            # for b,i in enumerate(idx):
            #     mem_prob[i] = cost_val[b]
        if done:
            if episode%10 == 0 and iteration >= memory_warmup:
                print(' episode_reward {}'.format(episode_reward))
                feed_dict = {X:prev_obs_batch,action_ph:action_batch,q1_ph:q_batch}
                cost_val = sess.run([cost_q_net], feed_dict=feed_dict)
                print('\tcost_val {:8.4f}, q_net_val {:8.4f}, net_diff {:8.4f} / {:8.4f}'.format(
                    np.mean(cost_val), np.mean(q_net_val),
                    np.mean(q_net_val-target_net_val), np.std(q_net_val-target_net_val)))
            obs = env.reset()
            action = env.action_space.sample()
            episode_reward = 0
            episode += 1
        if iteration%5000 == 0:
            saver.save(sess,save_path)
        iteration += 1
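# A minimal evaluation sketch (assumes the graph, saver, and env defined above are still available):
# restore the latest checkpoint and run one greedy episode with the trained q_net.
with tf.Session() as sess:
    saver.restore(sess, save_path)
    obs, done, total_reward = env.reset(), False, 0
    while not done:
        q_val = q_net.eval(feed_dict={X: np.expand_dims(obs, 0)})
        obs, reward, done, _ = env.step(int(np.argmax(q_val)))
        total_reward += reward
    print('greedy episode reward: {}'.format(total_reward))
env.close()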