MountainCar-v0 Double Deep Q Net
# Two Q nets are used. Each net uses itself to estimate Q(t,a) but uses the other net's max_a Q(t+1,a)
# for the bootstrap target. Training still uses the one-step algorithm for both Q nets.
# Based on arXiv:1509.06461 [cs.LG]
# https://lirnli.wordpress.com/2017/08/17/debugging-reinforcement-neural-network-deep-q-net/
#
# Hyperparameter summary:
#   reward decay rate = 0.999
#   replay memory (10000 observations)
#   AdamOptimizer, with learning rate decay
#   tanh activation function
#   input(2) -> layer1(64) -> layer2(128) -> output(3)
#   l2 regularization
#
# Some observations:
#   Use a large reward decay rate so final-state rewards can propagate back, in theory. E.g. 0.999^200 ≈ 0.819.
#   The net needs to be much larger than I thought at first.
#   Learning-rate decay helps finalize the net. (Feels kind of like cheating, but still better than early stopping.)
#   This is because the final training stage is very sensitive, and performance can collapse within a few iterations.
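#
# In equation form, the one-step targets computed in the training loop below are
#   target for q_net:      r + gamma * max_a target_net(s', a)
#   target for target_net: r + gamma * max_a q_net(s', a)
# with the bootstrap term dropped on terminal transitions.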
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import gym
from gym import wrappers
import random as rnd
from collections import deque
from IPython import display
from datetime import datetime

GYM_NAME = 'MountainCar-v0'
# GYM_NAME = 'CartPole-v0'
env = gym.make(GYM_NAME)
n_obs, = env.observation_space.shape
n_action = env.action_space.n
env.close()
### Hyperparameters
tf.reset_default_graph()
start_learning_rate = 0.001
gamma = 0.999  # decay_rate
momentum = 0.9
memory_cap = 10000
max_episode = 3000
batch_size = 256
memory_warmup = 2*batch_size
# merge_net_freq = 4000
reg_scale = 0.01
save_path = './Double_DQN.ckpt'
he_init = tf.contrib.layers.variance_scaling_initializer()
xavier_init = tf.contrib.layers.xavier_initializer()
l1_reg = tf.contrib.layers.l1_regularizer(reg_scale)
l2_reg = tf.contrib.layers.l2_regularizer(reg_scale)
### Set up net and cost function
training = tf.placeholder_with_default(False, shape=())

def create_q_model(X, name='None'):
    with tf.variable_scope(name) as scope:
        hid1 = tf.layers.dense(X, 64, activation=tf.nn.tanh, kernel_initializer=he_init, kernel_regularizer=l2_reg)
        # dropout1 = tf.layers.dropout(hid1, rate=0.5, training=training)
        hid2 = tf.layers.dense(hid1, 128, activation=tf.nn.tanh, kernel_initializer=he_init, kernel_regularizer=l2_reg)
        # dropout2 = tf.layers.dropout(hid2, rate=0.5, training=training)
        # hid3 = tf.layers.dense(hid2, 128, activation=tf.nn.tanh, kernel_initializer=he_init)  # , kernel_regularizer=l2_reg
        q = tf.layers.dense(hid2, n_action, kernel_initializer=he_init)
        q_vars = {var.name[len(scope.name):]: var
                  for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}
    return q, q_vars
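
# Note: create_q_model (above) builds the 2-64-128-3 MLP described in the header and returns the Q-value
# output tensor together with a dict of its trainable variables keyed by name (apparently intended for
# copying weights between the two nets, cf. the commented-out merge_net_freq; unused in this version).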
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, 350000, 0.01)

X = tf.placeholder(tf.float32, shape=[None, n_obs])
q_net, _ = create_q_model(X, name='q_net')
target_net, _ = create_q_model(X, name='target_net')

action_ph = tf.placeholder(tf.int32, shape=(None,))
q1_ph = tf.placeholder(tf.float32, shape=(None,))
q_net_0 = tf.reduce_sum(q_net*tf.one_hot(action_ph, n_action), axis=1)
target_net_0 = tf.reduce_sum(target_net*tf.one_hot(action_ph, n_action), axis=1)
cost_q_net = tf.square(q1_ph - q_net_0)
cost_target_net = tf.square(q1_ph - target_net_0)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
train_op_q_net = optimizer.minimize(cost_q_net, global_step=global_step)
train_op_target_net = optimizer.minimize(cost_target_net)
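
# Note on the cost setup above: each net is trained toward a target fed in through q1_ph.
# q_net_0 / target_net_0 pick out the Q-value of the action actually taken (via the one-hot mask),
# and the cost is the squared TD error. In the training loop below, the target fed to q_net is
# built from target_net's values and vice versa.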
### Exploration policy
def epsilon_greedy(q_values, step=0, eps_min=0.05, eps_max=1.0, eps_decay_steps=100000):
    epsilon = max(eps_min, eps_max - (eps_max - eps_min)*step/eps_decay_steps)
    if rnd.random() < epsilon:
        return rnd.randint(0, n_action - 1)  # random action
    else:
        return np.argmax(q_values)  # optimal action
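
# Epsilon decays linearly with `step`. With the call in the loop below (step=episode, eps_decay_steps=100),
# episode 50 gives epsilon = 1.0 - 0.95*50/100 = 0.525, and from episode 100 onward epsilon stays at eps_min = 0.05.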
### Warmup memory and start training
init = tf.global_variables_initializer()
env = gym.make(GYM_NAME)
env = wrappers.Monitor(env, './tmp/', force=True)
obs = env.reset()
action = env.action_space.sample()
memory = deque(maxlen=memory_cap)
mem_prob = deque(maxlen=memory_cap)
saver = tf.train.Saver()

with tf.Session() as sess:
    init.run()
    episode = 0
    iteration = 0
    episode_reward = 0
    while episode < max_episode:
        print('\riteration {}, episode {}, learning_rate {:8g}'.format(iteration, episode, learning_rate.eval()), end='')
        prev_obs, prev_action = obs, action
        obs, reward, done, _ = env.step(action)
        q_net_val = q_net.eval(feed_dict={X: np.expand_dims(obs, 0)})
        action1 = epsilon_greedy(q_net_val, step=episode, eps_decay_steps=100)
        # target_net_val = target_net.eval(feed_dict={X: np.expand_dims(obs, 0)})
        # action2 = epsilon_greedy(target_net_val, step=episode, eps_min=0.5, eps_decay_steps=50)
        # action = np.random.choice([action1, action2])
        action = action1
        memory.append([prev_obs, obs, reward, done, prev_action])
        mem_prob.append(10)
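        # mem_prob stores a constant placeholder weight per transition; it is only used by the
        # commented-out prioritized-sampling code below, so sampling is effectively uniform here.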
        episode_reward += reward
        if iteration >= memory_warmup:
            # idx = np.random.choice(np.arange(len(mem_prob)),
            #                        size=batch_size,
            #                        replace=False,
            #                        p=np.array(mem_prob)/np.sum(mem_prob))
            idx = np.random.permutation(len(memory))[:batch_size]
            extract_mem = lambda k: np.array([memory[i][k] for i in idx])
            prev_obs_batch = extract_mem(0)
            obs_batch = extract_mem(1)
            reward_batch = extract_mem(2)
            done_batch = extract_mem(3)
            action_batch = extract_mem(4)
            q_net_val, target_net_val = sess.run([q_net, target_net], feed_dict={X: obs_batch})
            q_batch = reward_batch + gamma*np.amax(q_net_val, axis=1)*(1 - done_batch)
            target_batch = reward_batch + gamma*np.amax(target_net_val, axis=1)*(1 - done_batch)
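            # Double-Q update: q_net is regressed toward target_batch (bootstrapped from target_net's max),
            # while target_net is regressed toward q_batch (bootstrapped from q_net's max).
            # The (1 - done_batch) factor drops the bootstrap term for terminal transitions.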
            train_op_q_net.run(feed_dict={X: prev_obs_batch, action_ph: action_batch, q1_ph: target_batch})
            # cost_val, _ = sess.run([cost_q_net, train_op_q_net],
            #                        feed_dict={X: prev_obs_batch, action_ph: action_batch, q1_ph: target_batch})
            train_op_target_net.run(feed_dict={X: prev_obs_batch, action_ph: action_batch, q1_ph: q_batch})
            # for b, i in enumerate(idx):
            #     mem_prob[i] = cost_val[b]
        if done:
            if episode % 10 == 0 and iteration >= memory_warmup:
                print(' episode_reward {}'.format(episode_reward))
                feed_dict = {X: prev_obs_batch, action_ph: action_batch, q1_ph: q_batch}
                cost_val = sess.run([cost_q_net], feed_dict=feed_dict)
                print('\tcost_val {:8.4f}, q_net_val {:8.4f}, net_diff {:8.4f} / {:8.4f}'.format(
                    np.mean(cost_val), np.mean(q_net_val),
                    np.mean(q_net_val - target_net_val), np.std(q_net_val - target_net_val)))
            obs = env.reset()
            action = env.action_space.sample()
            episode_reward = 0
            episode += 1
        if iteration % 5000 == 0:
            saver.save(sess, save_path)
        iteration += 1
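
# The Monitor wrapper records episode results under ./tmp/, and the session is checkpointed to
# save_path ('./Double_DQN.ckpt') every 5000 iterations.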