@BIGBALLON
Last active September 26, 2017 17:51
OpenAI CartPole-v0 DQN.
"""
Modified from MorvanZhou's code!
To learn more, visit my Python tutorial page: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou
More about Reinforcement learning: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
Dependencies:
tensorflow: 1.1.0
matplotlib
numpy
gym: 0.8.1
"""
import tensorflow as tf
import numpy as np
import gym
tf.set_random_seed(1)
np.random.seed(1)
# Hyper Parameters
EPISODE = 500
EPSILON_MIN = 0.01
EPSILON_DECAY= 0.996
BATCH_SIZE = 32
LR = 0.001 # learning rate
EPSILON = 1.0 # greedy policy
GAMMA = 0.95 # reward discount
TARGET_REPLACE_ITER = 10 # target update frequency
MEMORY_CAPACITY = 2000
MEMORY_COUNTER = 0 # for store experience
LEARNING_STEP_COUNTER = 0 # for target updating
env = gym.make('CartPole-v0') #.unwrapped
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]
MEMORY = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2)) # initialize memory
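# each row of MEMORY stores one transition: [s (N_STATES values), a, r, s_ (N_STATES values)]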
# tf placeholders
tf_s = tf.placeholder(tf.float32, [None, N_STATES])
tf_a = tf.placeholder(tf.int32, [None, ])
tf_r = tf.placeholder(tf.float32, [None, ])
tf_s_ = tf.placeholder(tf.float32, [None, N_STATES])
with tf.variable_scope('q'):        # evaluation network
    l_eval = tf.layers.dense(tf_s, 24, tf.nn.relu, kernel_initializer=tf.contrib.keras.initializers.he_normal())
    q = tf.layers.dense(l_eval, N_ACTIONS, kernel_initializer=tf.contrib.keras.initializers.he_normal())
with tf.variable_scope('q_next'):   # target network, not to train
    l_target = tf.layers.dense(tf_s_, 24, tf.nn.relu, trainable=False)
    q_next = tf.layers.dense(l_target, N_ACTIONS, trainable=False)
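# DQN target: y = r + GAMMA * max_a' Q_target(s', a'); the loss below is the mean
# squared error between y and Q(s, a) for the actions actually taken in the batch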
q_target = tf_r + GAMMA * tf.reduce_max(q_next, axis=1) # shape=(None, ),
a_indices = tf.stack([tf.range(tf.shape(tf_a)[0], dtype=tf.int32), tf_a], axis=1)
q_wrt_a = tf.gather_nd(params=q, indices=a_indices) # shape=(None, ), q for current state
loss = tf.reduce_mean(tf.squared_difference(q_target, q_wrt_a))
train_op = tf.train.AdamOptimizer(LR).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
def choose_action(s):
    s = s[np.newaxis, :]
    if np.random.uniform() > EPSILON:
        # forward feed the observation and get the q value for every action
        actions_value = sess.run(q, feed_dict={tf_s: s})
        action = np.argmax(actions_value)
    else:
        action = np.random.randint(0, N_ACTIONS)
    return action
def store_transition(s, a, r, s_):
    global MEMORY_COUNTER
    transition = np.hstack((s, [a, r], s_))
    # replace the old memory with new memory
    index = MEMORY_COUNTER % MEMORY_CAPACITY
    MEMORY[index, :] = transition
    MEMORY_COUNTER += 1
def learn():
    # update target net
    global LEARNING_STEP_COUNTER
    global EPSILON
    if LEARNING_STEP_COUNTER % TARGET_REPLACE_ITER == 0:
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_next')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q')
        sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
    LEARNING_STEP_COUNTER += 1
    # learning
    sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
    b_memory = MEMORY[sample_index, :]
    b_s = b_memory[:, :N_STATES]
    b_a = b_memory[:, N_STATES].astype(int)
    b_r = b_memory[:, N_STATES+1]
    b_s_ = b_memory[:, -N_STATES:]
    sess.run(train_op, {tf_s: b_s, tf_a: b_a, tf_r: b_r, tf_s_: b_s_})
    if EPSILON > EPSILON_MIN:
        EPSILON = EPSILON * EPSILON_DECAY
print('\nCollecting experience...')
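# Training loop: interact with the environment, store each transition, and start
# learning from the replay memory once it has been filled at least once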
for i_episode in range(EPISODE):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = choose_action(s)
        # take action
        s_, r, done, info = env.step(a)
        ep_r += r
        if done:
            r = -10
        store_transition(s, a, r, s_)
        if MEMORY_COUNTER > MEMORY_CAPACITY:
            learn()
            if done:
                print('Ep: ', i_episode,
                      '| Ep_r: ', round(ep_r, 2))
        if done:
            break
        s = s_

BIGBALLON commented Sep 26, 2017

Modified from MorvanZhou's code. The original used a custom, hand-crafted reward; here it is changed back to the environment's native reward.
In addition, the weight initialization was switched to He initialization,
the learning rate was changed to 0.001,
and epsilon decay was added (see the rough sketch below).
In the final state (if done), the reward is set to -10 to strengthen the penalty.
The score stabilizes at 200 after roughly 250 episodes.
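As a rough sanity check on the decay schedule (a back-of-the-envelope sketch, not part of the gist; it only assumes the constants defined above and one decay per learn() call):

import math

EPSILON, EPSILON_DECAY, EPSILON_MIN = 1.0, 0.996, 0.01
# number of learn() calls until EPSILON * EPSILON_DECAY**n drops to EPSILON_MIN
steps = math.ceil(math.log(EPSILON_MIN / EPSILON) / math.log(EPSILON_DECAY))
print(steps)  # ~1149 learning steps, so exploration stays above the floor well into training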
[Training curve image]
