OpenAI CartPole-v0 DQN.
""" | |
modfied from MorvanZhou' code! | |
Know more, visit my Python tutorial page: https://morvanzhou.github.io/tutorials/ | |
My Youtube Channel: https://www.youtube.com/user/MorvanZhou | |
More about Reinforcement learning: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/ | |
Dependencies: | |
tensorflow: 1.1.0 | |
matplotlib | |
numpy | |
gym: 0.8.1 | |
""" | |
import tensorflow as tf
import numpy as np
import gym

tf.set_random_seed(1)
np.random.seed(1)

# Hyperparameters
EPISODE = 500
EPSILON_MIN = 0.01
EPSILON_DECAY = 0.996
BATCH_SIZE = 32
LR = 0.001                  # learning rate
EPSILON = 1.0               # initial epsilon for the epsilon-greedy policy
GAMMA = 0.95                # reward discount
TARGET_REPLACE_ITER = 10    # target network update frequency
MEMORY_CAPACITY = 2000
MEMORY_COUNTER = 0          # counts stored experiences
LEARNING_STEP_COUNTER = 0   # counts learning steps, for target updating

env = gym.make('CartPole-v0')   # .unwrapped
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]
MEMORY = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))   # initialize replay memory

# tf placeholders
tf_s = tf.placeholder(tf.float32, [None, N_STATES])
tf_a = tf.placeholder(tf.int32, [None, ])
tf_r = tf.placeholder(tf.float32, [None, ])
tf_s_ = tf.placeholder(tf.float32, [None, N_STATES])

with tf.variable_scope('q'):        # evaluation network
    l_eval = tf.layers.dense(tf_s, 24, tf.nn.relu, kernel_initializer=tf.contrib.keras.initializers.he_normal())
    q = tf.layers.dense(l_eval, N_ACTIONS, kernel_initializer=tf.contrib.keras.initializers.he_normal())
with tf.variable_scope('q_next'):   # target network, not to train
    l_target = tf.layers.dense(tf_s_, 24, tf.nn.relu, trainable=False)
    q_next = tf.layers.dense(l_target, N_ACTIONS, trainable=False)

q_target = tf_r + GAMMA * tf.reduce_max(q_next, axis=1)   # shape=(None, )
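# NOTE: q_target keeps the bootstrap term GAMMA * max Q' even for terminal transitions;
# the -10 reward assigned on 'done' in the training loop below acts as the terminal penalty instead.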
a_indices = tf.stack([tf.range(tf.shape(tf_a)[0], dtype=tf.int32), tf_a], axis=1)
q_wrt_a = tf.gather_nd(params=q, indices=a_indices)       # shape=(None, ), q for the chosen action in the current state
loss = tf.reduce_mean(tf.squared_difference(q_target, q_wrt_a))
train_op = tf.train.AdamOptimizer(LR).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())


def choose_action(s):
    s = s[np.newaxis, :]
    if np.random.uniform() > EPSILON:
        # feed the observation forward and get the q value for every action
        actions_value = sess.run(q, feed_dict={tf_s: s})
        action = np.argmax(actions_value)
    else:
        action = np.random.randint(0, N_ACTIONS)
    return action


def store_transition(s, a, r, s_):
    global MEMORY_COUNTER
    transition = np.hstack((s, [a, r], s_))
    # replace the old memory with new memory
    index = MEMORY_COUNTER % MEMORY_CAPACITY
    MEMORY[index, :] = transition
    MEMORY_COUNTER += 1


def learn():
    global LEARNING_STEP_COUNTER
    global EPSILON

    # update the target network
    if LEARNING_STEP_COUNTER % TARGET_REPLACE_ITER == 0:
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_next')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q')
        sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
    LEARNING_STEP_COUNTER += 1

    # learning: sample a batch from the replay memory and run one gradient step
    sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
    b_memory = MEMORY[sample_index, :]
    b_s = b_memory[:, :N_STATES]
    b_a = b_memory[:, N_STATES].astype(int)
    b_r = b_memory[:, N_STATES + 1]
    b_s_ = b_memory[:, -N_STATES:]
    sess.run(train_op, {tf_s: b_s, tf_a: b_a, tf_r: b_r, tf_s_: b_s_})

    # decay epsilon
    if EPSILON > EPSILON_MIN:
        EPSILON = EPSILON * EPSILON_DECAY


print('\nCollecting experience...')
for i_episode in range(EPISODE):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = choose_action(s)

        # take action
        s_, r, done, info = env.step(a)
        ep_r += r
        if done:
            r = -10    # penalize the terminal state
        store_transition(s, a, r, s_)

        if MEMORY_COUNTER > MEMORY_CAPACITY:
            learn()
            if done:
                print('Ep: ', i_episode,
                      '| Ep_r: ', round(ep_r, 2))

        if done:
            break
        s = s_
Modified from MorvanZhou's code. The original used a custom, hand-crafted reward; here it is changed back to the environment's native reward.

In addition, the weight initialization was switched to He initialization, the learning rate was changed to 0.001, and epsilon decay was added. At the terminal state (if done), the reward is set to -10 to strengthen the penalty.

The score stabilizes at 200 after roughly 250 episodes.
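As a rough sanity check on the epsilon schedule above (a minimal sketch, not part of the gist; it only assumes EPSILON_DECAY = 0.996 and EPSILON_MIN = 0.01 as set in the hyperparameters), epsilon is multiplied by EPSILON_DECAY once per learn() call, so the number of learning steps before it reaches its floor is:

import math

EPSILON_DECAY = 0.996
EPSILON_MIN = 0.01

# smallest n such that 1.0 * EPSILON_DECAY ** n <= EPSILON_MIN
n = math.ceil(math.log(EPSILON_MIN) / math.log(EPSILON_DECAY))
print(n)   # about 1149 learn() calls

Since learn() runs once per environment step after the 2000-transition memory fills up, this suggests epsilon reaches its floor somewhat before the ~250-episode mark mentioned above.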