@BIGBALLON
Last active September 26, 2017 17:51
OpenAI CartPole-v0 DQN.
"""
Modified from MorvanZhou's code!
To learn more, visit my Python tutorial page: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou
More about Reinforcement learning: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
Dependencies:
tensorflow: 1.1.0
matplotlib
numpy
gym: 0.8.1
"""
import tensorflow as tf
import numpy as np
import gym
tf.set_random_seed(1)
np.random.seed(1)
# Hyper Parameters
EPISODE = 500
EPSILON_MIN = 0.01
EPSILON_DECAY= 0.996
BATCH_SIZE = 32
LR = 0.001 # learning rate
EPSILON = 1.0 # greedy policy
GAMMA = 0.95 # reward discount
TARGET_REPLACE_ITER = 10 # target update frequency
MEMORY_CAPACITY = 2000
MEMORY_COUNTER = 0 # for store experience
LEARNING_STEP_COUNTER = 0 # for target updating
env = gym.make('CartPole-v0') #.unwrapped
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]
MEMORY = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2)) # initialize memory
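# each row of MEMORY stores one transition: [s (N_STATES values), a, r, s_ (N_STATES values)]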
# tf placeholders
tf_s = tf.placeholder(tf.float32, [None, N_STATES])
tf_a = tf.placeholder(tf.int32, [None, ])
tf_r = tf.placeholder(tf.float32, [None, ])
tf_s_ = tf.placeholder(tf.float32, [None, N_STATES])
with tf.variable_scope('q'):        # evaluation network
    l_eval = tf.layers.dense(tf_s, 24, tf.nn.relu, kernel_initializer=tf.contrib.keras.initializers.he_normal())
    q = tf.layers.dense(l_eval, N_ACTIONS, kernel_initializer=tf.contrib.keras.initializers.he_normal())
with tf.variable_scope('q_next'):   # target network, not to train
    l_target = tf.layers.dense(tf_s_, 24, tf.nn.relu, trainable=False)
    q_next = tf.layers.dense(l_target, N_ACTIONS, trainable=False)
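# DQN target: y = r + GAMMA * max_a' Q_target(s', a'); the loss below is the mean
# squared error between y and Q(s, a) for the actions actually taken in the batch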
q_target = tf_r + GAMMA * tf.reduce_max(q_next, axis=1) # shape=(None, ),
a_indices = tf.stack([tf.range(tf.shape(tf_a)[0], dtype=tf.int32), tf_a], axis=1)
q_wrt_a = tf.gather_nd(params=q, indices=a_indices) # shape=(None, ), q for current state
loss = tf.reduce_mean(tf.squared_difference(q_target, q_wrt_a))
train_op = tf.train.AdamOptimizer(LR).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
def choose_action(s):
    s = s[np.newaxis, :]
    if np.random.uniform() > EPSILON:
        # forward feed the observation and get the q value for every action
        actions_value = sess.run(q, feed_dict={tf_s: s})
        action = np.argmax(actions_value)
    else:
        action = np.random.randint(0, N_ACTIONS)
    return action
def store_transition(s, a, r, s_):
    global MEMORY_COUNTER
    transition = np.hstack((s, [a, r], s_))
    # replace the old memory with new memory
    index = MEMORY_COUNTER % MEMORY_CAPACITY
    MEMORY[index, :] = transition
    MEMORY_COUNTER += 1
def learn():
    # update target net
    global LEARNING_STEP_COUNTER
    global EPSILON
    if LEARNING_STEP_COUNTER % TARGET_REPLACE_ITER == 0:
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_next')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q')
        sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
    LEARNING_STEP_COUNTER += 1
    # learning
    sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
    b_memory = MEMORY[sample_index, :]
    b_s = b_memory[:, :N_STATES]
    b_a = b_memory[:, N_STATES].astype(int)
    b_r = b_memory[:, N_STATES+1]
    b_s_ = b_memory[:, -N_STATES:]
    sess.run(train_op, {tf_s: b_s, tf_a: b_a, tf_r: b_r, tf_s_: b_s_})
    if EPSILON > EPSILON_MIN:
        EPSILON = EPSILON * EPSILON_DECAY
print('\nCollecting experience...')
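# Training loop: interact with the environment, store each transition, and start
# learning from the replay memory once it has been filled at least once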
for i_episode in range(EPISODE):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = choose_action(s)
        # take action
        s_, r, done, info = env.step(a)
        ep_r += r
        if done:
            r = -10
        store_transition(s, a, r, s_)
        if MEMORY_COUNTER > MEMORY_CAPACITY:
            learn()
            if done:
                print('Ep: ', i_episode,
                      '| Ep_r: ', round(ep_r, 2))
        if done:
            break
        s = s_

BIGBALLON commented Sep 26, 2017

Modified from MorvanZhou's code. The original used a custom, hand-crafted reward; here it is changed back to the environment's native reward.
In addition, the weight initialization was switched to He initialization,
the learning rate was changed to 0.001,
and epsilon decay was added (see the rough sketch below).
In the final state (if done), the reward is set to -10 to strengthen the penalty.
The score stabilizes at 200 after roughly 250 episodes.
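As a rough sanity check on the decay schedule (a back-of-the-envelope sketch, not part of the gist; it only assumes the constants defined above and one decay per learn() call):

import math

EPSILON, EPSILON_DECAY, EPSILON_MIN = 1.0, 0.996, 0.01
# number of learn() calls until EPSILON * EPSILON_DECAY**n drops to EPSILON_MIN
steps = math.ceil(math.log(EPSILON_MIN / EPSILON) / math.log(EPSILON_DECAY))
print(steps)  # ~1149 learning steps, so exploration stays above the floor well into training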
[Training curve image]
