jkarnows-cartpole-v0-dqn
Forked from imironhead/openai_cartpole_v0_dqn.py

""" | |
Solve OpenAI Gym Cartpole V0 with DQN. | |
TensorFlow code by TiehHung Chuang (imironhead), 2016-09-06 | |
tf-slim code by Jeremy Karnowski jkarnows, 2016-09-07 | |
""" | |
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import numpy as np
import random
import gym
import tensorflow as tf
import tensorflow.contrib.slim as slim
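
# Compatibility note (added for context): this script targets the 2016-era
# stack it was written for. tf.mul and tf.initialize_all_variables predate
# TensorFlow 1.0, tensorflow.contrib.slim was removed in TensorFlow 2.0, and
# env.monitor / gym.upload were dropped from later gym releases; xrange in the
# __main__ block implies Python 2.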


class DeepQLearningAgent(object):
    def __init__(self, state_space, action_space, network_layers):
        self._action_space = action_space
        self._dim_state = state_space.shape[0]
        self._dim_action = action_space.n
        self._batch_size = 200
        self._gamma = 0.95

        self._prev_state = None
        self._prev_action = None
        self._prev_reward = 0

        prev_states = tf.placeholder(tf.float32, [None, self._dim_state])
        net = slim.stack(prev_states, slim.fully_connected, network_layers,
                         activation_fn=tf.nn.relu, scope='fc')
        prev_action_values = slim.fully_connected(net, 2, activation_fn=None, scope='qvalues')
        prev_action_masks = tf.placeholder(tf.float32, [None, self._dim_action])
        prev_values = tf.reduce_sum(tf.mul(prev_action_values, prev_action_masks), reduction_indices=1)

        prev_rewards = tf.placeholder(tf.float32, [None, ])
        next_states = tf.placeholder(tf.float32, [None, self._dim_state])
        net = slim.stack(next_states, slim.fully_connected, network_layers,
                         activation_fn=tf.nn.relu, scope='fc', reuse=True)
        next_action_values = slim.fully_connected(net, 2, activation_fn=None, scope='qvalues', reuse=True)
        next_values = prev_rewards + self._gamma * tf.reduce_max(next_action_values, reduction_indices=1)

        loss = tf.reduce_mean(tf.square(prev_values - next_values))
        training = tf.train.AdamOptimizer(1e-4).minimize(loss)
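        # Reading note on the graph above: it implements the one-step
        # Q-learning regression target
        #     y = r + gamma * max_a' Q(s', a'),
        # where prev_values is Q(s, a) for the action actually taken (picked
        # out by the one-hot action mask) and the loss is the mean squared TD
        # error (y - Q(s, a))^2. Both Q estimates share one set of weights
        # (scope 'fc'/'qvalues' built with reuse=True), so there is no separate
        # target network and, absent a tf.stop_gradient, gradients also flow
        # through the next-state Q values.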

        self._tf_action_value_predict = prev_action_values
        self._tf_prev_states = prev_states
        self._tf_prev_action_masks = prev_action_masks
        self._tf_prev_rewards = prev_rewards
        self._tf_next_states = next_states
        self._tf_training = training
        self._tf_loss = loss

        self._tf_session = tf.InteractiveSession()
        self._tf_session.run(tf.initialize_all_variables())

        # Build the replay memory D which stores experiences.
        self._time = 0
        self._epsilon = 1.0
        self._epsilon_decay_time = 100
        self._epsilon_decay_rate = 0.9
        self._experiences_max = 1000
        self._experiences_num = 0
        self._experiences_prev_states = np.zeros((self._experiences_max, self._dim_state))
        self._experiences_next_states = np.zeros((self._experiences_max, self._dim_state))
        self._experiences_rewards = np.zeros((self._experiences_max))
        self._experiences_actions_mask = np.zeros((self._experiences_max, self._dim_action))
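        # The replay memory is laid out as four parallel fixed-size numpy
        # arrays (previous state, next state, reward, one-hot action mask)
        # that share a slot index, rather than a list of transition tuples;
        # create_experience below fills the slots in order until the memory
        # is full.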

    def create_experience(self, prev_state, prev_action, reward, next_state):
        """
        Keep an experience for later training.
        """
        if self._experiences_num >= self._experiences_max:
            idx = np.random.choice(self._experiences_max)
        else:
            idx = self._experiences_num
            self._experiences_num += 1

        self._experiences_prev_states[idx] = np.array(prev_state)
        self._experiences_next_states[idx] = np.array(next_state)
        self._experiences_rewards[idx] = reward
        self._experiences_actions_mask[idx] = np.zeros(self._dim_action)
        self._experiences_actions_mask[idx, prev_action] = 1.0
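
    # Note: once the memory is full, a new experience overwrites a slot chosen
    # uniformly at random rather than the oldest one; the original DQN paper
    # instead keeps a sliding window of the most recent transitions.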

    def train(self):
        """
        Train the deep q-learning network.
        """
        # Start training only when there are enough experiences.
        if self._experiences_num < self._experiences_max:
            return

        ixs = np.random.choice(self._experiences_max, self._batch_size, replace=True)

        fetches = [self._tf_loss, self._tf_training]

        feed = {
            self._tf_prev_states: self._experiences_prev_states[ixs],
            self._tf_prev_action_masks: self._experiences_actions_mask[ixs],
            self._tf_prev_rewards: self._experiences_rewards[ixs],
            self._tf_next_states: self._experiences_next_states[ixs]
        }

        loss, _ = self._tf_session.run(fetches, feed_dict=feed)
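
    # train() is called from act() on every environment step, so once the
    # memory holds self._experiences_max transitions, a minibatch of
    # self._batch_size experiences (sampled with replacement) is trained on
    # at every step; the fetched loss is assigned but not otherwise used.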

    def act(self, observation, reward, done):
        """
        Ask the next action from the agent.
        """
        self._time += 1
        if self._time % self._epsilon_decay_time == 0:
            self._epsilon *= self._epsilon_decay_rate

        if np.random.rand() > self._epsilon:
            states = np.array([observation])
            action_values = self._tf_action_value_predict.eval(
                feed_dict={self._tf_prev_states: states})
            action = np.argmax(action_values)
        else:
            action = self._action_space.sample()

        if self._prev_state is not None:
            if done:
                reward = -500.0
                observation = np.zeros_like(observation)

            self.create_experience(
                self._prev_state, self._prev_action, reward, observation)

        self._prev_state = None if done else observation
        self._prev_action = None if done else action
        self._prev_reward = 0 if done else self._prev_reward + reward

        self.train()

        return action
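
# A note on act(): the policy is epsilon-greedy, with epsilon starting at 1.0
# and multiplied by 0.9 every 100 calls, so exploration decays over the first
# few thousand environment steps. Terminal transitions are stored with a reward
# of -500.0 and an all-zero next state, a strong penalty meant to discourage
# ending the episode early.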


if __name__ == '__main__':
    # Environment settings
    max_episodes = 1000
    max_steps = 200

    # Agent settings
    network_layers = [128, 128, 128]

    # Recording settings
    record = True
    save_filename = ''
    api_key = ''
    algorithm_id = ''  # Deep Q-learning

    # Initialize simulation
    env = gym.make('CartPole-v0')

    # Start recording
    if record:
        env.monitor.start(save_filename, force=True)

    # Create agent
    agent = DeepQLearningAgent(env.observation_space, env.action_space, network_layers)

    # Run simulation
    for episode in xrange(max_episodes):
        observation, reward, done = env.reset(), 0.0, False
        for step in xrange(max_steps):
            action = agent.act(observation, reward, done)
            if done or step + 1 == max_steps:
                print("{} - {}".format(episode, step))
                break
            observation, reward, done, _ = env.step(action)

    # Stop recording and upload to gym
    if record:
        env.monitor.close()
        gym.upload(save_filename, api_key=api_key, algorithm_id=algorithm_id)
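
# Usage note: to run this as-is, fill in save_filename (the monitor output
# directory), api_key and algorithm_id before enabling the upload. The printed
# "episode - step" lines show roughly how long each episode lasted; CartPole-v0
# is considered solved when the average reward over 100 consecutive episodes
# reaches 195, which is the benchmark this agent is aiming for.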