import random
import numpy as np
import math
from time import perf_counter
import os
import sys
from collections import deque
import gym
import cntk
from cntk.layers import Dense
from cntk.models import Sequential
env = gym.make('CartPole-v0')
STATE_DIM = env.observation_space.shape[0]
NUM_ACTIONS = env.action_space.n
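
# For CartPole-v0, STATE_DIM = 4 (cart position, cart velocity, pole angle,
# pole angular velocity) and NUM_ACTIONS = 2 (push cart left / push cart right).
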
'''
The neural network and model
'''
class Brain:
    BATCH_SIZE = 50

    def __init__(self):
        #### Construct the model ####
        observation = cntk.ops.input_variable(STATE_DIM, np.float32, name="s")
        q_target = cntk.ops.input_variable(NUM_ACTIONS, np.float32, name="q")

        # Define the structure of the neural network
        fully_connected_layer = Dense(64, activation=cntk.ops.relu)
        output_layer = Dense(NUM_ACTIONS)
        self.model = Sequential([fully_connected_layer, output_layer])(observation)
        self.params = {"W1": fully_connected_layer.W,
                       "b1": fully_connected_layer.b,
                       "W2": output_layer.W,
                       "b2": output_layer.b}

        #### Define the trainer ####
        self.learning_rate = 0.00025
        self.loss = cntk.ops.reduce_mean(cntk.ops.square(self.model - q_target), axis=0)
        mean_error = cntk.ops.reduce_mean(cntk.ops.square(self.model - q_target), axis=0)
        learner = cntk.sgd(self.model.parameters, self.learning_rate / self.BATCH_SIZE,
                           gradient_clipping_threshold_per_sample=10)
        self.trainer = cntk.Trainer(self.model, self.loss, mean_error, learner)

    def train(self, x, y, epoch=1, verbose=0):
        # Bind the minibatch arrays to the loss's two input variables
        data = dict(zip(self.loss.arguments, [y, x]))
        self.trainer.train_minibatch(data, outputs=[self.loss.output])

    def predict(self, s):
        return self.model.eval(s)
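
# Usage sketch (illustrative only, not part of the training flow): predict()
# wraps CNTK's eval(), whose output is unwrapped elsewhere in this file as
# [0][row] for batched inputs:
#   brain = Brain()
#   q = brain.predict(np.zeros(STATE_DIM, dtype=np.float32))
#   best_action = np.argmax(q)  # greedy action for a single state
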
class Memory:  # stored as (s, a, r, s')
    def __init__(self, capacity):
        self.examplers = deque(maxlen=capacity)
        self.capacity = capacity

    def add(self, sample):
        self.examplers.append(sample)

    def get_random_samples(self, num_samples):
        num_samples = min(num_samples, len(self.examplers))
        return random.sample(tuple(self.examplers), num_samples)
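
# Usage sketch: the buffer holds (s, a, r, s') tuples and evicts the oldest
# once capacity is reached (deque maxlen); sampling is uniform and without
# replacement:
#   memory = Memory(1000)
#   memory.add((s, a, r, s_next))
#   batch = memory.get_random_samples(50)
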
class Agent:
    MEMORY_CAPACITY = 100000
    DISCOUNT_FACTOR = 0.99
    MAX_EXPLORATION_RATE = 1.0
    MIN_EXPLORATION_RATE = 0.01
    DECAY_RATE = 0.0001

    def __init__(self):
        self.explore_rate = self.MAX_EXPLORATION_RATE
        self.brain = Brain()
        self.memory = Memory(self.MEMORY_CAPACITY)
        self.steps = 0

    def act(self, s):
        if random.random() < self.explore_rate:
            return random.randint(0, NUM_ACTIONS - 1)
        else:
            return np.argmax(self.brain.predict(s))

    def observe(self, sample):
        self.steps += 1
        self.memory.add(sample)

        # Decay the exploration rate exponentially toward MIN_EXPLORATION_RATE
        self.explore_rate = self.MIN_EXPLORATION_RATE + (self.MAX_EXPLORATION_RATE - self.MIN_EXPLORATION_RATE) * math.exp(-self.DECAY_RATE * self.steps)
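
    # A rough sketch of the schedule with DECAY_RATE = 1e-4:
    #   explore_rate = 0.01 + 0.99 * exp(-1e-4 * steps)
    #   which is ~0.61 after 5,000 steps and ~0.14 after 20,000 steps.
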
    def replay(self):
        batch = self.memory.get_random_samples(self.brain.BATCH_SIZE)
        batch_len = len(batch)

        states = np.array([sample[0] for sample in batch], dtype=np.float32)
        no_state = np.zeros(STATE_DIM)
        resultant_states = np.array([(no_state if sample[3] is None else sample[3]) for sample in batch], dtype=np.float32)

        q_values_batch = self.brain.predict(states)
        future_q_values_batch = self.brain.predict(resultant_states)

        x = np.zeros((batch_len, STATE_DIM)).astype(np.float32)
        y = np.zeros((batch_len, NUM_ACTIONS)).astype(np.float32)

        for i in range(batch_len):
            state, action, reward, resultant_state = batch[i]

            q_values = q_values_batch[0][i]
            if resultant_state is None:
                q_values[action] = reward
            else:
                q_values[action] = reward + self.DISCOUNT_FACTOR * np.amax(future_q_values_batch[0][i])

            x[i] = state
            y[i] = q_values

        self.brain.train(x, y)
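
# The targets built in Agent.replay follow the standard Q-learning update:
#   y[a] = r                                          if s' is terminal
#   y[a] = r + DISCOUNT_FACTOR * max_a' Q(s', a')     otherwise
# All other entries of y keep the network's current predictions, so only the
# taken action's Q-value receives a learning signal.
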
def run_simulation(agent):
    state = env.reset()
    reward_sum = 0

    while True:
        # env.render()
        action = agent.act(state.astype(np.float32))
        resultant_state, reward, done, info = env.step(action)
        if done:  # terminal state
            resultant_state = None

        agent.observe((state, action, reward, resultant_state))
        agent.replay()

        state = resultant_state
        reward_sum += reward

        if done:
            return reward_sum
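
# Note: replay() is called after every environment step, so an episode of
# length T performs T minibatch updates on randomly drawn past transitions.
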
def test(model_path, num_episodes=10):
    root = cntk.load_model(model_path)

    for episode in range(num_episodes):
        observation = env.reset()  # reset environment for each new episode
        done = False
        while not done:
            env.render()
            action = np.argmax(root.eval(observation.astype(np.float32)))
            observation, reward, done, info = env.step(action)

if __name__ == "__main__":
    GYM_ENABLE_UPLOAD = True
    GYM_VIDEO_PATH = os.path.join(os.getcwd(), "videos", "cart_pole_dpn_cntk")
    # Assumed fix: the original referenced GYM_API_KEY without defining it;
    # here it is read from the environment for the gym.upload call below.
    GYM_API_KEY = os.environ.get("GYM_API_KEY", "")

    MAX_NUM_EPISODES = 5000
    STREAK_TO_END = 120
    DONE_REWARD_LEVEL = 200

    TRAINED_MODEL_DIR = os.path.join(os.getcwd(), "trained_models")
    if not os.path.exists(TRAINED_MODEL_DIR):
        os.makedirs(TRAINED_MODEL_DIR)
    TRAINED_MODEL_NAME = "cart_pole_dpn.mod"

    if len(sys.argv) < 2 or sys.argv[1] != "test_only":
        if GYM_ENABLE_UPLOAD:
            env.monitor.start(GYM_VIDEO_PATH, force=True)

        agent = Agent()
        episode_number = 0
        num_streaks = 0
        reward_sum = 0
        solved_episode = -1
        training_start_time = perf_counter()

        while episode_number < MAX_NUM_EPISODES:
            # Run the simulation and train the agent
            reward_sum += run_simulation(agent)
            episode_number += 1

            if episode_number % agent.brain.BATCH_SIZE == 0:
                t = perf_counter() - training_start_time
                print("(%d s) Episode: %d, Average reward = %f." % (t, episode_number, reward_sum / agent.brain.BATCH_SIZE))

                # A batch counts as solved when the average reward over the
                # last BATCH_SIZE episodes exceeds DONE_REWARD_LEVEL (200)
                if reward_sum / agent.brain.BATCH_SIZE > DONE_REWARD_LEVEL:
                    num_streaks += 1
                    solved_episode = episode_number
                else:
                    num_streaks = 0
                    solved_episode = -1

                # Stop once the solved condition holds for more than
                # STREAK_TO_END consecutive batches
                if num_streaks > STREAK_TO_END:
                    print("Task solved in %d episodes and repeated %d times." % (episode_number, num_streaks))
                    break

                reward_sum = 0

        agent.brain.model.save_model(os.path.join(TRAINED_MODEL_DIR, TRAINED_MODEL_NAME), False)

        if GYM_ENABLE_UPLOAD:
            env.monitor.close()
            gym.upload(GYM_VIDEO_PATH, api_key=GYM_API_KEY)

    # testing the model
    test(os.path.join(TRAINED_MODEL_DIR, TRAINED_MODEL_NAME), num_episodes=10)
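
# Usage sketch (assuming this file is saved as cart_pole_dqn.py):
#   python cart_pole_dqn.py            # train, save the model, then test it
#   python cart_pole_dqn.py test_only  # skip training, test a saved model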