import random
import numpy as np
import math
from time import perf_counter
import os
import sys
from collections import deque

import gym

import cntk
from cntk.layers import Dense
from cntk.models import Sequential, LayerStack
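# NOTE: this script targets the 2016-era CNTK 2.0 beta Python API (cntk.models.Sequential,
# LayerStack, cntk.adam_sgd); later CNTK releases renamed or moved several of these
# symbols, so the imports and calls may need small adjustments on newer versions.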

env = gym.make('CartPole-v0')

STATE_DIM = env.observation_space.shape[0]
NUM_ACTIONS = env.action_space.n
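# For CartPole-v0, STATE_DIM is 4 (cart position, cart velocity, pole angle, pole
# angular velocity) and NUM_ACTIONS is 2 (push the cart left or right).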

class Brain:
    BATCH_SIZE = 50

    def __init__(self):
        #### Construct the model ####
        observation = cntk.ops.input_variable(STATE_DIM, np.float32, name="s")
        q_target = cntk.ops.input_variable(NUM_ACTIONS, np.float32, name="q")

        # Define the structure of the neural network
        self.model = self.create_repeated_layers_neural_network(observation, NUM_ACTIONS, 2)

        #### Define the trainer ####
        self.learning_rate = 0.00025
        self.loss = cntk.ops.reduce_mean(cntk.ops.square(self.model - q_target), axis=0)
        mean_error = cntk.ops.reduce_mean(cntk.ops.square(self.model - q_target), axis=0)
        learner = cntk.adam_sgd(self.model.parameters, self.learning_rate / self.BATCH_SIZE, momentum=0.9)
        self.trainer = cntk.Trainer(self.model, self.loss, mean_error, learner)

    def train(self, x, y):
        # Bind the minibatch of target Q-values (y) and states (x) to the loss function's inputs
        data = dict(zip(self.loss.arguments, [y, x]))
        self.trainer.train_minibatch(data, outputs=[self.loss.output])

    def predict(self, s):
        return self.model.eval(s)

    @staticmethod
    def create_repeated_layers_neural_network(input_vars, out_dims, num_hidden_layers):
        input_dims = input_vars.shape[0]
        num_hidden_neurons = input_dims**3

        hidden_layer = lambda: Dense(num_hidden_neurons, activation=cntk.ops.relu)
        output_layer = Dense(out_dims, activation=None)

        model = Sequential([LayerStack(num_hidden_layers, hidden_layer),
                            output_layer])(input_vars)
        return model
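
# Memory is a simple FIFO experience-replay buffer: transitions are appended to a
# bounded deque, and once capacity is reached the oldest transitions are discarded.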
class Memory:
    def __init__(self, capacity):
        self.examplers = deque(maxlen=capacity)
        self.capacity = capacity

    def add(self, sample):
        self.examplers.append(sample)

    def get_random_samples(self, num_samples):
        num_samples = min(num_samples, len(self.examplers))
        return random.sample(tuple(self.examplers), num_samples)
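
# Agent follows an epsilon-greedy policy: with probability explore_rate it picks a
# random action, otherwise the action with the highest predicted Q-value. The
# exploration rate decays exponentially with the number of observed steps.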
class Agent:
    MEMORY_CAPACITY = 100000
    DISCOUNT_FACTOR = 0.99
    MAX_EXPLORATION_RATE = 1.0
    MIN_EXPLORATION_RATE = 0.01
    DECAY_RATE = 0.0001

    def __init__(self):
        self.explore_rate = self.MAX_EXPLORATION_RATE
        self.brain = Brain()
        self.memory = Memory(self.MEMORY_CAPACITY)
        self.steps = 0

    def act(self, s):
        if random.random() < self.explore_rate:
            return random.randint(0, NUM_ACTIONS - 1)
        else:
            return np.argmax(self.brain.predict(s))

    def observe(self, sample):
        self.steps += 1
        self.memory.add(sample)

        # Decay the exploration rate exponentially towards MIN_EXPLORATION_RATE
        self.explore_rate = self.MIN_EXPLORATION_RATE + (self.MAX_EXPLORATION_RATE - self.MIN_EXPLORATION_RATE) * math.exp(-self.DECAY_RATE * self.steps)
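
    # replay() trains on a random minibatch drawn from memory, using the standard
    # Q-learning target: reward for terminal transitions, otherwise
    # reward + DISCOUNT_FACTOR * max_a' Q(next_state, a').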
    def replay(self):
        batch = self.memory.get_random_samples(self.brain.BATCH_SIZE)
        batchLen = len(batch)

        states = np.array([sample[0] for sample in batch], dtype=np.float32)
        no_state = np.zeros(STATE_DIM)
        resultant_states = np.array([(no_state if sample[3] is None else sample[3]) for sample in batch], dtype=np.float32)

        q_values_batch = self.brain.predict(states)
        future_q_values_batch = self.brain.predict(resultant_states)

        x = np.zeros((batchLen, STATE_DIM)).astype(np.float32)
        y = np.zeros((batchLen, NUM_ACTIONS)).astype(np.float32)

        for i in range(batchLen):
            state, action, reward, resultant_state = batch[i]

            q_values = q_values_batch[0][i]
            if resultant_state is None:
                q_values[action] = reward
            else:
                q_values[action] = reward + self.DISCOUNT_FACTOR * np.amax(future_q_values_batch[0][i])

            x[i] = state
            y[i] = q_values

        self.brain.train(x, y)
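
# run_simulation() plays one episode, training the agent after every environment step,
# and ends the episode early once the accumulated reward exceeds solved_reward_level.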
def run_simulation(agent, solved_reward_level):
    state = env.reset()
    total_rewards = 0

    while True:
        # env.render()
        action = agent.act(state.astype(np.float32))
        resultant_state, reward, done, info = env.step(action)

        if done:  # terminal state
            resultant_state = None

        agent.observe((state, action, reward, resultant_state))
        agent.replay()

        state = resultant_state
        total_rewards += reward

        if total_rewards > solved_reward_level or done:
            return total_rewards
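
# test() loads the saved CNTK model and runs greedy (no-exploration) episodes,
# rendering when a display is available.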
def test(model_path, num_episodes=10):
    root = cntk.load_model(model_path)
    for episode in range(num_episodes):
        observation = env.reset()  # reset environment for new episode
        done = False
        while not done:
            try:
                env.render()
            except Exception:
                # this might fail on a VM without OpenGL
                pass

            action = np.argmax(root.eval(observation.astype(np.float32)))
            observation, reward, done, info = env.step(action)

if __name__ == "__main__":
    # Seed NumPy so results are reproducible
    np.random.seed(0)

    MAX_NUM_EPISODES = 3000
    STREAK_TO_END = 120
    DONE_REWARD_LEVEL = 196

    TRAINED_MODEL_DIR = os.path.join(os.getcwd(), "trained_models")
    if not os.path.exists(TRAINED_MODEL_DIR):
        os.makedirs(TRAINED_MODEL_DIR)
    TRAINED_MODEL_NAME = "cart_pole_dpn.mod"

    EPISODES_PER_PRINT_PROGRESS = 50

    if len(sys.argv) < 2 or sys.argv[1] != "test_only":
        agent = Agent()

        episode_number = 0
        num_streaks = 0
        reward_sum = 0
        solved_episode = -1

        training_start_time = perf_counter()

        while episode_number < MAX_NUM_EPISODES:
            # Run the simulation and train the agent
            reward = run_simulation(agent, DONE_REWARD_LEVEL * 2)
            reward_sum += reward

            episode_number += 1
            if episode_number % EPISODES_PER_PRINT_PROGRESS == 0:
                t = perf_counter() - training_start_time
                print("(%d s) Episode: %d, Average reward = %f." % (t, episode_number, reward_sum / EPISODES_PER_PRINT_PROGRESS))
                reward_sum = 0

            # An episode counts as solved when its total reward exceeds DONE_REWARD_LEVEL
            if reward > DONE_REWARD_LEVEL:
                num_streaks += 1
                solved_episode = episode_number
            else:
                num_streaks = 0
                solved_episode = -1

            # Stop training once the task has been solved more than STREAK_TO_END times in a row
            if num_streaks > STREAK_TO_END:
                print("Task solved in %d episodes and repeated %d times." % (episode_number, num_streaks))
                break

        agent.brain.model.save_model(os.path.join(TRAINED_MODEL_DIR, TRAINED_MODEL_NAME), False)

    # testing the model
    test(os.path.join(TRAINED_MODEL_DIR, TRAINED_MODEL_NAME), num_episodes=10)
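
Usage note: running the script with no arguments trains the agent, saves the network to trained_models/cart_pole_dpn.mod, and then evaluates it with test(); passing test_only as the first command-line argument skips training and only evaluates a previously saved model, which must already exist at that path.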
https://gym.openai.com/evaluations/eval_1HLi0RbyRHCBjn12WiIPeA