{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import gym\n",
"import ppaquette_gym_doom\n",
"from ppaquette_gym_doom.wrappers import SetResolution, ToDiscrete\n",
"from gym.wrappers import SkipWrapper\n",
"from gym import wrappers\n",
"\n",
"# Build the wrapped DoomBasic environment: render at 200x150, repeat each action\n",
"# for 4 frames and reduce the action space to the 4 'minimal' actions\n",
"# (see https://github.com/ppaquette/gym-doom/blob/master/ppaquette_gym_doom/doom_basic.py)\n",
"def create_env(seed=None):\n",
"    env_spec = gym.spec('ppaquette/DoomBasic-v0')\n",
"    env_spec.id = 'DoomBasic-v0'\n",
"    env = env_spec.make()\n",
"\n",
"    if seed is not None:\n",
"        env.seed(seed)\n",
"\n",
"    return SetResolution('200x150')(\n",
"        SkipWrapper(repeat_count=4)(\n",
"            ToDiscrete('minimal')(env)))\n",
"\n",
"env = create_env()\n",
"WIDTH, HEIGHT = env.screen_width, env.screen_height\n",
"\n",
"NOOP, SHOOT, RIGHT, LEFT = 0, 1, 2, 3"
]
},
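{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the wrapper stack (a minimal sketch, assuming the gym-doom / VizDoom dependencies above are installed): roll out a few random actions and confirm that observations have shape `(HEIGHT, WIDTH, 3)`, which is the input shape the Q-network below expects."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Minimal sketch: a short random rollout to check the wrapped environment.\n",
"check_env = create_env(seed=0)\n",
"obs = check_env.reset()\n",
"assert obs.shape == (HEIGHT, WIDTH, 3), obs.shape\n",
"\n",
"total_reward = 0\n",
"for _ in range(10):\n",
"    obs, reward, done, _ = check_env.step(check_env.action_space.sample())\n",
"    total_reward += reward\n",
"    if done:\n",
"        break\n",
"check_env.close()\n",
"print('random rollout reward: {}'.format(total_reward))"
]
},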
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collecting experiences\n",
"\n",
"Roll out the agent and record one `(state, action, reward, end)` tuple per step; consecutive steps are then paired into 1-step transitions, keeping every positive-reward transition plus a random sample of the rest."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import random\n",
"from collections import namedtuple\n",
"import operator\n",
"\n",
"SARE = namedtuple('SARE', ['state', 'action', 'reward', 'end'])\n",
"\n",
"\n",
"def generate_sares(env, agent, episode_count=100):\n",
"    reward = 0\n",
"    done = False\n",
"\n",
"    for i in range(episode_count):\n",
"        observation = env.reset()\n",
"        while True:\n",
"            action = agent.act(observation, reward, done)\n",
"            new_observation, reward, done, _ = env.step(action)\n",
"            yield SARE(observation, action, reward, done)\n",
"\n",
"            if done:\n",
"                break\n",
"            else:\n",
"                observation = new_observation\n",
"\n",
"\n",
"def episode_sares(env, agent, episode_count=100):\n",
"    sares = list(generate_sares(env, agent, episode_count))\n",
"    print('average reward per episode = {}'.format(\n",
"        sum(r for _, _, r, _ in sares) / float(sum(e for _, _, _, e in sares))))\n",
"    return sares\n",
"\n",
"\n",
"def to_experiences(sares, only_n_misses=100):\n",
"    experiences = [\n",
"        (previous_s, a, r, next_s, end)\n",
"        for (previous_s, a, r, end), (next_s, _, _, _) in zip(sares[:-1], sares[1:])\n",
"    ]\n",
"\n",
"    # simplistic experience prioritization: a random sample of transitions\n",
"    # plus every positive-reward one\n",
"    shuffled_exps = experiences if only_n_misses is None \\\n",
"        else random.choices(experiences, k=only_n_misses) + [e for e in experiences if e[2] > 0]\n",
"    random.shuffle(shuffled_exps)\n",
"\n",
"    prev_frames, actions, rewards, next_frames, is_ends = zip(*shuffled_exps)\n",
"    prev_frames = np.asarray(prev_frames)\n",
"    next_frames = np.asarray(next_frames)\n",
"    actions = np.asarray(actions)\n",
"    rewards = np.asarray(rewards)\n",
"    is_ends = np.asarray(is_ends)\n",
"\n",
"    print('Training on {}/{} positive/total out of {} 1-step experiences with actions distribution {}'.format(\n",
"        np.sum(rewards >= 0),\n",
"        len(rewards),\n",
"        len(experiences),\n",
"        np.bincount(actions)))\n",
"\n",
"    return (prev_frames, next_frames, actions, rewards, is_ends)"
]
},
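{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see what `to_experiences` returns without launching Doom, here is a minimal sketch on a handful of synthetic `SARE` tuples (tiny random frames and made-up rewards): consecutive observations get paired into `(s, a, r, s', end)` transitions before being shuffled."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Minimal sketch on synthetic data: tiny fake frames and made-up rewards,\n",
"# only to illustrate how consecutive SAREs are paired into transitions.\n",
"fake_sares = [\n",
"    SARE(np.random.rand(4, 4, 3), action, reward, end)\n",
"    for action, reward, end in [\n",
"        (SHOOT, -1, False), (LEFT, -1, False), (SHOOT, 100, True),\n",
"        (NOOP, -1, False), (RIGHT, -1, False), (SHOOT, -5, True),\n",
"    ]\n",
"]\n",
"\n",
"prev_f, next_f, acts, rews, ends = to_experiences(fake_sares, only_n_misses=3)\n",
"print(prev_f.shape, next_f.shape, acts, rews, ends)"
]
},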
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deep Q-learning\n",
"\n",
"A small convolutional network maps a screen frame to one estimated Q-value per action. A second copy of the network is kept as the target network and only refreshed every few batches."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"from keras import backend as K\n",
"\n",
"from keras.layers import Dense, Convolution2D, Flatten, Activation\n",
"from keras.models import Sequential\n",
"from keras.optimizers import Adam\n",
"\n",
"sess = tf.InteractiveSession()\n",
"K.set_session(sess)\n",
"\n",
"# Two small convolutions followed by a dense head; the 4 outputs are the\n",
"# estimated Q-values of NOOP, SHOOT, RIGHT and LEFT for the input frame.\n",
"def create_q_model(conv1_weights=None, conv2_weights=None, dense1_weights=None, dense2_weights=None):\n",
"    model = Sequential()\n",
"\n",
"    model.add(Convolution2D(\n",
"        2, nb_row=6, nb_col=6, border_mode='valid', weights=conv1_weights,\n",
"        input_shape=[HEIGHT, WIDTH, 3], dim_ordering='tf'))\n",
"    model.add(Activation('relu'))\n",
"    model.add(Convolution2D(4, nb_row=2, nb_col=2, weights=conv2_weights))\n",
"    model.add(Activation('relu'))\n",
"    model.add(Flatten())\n",
"    model.add(Dense(64, init='normal', weights=dense1_weights))\n",
"    model.add(Activation('relu'))\n",
"    model.add(Dense(4, init='normal', weights=dense2_weights))\n",
"    model.compile(loss='mse', optimizer=Adam())\n",
"\n",
"    return model\n",
"\n",
"acting_model = create_q_model()\n",
"target_model = create_q_model()\n",
"\n",
"# Rebuild a fresh model from the current weights; used to refresh the frozen target network.\n",
"def copy_model(model):\n",
"    conv1_weights = [w.eval() for w in model.layers[0].weights]\n",
"    conv2_weights = [w.eval() for w in model.layers[2].weights]\n",
"    dense1_weights = [w.eval() for w in model.layers[5].weights]\n",
"    dense2_weights = [w.eval() for w in model.layers[7].weights]\n",
"    return create_q_model(conv1_weights, conv2_weights, dense1_weights, dense2_weights)"
]
},
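{
"cell_type": "markdown",
"metadata": {},
"source": [
"For each sampled transition $(s, a, r, s', \\mathrm{end})$, the function below builds the Q-learning target\n",
"\n",
"$$y = \\min(r, r_{\\mathrm{clip}}) + \\gamma \\, (1 - \\mathrm{end}) \\, \\max_{a'} Q_{\\mathrm{target}}(s', a')$$\n",
"\n",
"and writes it into the target network's prediction vector at the taken action, so the acting network is fitted with an MSE loss whose signal mostly comes from the action that was actually played."
]
},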
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def sares_to_input_targets(model, sares, gamma=.99, reward_clip=5, only_n_misses=100):\n",
"\n",
"    prev_frames, next_frames, actions, rewards, is_ends = to_experiences(sares, only_n_misses)\n",
"\n",
"    n_samples = len(actions)\n",
"    clipped_rewards = np.clip(rewards, -np.inf, reward_clip)\n",
"\n",
"    # Transcription of the Q-learning target formula\n",
"    targets = clipped_rewards + gamma * (1 - is_ends) * model.predict(next_frames).max(axis=1)\n",
"\n",
"    target_action_rewards = model.predict(prev_frames)\n",
"    target_action_rewards[np.arange(n_samples), actions] = targets\n",
"\n",
"    return prev_frames, target_action_rewards"
]
},
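{
"cell_type": "markdown",
"metadata": {},
"source": [
"The only subtle line is the fancy indexing that overwrites the prediction for the taken action while leaving the other entries at the target network's values; a minimal sketch with made-up numbers:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch with made-up numbers: 3 samples, 4 actions.\n",
"preds = np.array([[0.1, 0.2, 0.3, 0.4],\n",
"                  [1.0, 1.0, 1.0, 1.0],\n",
"                  [0.0, -1.0, 2.0, 3.0]])\n",
"ys = np.array([5.0, -0.4, 0.9])\n",
"taken_actions = np.array([SHOOT, LEFT, NOOP])\n",
"\n",
"preds[np.arange(3), taken_actions] = ys\n",
"print(preds)\n",
"# Only (0, SHOOT), (1, LEFT) and (2, NOOP) changed; every other entry is untouched."
]
},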
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training\n",
"\n",
"Alternate between collecting ε-greedy episodes with the acting network and fitting it on targets computed from the frozen target network, which is refreshed every `UPDATE_TARGET_EVERY_N_BATCHES` batches; a `Monitor` wrapper records videos and stats under `tmp/q_learning`."
]
},
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[2017-03-04 11:44:01,389] DEPRECATION WARNING: env.spec.timestep_limit has been deprecated. Replace your call to `env.spec.timestep_limit` with `env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')`. This change was made 12/28/2016 and is included in version 0.7.0\n", | |
"[2017-03-04 11:44:01,390] Clearing 27 monitor files from previous run (because force=True was provided)\n", | |
"[2017-03-04 11:44:01,764] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000000.mp4\n", | |
"[2017-03-04 11:44:02,552] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000001.mp4\n", | |
"[2017-03-04 11:44:05,313] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000008.mp4\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"average reward per episode = -295.0\n", | |
"Training on 4/104 positive/total out of 499 1-step experiences with actions distribution [ 2 91 5 6]\n", | |
"average reward per episode = -173.8\n", | |
"Training on 6/106 positive/total out of 349 1-step experiences with actions distribution [ 4 96 5 1]\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[2017-03-04 11:44:17,532] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000027.mp4\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"average reward per episode = -301.7\n", | |
"Training on 3/103 positive/total out of 507 1-step experiences with actions distribution [ 4 95 1 3]\n", | |
"average reward per episode = -186.5\n", | |
"Training on 5/105 positive/total out of 369 1-step experiences with actions distribution [ 2 99 2 2]\n", | |
"average reward per episode = -344.1\n", | |
"Training on 3/103 positive/total out of 572 1-step experiences with actions distribution [ 3 95 2 3]\n", | |
"average reward per episode = -283.2\n", | |
"Training on 4/104 positive/total out of 480 1-step experiences with actions distribution [ 4 96 2 2]\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[2017-03-04 11:44:42,014] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000064.mp4\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"average reward per episode = -212.8\n", | |
"Training on 5/104 positive/total out of 393 1-step experiences with actions distribution [ 2 97 3 2]\n", | |
"average reward per episode = -238.4\n", | |
"Training on 6/105 positive/total out of 432 1-step experiences with actions distribution [ 4 96 1 4]\n", | |
"average reward per episode = -282.7\n", | |
"Training on 4/104 positive/total out of 478 1-step experiences with actions distribution [ 1 99 2 2]\n", | |
"average reward per episode = -243.1\n", | |
"Training on 5/105 positive/total out of 437 1-step experiences with actions distribution [ 0 100 2 3]\n", | |
"average reward per episode = -326.9\n", | |
"Training on 4/103 positive/total out of 546 1-step experiences with actions distribution [ 1 98 2 2]\n", | |
"average reward per episode = -173.7\n", | |
"Training on 7/105 positive/total out of 349 1-step experiences with actions distribution [ 4 98 2 1]\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[2017-03-04 11:45:20,178] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000125.mp4\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"average reward per episode = -328.8\n", | |
"Training on 4/104 positive/total out of 547 1-step experiences with actions distribution [ 2 96 3 3]\n", | |
"average reward per episode = -319.3\n", | |
"Training on 4/103 positive/total out of 533 1-step experiences with actions distribution [ 1 97 2 3]\n", | |
"average reward per episode = -284.6\n", | |
"Training on 4/104 positive/total out of 483 1-step experiences with actions distribution [ 1 99 0 4]\n", | |
"average reward per episode = -250.8\n", | |
"Training on 5/104 positive/total out of 432 1-step experiences with actions distribution [ 2 99 2 1]\n", | |
"average reward per episode = -362.6\n", | |
"Training on 2/102 positive/total out of 567 1-step experiences with actions distribution [ 2 96 4]\n", | |
"average reward per episode = -117.1\n", | |
"Training on 7/107 positive/total out of 282 1-step experiences with actions distribution [ 0 98 2 7]\n", | |
"average reward per episode = -343.6\n", | |
"Training on 2/102 positive/total out of 555 1-step experiences with actions distribution [ 3 96 2 1]\n", | |
"average reward per episode = -226.5\n", | |
"Training on 5/105 positive/total out of 412 1-step experiences with actions distribution [ 2 98 2 3]\n", | |
"average reward per episode = -333.1\n", | |
"Training on 3/103 positive/total out of 540 1-step experiences with actions distribution [ 2 92 5 4]\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[2017-03-04 11:46:19,153] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000216.mp4\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"average reward per episode = -250.3\n", | |
"Training on 6/105 positive/total out of 447 1-step experiences with actions distribution [ 2 93 8 2]\n", | |
"average reward per episode = -281.4\n", | |
"Training on 3/103 positive/total out of 480 1-step experiences with actions distribution [ 6 93 3 1]\n", | |
"average reward per episode = -297.9\n", | |
"Training on 4/104 positive/total out of 502 1-step experiences with actions distribution [ 4 87 10 3]\n", | |
"average reward per episode = -415.4\n", | |
"Training on 1/101 positive/total out of 630 1-step experiences with actions distribution [ 1 92 7 1]\n", | |
"average reward per episode = -394.4\n", | |
"Training on 1/101 positive/total out of 615 1-step experiences with actions distribution [ 5 93 0 3]\n", | |
"average reward per episode = -200.4\n", | |
"Training on 7/105 positive/total out of 373 1-step experiences with actions distribution [ 5 91 3 6]\n", | |
"average reward per episode = -87.6\n", | |
"Training on 7/106 positive/total out of 237 1-step experiences with actions distribution [ 4 101 0 1]\n", | |
"average reward per episode = -255.3\n", | |
"Training on 5/104 positive/total out of 440 1-step experiences with actions distribution [ 3 95 3 3]\n", | |
"average reward per episode = -301.7\n", | |
"Training on 3/102 positive/total out of 492 1-step experiences with actions distribution [ 0 98 2 2]\n", | |
"average reward per episode = -170.0\n", | |
"Training on 6/106 positive/total out of 342 1-step experiences with actions distribution [ 0 99 3 4]\n", | |
"average reward per episode = -294.0\n", | |
"Training on 4/104 positive/total out of 513 1-step experiences with actions distribution [ 3 93 4 4]\n", | |
"average reward per episode = -343.5\n", | |
"Training on 4/103 positive/total out of 557 1-step experiences with actions distribution [ 6 94 1 2]\n", | |
"average reward per episode = -171.3\n", | |
"Training on 8/105 positive/total out of 347 1-step experiences with actions distribution [ 5 91 6 3]\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[2017-03-04 11:47:45,261] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000343.mp4\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"average reward per episode = -155.1\n", | |
"Training on 11/107 positive/total out of 340 1-step experiences with actions distribution [ 2 102 2 1]\n", | |
"average reward per episode = -388.8\n", | |
"Training on 2/101 positive/total out of 604 1-step experiences with actions distribution [ 2 95 2 2]\n", | |
"average reward per episode = -403.0\n", | |
"Training on 1/101 positive/total out of 628 1-step experiences with actions distribution [ 4 91 2 4]\n", | |
"average reward per episode = -331.6\n", | |
"Training on 2/102 positive/total out of 537 1-step experiences with actions distribution [ 2 97 1 2]\n", | |
"average reward per episode = -425.8\n", | |
"Training on 4/102 positive/total out of 661 1-step experiences with actions distribution [ 4 94 3 1]\n", | |
"average reward per episode = -205.2\n", | |
"Training on 8/106 positive/total out of 396 1-step experiences with actions distribution [ 5 98 1 2]\n", | |
"average reward per episode = -157.5\n", | |
"Training on 5/105 positive/total out of 324 1-step experiences with actions distribution [ 3 96 2 4]\n", | |
"average reward per episode = -328.3\n", | |
"Training on 4/103 positive/total out of 547 1-step experiences with actions distribution [ 2 95 2 4]\n", | |
"average reward per episode = -378.3\n", | |
"Training on 3/103 positive/total out of 608 1-step experiences with actions distribution [ 6 94 1 2]\n", | |
"average reward per episode = -328.8\n", | |
"Training on 4/104 positive/total out of 550 1-step experiences with actions distribution [ 3 97 3 1]\n", | |
"average reward per episode = -111.7\n", | |
"Training on 7/106 positive/total out of 273 1-step experiences with actions distribution [ 3 96 5 2]\n", | |
"average reward per episode = -227.4\n", | |
"Training on 5/105 positive/total out of 414 1-step experiences with actions distribution [ 0 96 4 5]\n", | |
"average reward per episode = -102.0\n", | |
"Training on 12/106 positive/total out of 258 1-step experiences with actions distribution [ 2 100 4]\n", | |
"average reward per episode = -255.5\n", | |
"Training on 4/104 positive/total out of 440 1-step experiences with actions distribution [ 4 94 3 3]\n", | |
"average reward per episode = -304.3\n", | |
"Training on 4/104 positive/total out of 514 1-step experiences with actions distribution [ 5 95 0 4]\n", | |
"average reward per episode = -330.7\n", | |
"Training on 4/104 positive/total out of 550 1-step experiences with actions distribution [ 4 92 6 2]\n", | |
"average reward per episode = -180.4\n", | |
"Training on 6/105 positive/total out of 360 1-step experiences with actions distribution [ 1 98 2 4]\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[2017-03-04 11:49:41,426] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000512.mp4\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"average reward per episode = -396.8\n", | |
"Training on 4/103 positive/total out of 634 1-step experiences with actions distribution [ 1 100 2]\n", | |
"average reward per episode = -230.3\n", | |
"Training on 6/104 positive/total out of 419 1-step experiences with actions distribution [ 1 97 4 2]\n", | |
"average reward per episode = -331.2\n", | |
"Training on 3/103 positive/total out of 536 1-step experiences with actions distribution [ 3 94 4 2]\n", | |
"average reward per episode = -232.8\n", | |
"Training on 5/104 positive/total out of 421 1-step experiences with actions distribution [ 2 99 3]\n", | |
"average reward per episode = -93.8\n", | |
"Training on 6/106 positive/total out of 247 1-step experiences with actions distribution [ 2 98 3 3]\n", | |
"average reward per episode = -82.3\n", | |
"Training on 13/107 positive/total out of 244 1-step experiences with actions distribution [ 2 104 0 1]\n", | |
"average reward per episode = -81.9\n", | |
"Training on 8/107 positive/total out of 246 1-step experiences with actions distribution [ 2 100 3 2]\n", | |
"average reward per episode = -286.0\n", | |
"Training on 3/103 positive/total out of 485 1-step experiences with actions distribution [ 1 98 2 2]\n", | |
"average reward per episode = -206.0\n", | |
"Training on 7/106 positive/total out of 398 1-step experiences with actions distribution [ 0 102 1 3]\n", | |
"average reward per episode = -149.5\n", | |
"Training on 6/106 positive/total out of 330 1-step experiences with actions distribution [ 5 93 5 3]\n", | |
"average reward per episode = -276.5\n", | |
"Training on 5/104 positive/total out of 469 1-step experiences with actions distribution [ 1 101 0 2]\n", | |
"average reward per episode = -196.6\n", | |
"Training on 7/105 positive/total out of 367 1-step experiences with actions distribution [ 3 98 1 3]\n", | |
"average reward per episode = -192.5\n", | |
"Training on 9/107 positive/total out of 395 1-step experiences with actions distribution [ 2 100 3 2]\n", | |
"average reward per episode = -286.9\n", | |
"Training on 6/104 positive/total out of 486 1-step experiences with actions distribution [ 2 101 1]\n", | |
"average reward per episode = -303.1\n", | |
"Training on 6/104 positive/total out of 511 1-step experiences with actions distribution [ 3 94 2 5]\n", | |
"average reward per episode = -338.8\n", | |
"Training on 5/103 positive/total out of 551 1-step experiences with actions distribution [ 2 95 4 2]\n", | |
"average reward per episode = -229.3\n", | |
"Training on 5/105 positive/total out of 433 1-step experiences with actions distribution [ 4 95 1 5]\n", | |
"average reward per episode = -149.4\n", | |
"Training on 12/106 positive/total out of 331 1-step experiences with actions distribution [ 3 96 1 6]\n", | |
"average reward per episode = -216.5\n", | |
"Training on 5/104 positive/total out of 398 1-step experiences with actions distribution [ 2 97 2 3]\n", | |
"average reward per episode = -311.6\n", | |
"Training on 4/104 positive/total out of 541 1-step experiences with actions distribution [ 4 94 4 2]\n", | |
"average reward per episode = -309.3\n", | |
"Training on 3/102 positive/total out of 504 1-step experiences with actions distribution [ 2 97 2 1]\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[2017-03-04 11:52:07,127] Starting new video recorder writing to /Users/gui/Dev/rl-study/tmp/q_learning/openaigym.video.0.48905.video000729.mp4\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"average reward per episode = -343.0\n", | |
"Training on 3/103 positive/total out of 555 1-step experiences with actions distribution [ 3 91 2 7]\n", | |
"average reward per episode = -357.0\n", | |
"Training on 0/100 positive/total out of 699 1-step experiences with actions distribution [93 2 3 2]\n", | |
"average reward per episode = -316.3\n", | |
"Training on 2/102 positive/total out of 659 1-step experiences with actions distribution [92 4 4 2]\n", | |
"average reward per episode = -358.5\n", | |
"Training on 0/100 positive/total out of 699 1-step experiences with actions distribution [95 0 1 4]\n" | |
] | |
} | |
],
"source": [
"import random\n",
"import matplotlib.pyplot as plt\n",
"\n",
"class EpsilonGreedyQAgent(object):\n",
"    def __init__(self, model, epsilon=.1):\n",
"        self.model = model\n",
"        self.epsilon = epsilon\n",
"\n",
"    def act(self, observation, reward, done):\n",
"        if random.uniform(0, 1) <= self.epsilon:\n",
"            return random.choice([NOOP, SHOOT, LEFT, RIGHT])\n",
"        else:\n",
"            return self.model.predict(observation[np.newaxis])[0].argmax()\n",
"\n",
"\n",
"N_BATCHES = 100\n",
"N_BATCHED_EPISODES = 10\n",
"UPDATE_TARGET_EVERY_N_BATCHES = 2\n",
"MINI_BATCH_SIZE = 32\n",
"REWARD_CLIP = 5\n",
"ONLY_N_MISSES = 200\n",
"\n",
"env = create_env()\n",
"env = wrappers.Monitor(env, directory='tmp/q_learning', force=True, mode='training')\n",
"\n",
"for _ in range(N_BATCHES):\n",
"    for _ in range(UPDATE_TARGET_EVERY_N_BATCHES):\n",
"        sares = episode_sares(env, EpsilonGreedyQAgent(acting_model, epsilon=.1), N_BATCHED_EPISODES)\n",
"        prev_frames, target_action_rewards = sares_to_input_targets(target_model, sares, reward_clip=REWARD_CLIP, only_n_misses=ONLY_N_MISSES)\n",
"        acting_model.fit(x=prev_frames, y=target_action_rewards, batch_size=MINI_BATCH_SIZE, nb_epoch=1, verbose=0)\n",
"\n",
"    target_model = copy_model(acting_model)\n",
"\n",
"\n",
"# final greedy episodes\n",
"sares = episode_sares(env, EpsilonGreedyQAgent(acting_model, epsilon=0), episode_count=1000)\n",
"\n",
"plt.plot(np.cumsum(list(map(operator.attrgetter('reward'), sares))));\n",
"plt.xlabel('steps'); plt.ylabel('Cumulative reward');\n",
"\n",
"env.close()\n",
"gym.upload('tmp/q_learning', api_key='sk_bNZUvCfkTfabQCoKoKbjFA')"
]
},
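{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cumulative-reward plot in the previous cell mixes all the greedy episodes together; the sketch below (reusing the same `sares` list) splits them back into per-episode totals."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: split the flat SARE stream back into per-episode reward totals\n",
"# (an episode ends wherever the end flag is set).\n",
"episode_rewards = []\n",
"current = 0\n",
"for _, _, r, end in sares:\n",
"    current += r\n",
"    if end:\n",
"        episode_rewards.append(current)\n",
"        current = 0\n",
"\n",
"plt.hist(episode_rewards, bins=20);\n",
"plt.xlabel('total reward per greedy episode');"
]
},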
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import json\n",
"model_name = 'DoomBasic-v0_res=200x150_skip=4_discrete=minimal_fc64'\n",
"\n",
"acting_model.save(model_name + '.h5')\n",
"\n",
"with open(model_name + '.json', 'w+') as f:\n",
"    json.dump(acting_model.to_json(), f)"
]
},
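{
"cell_type": "markdown",
"metadata": {},
"source": [
"To reuse the trained network later, the saved `.h5` file can be loaded back; a minimal sketch, assuming the same Keras version is available."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sketch: reload the saved weights and replay one greedy episode with them.\n",
"from keras.models import load_model\n",
"\n",
"reloaded_model = load_model(model_name + '.h5')\n",
"replay_env = create_env(seed=0)\n",
"_ = episode_sares(replay_env, EpsilonGreedyQAgent(reloaded_model, epsilon=0), episode_count=1)\n",
"replay_env.close()"
]
}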
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}