{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Simple Reinforcement Learning with Tensorflow Part 4: Deep Q-Networks and Beyond\n",
"\n",
"In this IPython notebook I implement a Deep Q-Network using both Double DQN and Dueling DQN. The agent learns to solve a navigation task in a basic grid world. To learn more, read here: https://medium.com/p/8438a3e2b8df\n",
"\n",
"For more reinforcement learning tutorials, as well as the required gridworld.py file, see:\n",
"https://github.com/awjuliani/DeepRL-Agents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import gym\n",
"import numpy as np\n",
"import random\n",
"import tensorflow as tf\n",
"import matplotlib.pyplot as plt\n",
"import scipy.misc\n",
"import os\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the game environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Feel free to adjust the size of the gridworld. Making it smaller provides an easier task for our DQN agent, while making the world larger increases the challenge."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"from gridworld import gameEnv\n",
"\n",
"env = gameEnv(partial=False,size=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Above is an example of a starting environment in our simple game. The agent controls the blue square, and can move up, down, left, or right. The goal is to move to the green square (for +1 reward) and avoid the red square (for -1 reward). The position of the three blocks is randomized every episode."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Implementing the network itself"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"class Qnetwork():\n", | |
" def __init__(self,h_size):\n", | |
" #The network recieves a frame from the game, flattened into an array.\n", | |
" #It then resizes it and processes it through four convolutional layers.\n", | |
" self.scalarInput = tf.placeholder(shape=[None,21168],dtype=tf.float32)\n", | |
" self.imageIn = tf.reshape(self.scalarInput,shape=[-1,84,84,3])\n", | |
" self.conv1 = tf.contrib.layers.convolution2d( \\\n", | |
" inputs=self.imageIn,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', biases_initializer=None)\n", | |
" self.conv2 = tf.contrib.layers.convolution2d( \\\n", | |
" inputs=self.conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', biases_initializer=None)\n", | |
" self.conv3 = tf.contrib.layers.convolution2d( \\\n", | |
" inputs=self.conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', biases_initializer=None)\n", | |
" self.conv4 = tf.contrib.layers.convolution2d( \\\n", | |
" inputs=self.conv3,num_outputs=512,kernel_size=[7,7],stride=[1,1],padding='VALID', biases_initializer=None)\n", | |
" \n", | |
" #We take the output from the final convolutional layer and split it into separate advantage and value streams.\n", | |
" self.streamAC,self.streamVC = tf.split(3,2,self.conv4)\n", | |
" self.streamA = tf.contrib.layers.flatten(self.streamAC)\n", | |
" self.streamV = tf.contrib.layers.flatten(self.streamVC)\n", | |
" self.AW = tf.Variable(tf.random_normal([h_size/2,env.actions]))\n", | |
" self.VW = tf.Variable(tf.random_normal([h_size/2,1]))\n", | |
" self.Advantage = tf.matmul(self.streamA,self.AW)\n", | |
" self.Value = tf.matmul(self.streamV,self.VW)\n", | |
" \n", | |
" #Then combine them together to get our final Q-values.\n", | |
" self.Qout = self.Value + tf.sub(self.Advantage,tf.reduce_mean(self.Advantage,reduction_indices=1,keep_dims=True))\n", | |
" self.predict = tf.argmax(self.Qout,1)\n", | |
" \n", | |
" #Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.\n", | |
" self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)\n", | |
" self.actions = tf.placeholder(shape=[None],dtype=tf.int32)\n", | |
" self.actions_onehot = tf.one_hot(self.actions,env.actions,dtype=tf.float32)\n", | |
" \n", | |
" self.Q = tf.reduce_sum(tf.mul(self.Qout, self.actions_onehot), reduction_indices=1)\n", | |
" \n", | |
" self.td_error = tf.square(self.targetQ - self.Q)\n", | |
" self.loss = tf.reduce_mean(self.td_error)\n", | |
" self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)\n", | |
" self.updateModel = self.trainer.minimize(self.loss)" | |
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Experience Replay"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This class allows us to store experiences and sample them randomly to train the network."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class experience_buffer():\n",
"    def __init__(self, buffer_size = 50000):\n",
"        self.buffer = []\n",
"        self.buffer_size = buffer_size\n",
"\n",
"    def add(self,experience):\n",
"        if len(self.buffer) + len(experience) >= self.buffer_size:\n",
"            self.buffer[0:(len(experience)+len(self.buffer))-self.buffer_size] = []\n",
"        self.buffer.extend(experience)\n",
"\n",
"    def sample(self,size):\n",
"        return np.reshape(np.array(random.sample(self.buffer,size)),[size,5])"
]
},
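{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sanity check (an illustrative sketch with dummy data, not part of training): each stored experience is a row of [s, a, r, s1, d], and sample returns a [size, 5] array whose columns the training loop below indexes as trainBatch[:,0] through trainBatch[:,4]."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Quick check of the buffer layout, using dummy experiences.\n",
"example_buffer = experience_buffer(buffer_size=10)\n",
"for _ in range(5):\n",
"    s_dummy = np.zeros(21168)\n",
"    example_buffer.add(np.reshape(np.array([s_dummy,0,0.0,s_dummy,False]),[1,5]))\n",
"print example_buffer.sample(3).shape #Should be (3, 5)"
]
},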
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a simple function to resize our game frames."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def processState(states):\n",
"    return np.reshape(states,[21168])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"These functions allow us to update the parameters of our target network with those of the primary network."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def updateTargetGraph(tfVars,tau):\n",
"    total_vars = len(tfVars)\n",
"    op_holder = []\n",
"    for idx,var in enumerate(tfVars[0:total_vars/2]):\n",
"        op_holder.append(tfVars[idx+total_vars/2].assign((var.value()*tau) + ((1-tau)*tfVars[idx+total_vars/2].value())))\n",
"    return op_holder\n",
"\n",
"def updateTarget(op_holder,sess):\n",
"    for op in op_holder:\n",
"        sess.run(op)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training the network"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Setting all the training parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"batch_size = 32 #How many experiences to use for each training step.\n",
"update_freq = 4 #How often to perform a training step.\n",
"y = .99 #Discount factor on the target Q-values\n",
"startE = 1 #Starting chance of random action\n",
"endE = 0.1 #Final chance of random action\n",
"anneling_steps = 10000. #How many steps of training to reduce startE to endE.\n",
"num_episodes = 10000 #How many episodes of game environment to train network with.\n",
"pre_train_steps = 10000 #How many steps of random actions before training begins.\n",
"max_epLength = 50 #The max allowed length of our episode.\n",
"load_model = False #Whether to load a saved model.\n",
"path = \"./dqn\" #The path to save our model to.\n",
"h_size = 512 #The size of the final convolutional layer before splitting it into Advantage and Value streams.\n",
"tau = 0.001 #Rate to update target network toward primary network"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"tf.reset_default_graph()\n", | |
"mainQN = Qnetwork(h_size)\n", | |
"targetQN = Qnetwork(h_size)\n", | |
"\n", | |
"init = tf.initialize_all_variables()\n", | |
"\n", | |
"saver = tf.train.Saver()\n", | |
"\n", | |
"trainables = tf.trainable_variables()\n", | |
"\n", | |
"targetOps = updateTargetGraph(trainables,tau)\n", | |
"\n", | |
"myBuffer = experience_buffer()\n", | |
"\n", | |
"#Set the rate of random action decrease. \n", | |
"e = startE\n", | |
"stepDrop = (startE - endE)/anneling_steps\n", | |
"\n", | |
"#create lists to contain total rewards and steps per episode\n", | |
"jList = []\n", | |
"rList = []\n", | |
"total_steps = 0\n", | |
"\n", | |
"#Make a path for our model to be saved in.\n", | |
"if not os.path.exists(path):\n", | |
" os.makedirs(path)\n", | |
"\n", | |
"with tf.Session() as sess:\n", | |
" if load_model == True:\n", | |
" print 'Loading Model...'\n", | |
" ckpt = tf.train.get_checkpoint_state(path)\n", | |
" saver.restore(sess,ckpt.model_checkpoint_path)\n", | |
" sess.run(init)\n", | |
" updateTarget(targetOps,sess) #Set the target network to be equal to the primary network.\n", | |
" for i in range(num_episodes):\n", | |
" episodeBuffer = experience_buffer()\n", | |
" #Reset environment and get first new observation\n", | |
" s = env.reset()\n", | |
" s = processState(s)\n", | |
" d = False\n", | |
" rAll = 0\n", | |
" j = 0\n", | |
" #The Q-Network\n", | |
" while j < max_epLength: #If the agent takes longer than 200 moves to reach either of the blocks, end the trial.\n", | |
" j+=1\n", | |
" #Choose an action by greedily (with e chance of random action) from the Q-network\n", | |
" if np.random.rand(1) < e or total_steps < pre_train_steps:\n", | |
" a = np.random.randint(0,4)\n", | |
" else:\n", | |
" a = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:[s]})[0]\n", | |
" s1,r,d = env.step(a)\n", | |
" s1 = processState(s1)\n", | |
" total_steps += 1\n", | |
" episodeBuffer.add(np.reshape(np.array([s,a,r,s1,d]),[1,5])) #Save the experience to our episode buffer.\n", | |
" \n", | |
" if total_steps > pre_train_steps:\n", | |
" if e > endE:\n", | |
" e -= stepDrop\n", | |
" \n", | |
" if total_steps % (update_freq) == 0:\n", | |
" trainBatch = myBuffer.sample(batch_size) #Get a random batch of experiences.\n", | |
" #Below we perform the Double-DQN update to the target Q-values\n", | |
" Q1 = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,3])})\n", | |
" Q2 = sess.run(targetQN.Qout,feed_dict={targetQN.scalarInput:np.vstack(trainBatch[:,3])})\n", | |
" end_multiplier = -(trainBatch[:,4] - 1)\n", | |
" doubleQ = Q2[range(batch_size),Q1]\n", | |
" targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)\n", | |
" #Update the network with our target values.\n", | |
" _ = sess.run(mainQN.updateModel, \\\n", | |
" feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,0]),mainQN.targetQ:targetQ, mainQN.actions:trainBatch[:,1]})\n", | |
" \n", | |
" updateTarget(targetOps,sess) #Set the target network to be equal to the primary network.\n", | |
" rAll += r\n", | |
" s = s1\n", | |
" \n", | |
" if d == True:\n", | |
"\n", | |
" break\n", | |
" \n", | |
" myBuffer.add(episodeBuffer.buffer)\n", | |
" jList.append(j)\n", | |
" rList.append(rAll)\n", | |
" #Periodically save the model. \n", | |
" if i % 1000 == 0:\n", | |
" saver.save(sess,path+'/model-'+str(i)+'.cptk')\n", | |
" print \"Saved Model\"\n", | |
" if len(rList) % 10 == 0:\n", | |
" print total_steps,np.mean(rList[-10:]), e\n", | |
" saver.save(sess,path+'/model-'+str(i)+'.cptk')\n", | |
"print \"Percent of succesful episodes: \" + str(sum(rList)/num_episodes) + \"%\"" | |
] | |
}, | |
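{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the Double-DQN target computed in the loop above is $target = r + \\gamma \\, Q_{target}(s_1, \\mathrm{argmax}_a Q_{main}(s_1,a)) \\cdot (1 - d)$, where $d$ is the episode-termination flag, so terminal transitions use the reward alone. The main network selects the action and the target network evaluates it, which is what reduces the overestimation of a standard DQN target."
]
},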
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Checking network learning"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Mean reward over time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rMat = np.resize(np.array(rList),[len(rList)/100,100])\n",
"rMean = np.average(rMat,1)\n",
"plt.plot(rMean)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
} |
@mphielipp I think you should check your version of TF first:
python -c 'import tensorflow as tf; print(tf.__version__)'
The version should be 0.12.x.
@mphielipp Did it work for you? I installed the latest tensorflow 0.12.1, and
pip show tensorflow
says 0.12.1, but I still get the same error as you.
@mphielipp Replace that line with:
self.AW = tf.Variable(tf.random_normal([h_size // 2, env.actions]))
tf.random_normal expects integer shape dimensions; in Python 3, h_size/2 evaluates to a float, so use integer division (//).
Hi, first, thanks so much for your detailed write-ups and commented implementations. I have been working through them while developing my own RL environment outside of gym.
I have a few questions regarding the implementation of Double DQN here:

- The Double DQN paper (https://arxiv.org/pdf/1511.06581.pdf) algorithm mentions updating \theta at each step t. It looks like the implementation here updates \theta every update_freq steps, and updates \theta- immediately afterwards. Is there something I don't understand? I guess it ends up being a heuristic decision about when to perform these updates; I'm just wondering what your intuition is for the \theta, \theta- update cycle.
- Second is your nice tensorflow hack to update the targetQ weights. Does it rely on the order of initialization? Might there be a more verbose but explicit way to do it, maybe storing the targetQ ops by name in a dictionary? (A sketch of what I mean follows after this list.)
- Last, is there a reason for not using a nonlinearity/activation in the network?
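For example, a minimal sketch of that name-based alternative (it assumes the primary and target networks are built under tf.variable_scope('main') and tf.variable_scope('target'), which the notebook as written does not do, and the make_target_update_ops name is just illustrative):

import tensorflow as tf

def make_target_update_ops(tau=0.001):
    #Group trainable variables by the scope they were created under,
    #instead of relying on the ordering of tf.trainable_variables().
    main_vars = {v.name.replace('main/', '', 1): v
                 for v in tf.trainable_variables() if v.name.startswith('main/')}
    target_vars = {v.name.replace('target/', '', 1): v
                   for v in tf.trainable_variables() if v.name.startswith('target/')}
    op_holder = []
    for name, t_var in target_vars.items():
        m_var = main_vars[name] #primary/target pairs are matched by variable name
        op_holder.append(t_var.assign(tau * m_var + (1. - tau) * t_var))
    return op_holder

This builds the same soft-update ops as updateTargetGraph, but the primary/target pairing no longer depends on the order in which tf.trainable_variables() returns the variables.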
I would like to ask a question: do we have to split the inputs in order to achieve dueling DQN? Why can't I just feed all the inputs into both the value layer and the advantage layer?
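For reference, a minimal sketch of that variant (a hypothetical dueling_head helper built from the same TF 0.12-era ops the notebook already uses), which feeds the full flattened conv output into separate fully connected value and advantage streams instead of splitting it with tf.split:

import tensorflow as tf

def dueling_head(conv_features, num_actions):
    #Flatten the final conv layer and feed all of it to both streams.
    hidden = tf.contrib.layers.flatten(conv_features)
    advantage = tf.contrib.layers.fully_connected(hidden, num_actions, activation_fn=None)
    value = tf.contrib.layers.fully_connected(hidden, 1, activation_fn=None)
    #Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)), the same aggregation the notebook uses
    return value + tf.sub(advantage,
                          tf.reduce_mean(advantage, reduction_indices=1, keep_dims=True))

It could be used as self.Qout = dueling_head(self.conv4, env.actions) inside Qnetwork.__init__. The channel-wise tf.split in the notebook is just one way of forming the two streams; this version is closer to the two fully connected streams described in the dueling architecture paper.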
I'm getting this message:
----> 2 mainQN = Qnetwork(h_size)
---> 16 self.AW = tf.Variable(tf.random_normal([h_size/2,env.actions]))
---> 77 seed2=seed2)
--> 189 name=name)
--> 582 _Attr(op_def, input_arg.type_attr))
lib\site-packages\tensorflow\python\framework\op_def_library.py in _SatisfiesTypeConstraint(dtype, attr_def)
58 "DataType %s for attr '%s' not in list of allowed values: %s" %
59 (dtypes.as_dtype(dtype).name, attr_def.name,
---> 60 ", ".join(dtypes.as_dtype(x).name for x in allowed_list)))
TypeError: DataType float32 for attr 'T' not in list of allowed values: int32, int64