{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Q-learning first attempt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally install what you need:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import sys\n",
    "# !{sys.executable} -m pip install seaborn gym-retro Pillow keras tensorflow opencv-python pandas matplotlib scipy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First we set up all of the imports the project needs; there are a bunch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
"outputs": [ | |
{ | |
"ename": "KeyboardInterrupt", | |
"evalue": "", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-2-5b9450a8a2b2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mpp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpprint\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPrettyPrinter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mcv2\u001b[0m \u001b[0;31m#OpenCV\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mgym_remote\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclient\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mgrc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/Development/Bobcats/reverie_agent/reverie_agent/lib/python3.6/site-packages/cv2/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mimportlib\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mcv2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " | |
] | |
} | |
], | |
"source": [ | |
"import random\n", | |
"import math\n", | |
"import retro\n", | |
"from PIL import Image\n", | |
"import gym\n", | |
"import pickle\n", | |
"import operator\n", | |
"import pprint\n", | |
"pp = pprint.PrettyPrinter(indent=4)\n", | |
"import numpy as np\n", | |
"import cv2 #OpenCV\n", | |
"import time\n", | |
"import gym_remote.client as grc\n", | |
"import gym_remote.exceptions as gre\n", | |
"import os\n", | |
"import json\n", | |
"import pandas as pd\n", | |
"from IPython.display import clear_output\n", | |
"from collections import deque\n", | |
"from matplotlib import pyplot as plt\n", | |
"plt.rcParams['figure.figsize'] = (30, 30)\n", | |
"import seaborn as sns\n", | |
"\n", | |
"from keras.initializers import normal, identity\n", | |
"from keras.models import model_from_json\n", | |
"from keras.models import Sequential\n", | |
"from keras.layers.core import Dense, Dropout, Activation, Flatten\n", | |
"from keras.layers.convolutional import Conv2D, MaxPooling2D\n", | |
"from keras.optimizers import SGD , Adam\n", | |
"import tensorflow as tf\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Helper functions handle disc IO, and help set the stage for recor keeping in a run. `show_img` was useful in rendering a frame of sonic, at various stages in the image processing. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# file system interactors\n", | |
"def save_obj(obj, name ):\n", | |
" with open(name + '.pkl', 'wb') as f: #dump files into objects folder\n", | |
" pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)\n", | |
"\n", | |
"def load_obj(name ):\n", | |
" try:\n", | |
" with open(name + '.pkl', 'rb') as f:\n", | |
" return pickle.load(f)\n", | |
" except FileNotFoundError:\n", | |
" if name == 'epsilon':\n", | |
" return .7;\n", | |
" return []\n", | |
" else:\n", | |
" return []\n", | |
" \n", | |
"output_dir = './run-'+time.strftime(\"%Y%m%d-%H:%M\")+'/'\n", | |
"if not os.path.exists(output_dir):\n", | |
" os.makedirs(output_dir)\n", | |
"loss_file_path = output_dir+\"loss_df.csv\"\n", | |
"\n", | |
"#Intialize log structures from file if exists else create new#Intiali \n", | |
"loss_df = pd.read_csv(loss_file_path) if os.path.isfile(loss_file_path) else pd.DataFrame(columns =['loss'])\n" | |
] | |
}, | |
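  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick illustrative sketch of how the pickle helpers round-trip a value (the `demo_value` name is made up for this example; `main()` below caches `epsilon` the same way):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save_obj writes <name>.pkl; load_obj reads it back, falling back to 0.7\n",
    "# for 'epsilon' or [] for any other missing file.\n",
    "save_obj(0.05, output_dir + 'demo_value')       # hypothetical example value\n",
    "print(load_obj(output_dir + 'demo_value'))      # -> 0.05\n",
    "print(load_obj('does_not_exist'))               # -> []"
   ]
  },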
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_img(image, graphs=False):\n",
    "    \"\"\"\n",
    "    Show images in a new window\n",
    "    \"\"\"\n",
    "    while True:\n",
    "        # print(image.shape)\n",
    "        # image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)\n",
    "        processed = process_img(image)\n",
    "        window_title = \"logs\" if graphs else \"game_play\"\n",
    "        cv2.namedWindow(window_title, cv2.WINDOW_NORMAL)\n",
    "        cv2.moveWindow(window_title, 20, 20)\n",
    "        # imS = cv2.resize(screen, (800, 400))\n",
    "        # cv2.imshow(window_title, screen)\n",
    "        cv2.imshow(window_title, processed)\n",
    "        cv2.waitKey(5)\n",
    "        cv2.destroyAllWindows()\n",
    "        break\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set up a tracked environment similar to the jerk agent's, but with a bit of extra record keeping tacked on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TrackedEnv(gym.Wrapper):\n",
    "    \"\"\"\n",
    "    An environment that tracks the current trajectory and\n",
    "    the total number of timesteps ever taken.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, env):\n",
    "        super(TrackedEnv, self).__init__(env)\n",
    "        self.action_history = []\n",
    "        self.reward_history = []\n",
    "        self.total_reward = 0\n",
    "        self.total_steps_ever = 0\n",
    "        record_file_path = output_dir+\"record.csv\"\n",
    "        self.record = pd.read_csv(record_file_path) if os.path.isfile(record_file_path) else pd.DataFrame(columns=['Timesteps', 'Total_Score'])\n",
    "        actions_file_path = output_dir+\"actions.csv\"\n",
    "        self.actions = pd.read_csv(actions_file_path) if os.path.isfile(actions_file_path) else pd.DataFrame(columns=['Action', 'Intention'])\n",
    "\n",
    "    def best_sequence(self):\n",
    "        \"\"\"\n",
    "        Get the prefix of the trajectory with the best\n",
    "        cumulative reward.\n",
    "        \"\"\"\n",
    "        max_cumulative = max(self.reward_history)\n",
    "        for i, rew in enumerate(self.reward_history):\n",
    "            if rew == max_cumulative:\n",
    "                return self.action_history[:i + 1]\n",
    "        raise RuntimeError('unreachable')\n",
    "\n",
    "    # pylint: disable=E0202\n",
    "    def reset(self, **kwargs):\n",
    "        self.action_history = []\n",
    "        self.reward_history = []\n",
    "        self.total_reward = 0\n",
    "        return self.env.reset(**kwargs)\n",
    "\n",
    "    def step(self, action):\n",
    "        self.total_steps_ever += 1\n",
    "        self.action_history.append(action.copy())\n",
    "        obs, rew, done, info = self.env.step(action)\n",
    "        if done:\n",
    "            data = pd.DataFrame({'Timesteps': [len(self.action_history)], 'Total_Score': [self.total_reward]})\n",
    "            self.record = self.record.append(data)\n",
    "            # self.record.loc[len(loss_df)] = score\n",
    "            # self.record.append([self.total_reward, len(self.action_history)])\n",
    "            print('rip')\n",
    "        self.total_reward += rew\n",
    "        self.reward_history.append(self.total_reward)\n",
    "        return obs, rew, done, info\n"
   ]
  },
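  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a small worked example of `best_sequence` (the numbers are made up): if `reward_history` holds the cumulative rewards `[1.0, 3.0, 2.5]`, the maximum (3.0) is first reached after the second action, so `best_sequence()` returns the first two entries of `action_history`."
   ]
  },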
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Hyperparameters. Many of these are copied from the dino learning write-up and from the Flappy Bird paper referenced within."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EXPLOIT_BIAS = 0.25\n",
    "TOTAL_TIMESTEPS = int(1e6)\n",
    "\n",
    "# game parameters\n",
    "ACTIONS = 8  # possible actions\n",
    "GAMMA = 0.99  # decay rate of past observations, original 0.99\n",
    "OBSERVATION = 200000.  # timesteps to observe before training\n",
    "EXPLORE = 200000  # frames over which to anneal epsilon\n",
    "FINAL_EPSILON = 0.0001  # final value of epsilon\n",
    "INITIAL_EPSILON = 0.1  # starting value of epsilon\n",
    "REPLAY_MEMORY = 100000  # number of previous transitions to remember\n",
    "BATCH = 32  # size of minibatch\n",
    "FRAMERATE = 4  # how often to render\n",
    "LEARNING_RATE = 1e-4\n",
    "img_rows, img_cols = 120, 84\n",
    "img_channels = 4  # we stack 4 frames"
   ]
  },
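  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For reference, `GAMMA` is the discount factor in the Q-learning target used in the experience-replay loop of `main()` below: $target = r_t + \\gamma \\max_{a'} Q(s_{t+1}, a')$ for non-terminal steps, and just $r_t$ on terminal steps. `EXPLORE` sets the per-step epsilon decrement applied once training starts: $(0.1 - 0.0001) / 200000 \\approx 5 \\times 10^{-7}$ per frame."
   ]
  },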
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def buildmodel():\n",
    "    print(\"Now we build the model\")\n",
    "    model = Sequential()\n",
    "    model.add(Conv2D(32, (8, 8), strides=(4, 4), padding='same', input_shape=(img_cols, img_rows, img_channels)))  # 84x120x4 input\n",
    "    model.add(Activation('relu'))\n",
    "    model.add(Conv2D(64, (4, 4), strides=(2, 2), padding='same'))\n",
    "    model.add(Activation('relu'))\n",
    "    model.add(Conv2D(64, (3, 3), strides=(1, 1), padding='same'))\n",
    "    model.add(Activation('relu'))\n",
    "    model.add(Flatten())\n",
    "    model.add(Dense(512))\n",
    "    model.add(Activation('relu'))\n",
    "    model.add(Dense(ACTIONS))\n",
    "    adam = Adam(lr=LEARNING_RATE)\n",
    "    model.compile(loss='mean_squared_logarithmic_error', optimizer=adam)\n",
    "    print(\"We finished building the model\")\n",
    "    return model\n",
    "# buildmodel().summary()"
   ]
  },
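  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The final `Dense(ACTIONS)` layer gives one Q-value per action, so `model.predict(s_t)` on a single stacked state returns an array of shape `(1, 8)`; the greedy policy in `main()` simply takes `np.argmax` over it to pick an index into the action table below."
   ]
  },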
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The action table and the `random_move` function are how we actually act on the environment. `random_move` can also take a supplied action index, for example when the move was predicted by the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Possible actions\n",
    "# [\"B\", \"A\", \"MODE\", \"START\", \"UP\", \"DOWN\", \"LEFT\", \"RIGHT\", \"C\", \"Y\", \"X\", \"Z\"]\n",
"actions = [\n", | |
" [True, False, False, False, False, False, False, False, False, False, False, False],\n", | |
" [True, False, False, False, False, False, False, True, False, False, False, False],\n", | |
" [False, False, False, False, False, False, False, True, False, False, False, False],\n", | |
" [False, False, False, False, False, True, False, False, False, False, False, False],\n", | |
" [True, False, False, False, False, True, False, False, False, False, False, False],\n", | |
" [False, False, False, False, False, False, False, False, False, False, False, False],\n", | |
" [False, False, False, False, False, False, True, False, False, False, False, False],\n", | |
" [True, False, False, False, False, False, True, False, False, False, False, False],\n", | |
"]" | |
] | |
}, | |
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def random_move(env, Choice=None):\n",
    "    done = False\n",
    "    if Choice is None:\n",
    "        Choice = random.randrange(len(actions))\n",
    "    # no info variable in contest environment\n",
    "    obs, rew, done, _ = env.step(actions[Choice])\n",
    "\n",
    "    return rew, done, obs, Choice\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`proccessObs` was used for image cropping and resizing, but was ultimately dropped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def proccessObs(obs):\n",
    "    window_size_x = 180\n",
    "    window_size_y = 100\n",
    "    window_offset_x = 140  # int((320 - window_size_x)/2)\n",
    "    window_offset_y = 100  # int((224 - window_size_y)/2)\n",
    "    # lm = Image.fromarray(np.array(obs[window_offset_y:(window_offset_y+window_size_y), window_offset_x:(window_offset_x+window_size_x)]))\n",
    "    # lm.show()\n",
    "    # input(\"Press Enter to continue...\")\n",
    "    return obs[window_offset_y:(window_offset_y+window_size_y), window_offset_x:(window_offset_x+window_size_x)].flatten().tostring()\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`process_img` takes a screen from the environment and applies grayscale conversion, cropping, downscaling, and Canny edge detection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_img(image):\n",
    "    # convert to grayscale and crop away part of the frame\n",
    "    height, width = image.shape[:2]\n",
    "    image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)\n",
    "    # print(height)\n",
    "    # print(width)\n",
    "    image = image[round(height*.25):height, round(width*.25):width]  # img[y:y+h, x:x+w]\n",
    "    image = cv2.resize(image, (0, 0), fx=0.5, fy=0.5)\n",
    "    # height, width = image.shape[:2]\n",
    "    # print(height)\n",
    "    # print(width)\n",
    "    image = cv2.Canny(image, threshold1=100, threshold2=200)  # apply Canny edge detection\n",
    "    return image"
   ]
  },
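  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To see where `img_rows, img_cols = 120, 84` comes from: a Genesis frame is 224×320, cropping off the top and left quarters leaves 168×240, and halving that with `cv2.resize(..., fx=0.5, fy=0.5)` gives an 84×120 edge map, which matches the `(img_cols, img_rows)` input shape the network expects."
   ]
  },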
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def main(observe=False):\n",
    "    # Set up a new TrackedEnv that can keep track of total timesteps and store\n",
    "    # previous best solutions.\n",
    "    #\n",
    "    # env = grc.RemoteEnv('tmp/sock')\n",
    "    # env = TrackedEnv(env)\n",
    "\n",
    "    env = retro.make(game='SonicTheHedgehog-Genesis',\n",
    "                     state='GreenHillZone.Act1',\n",
    "                     scenario='contest',\n",
    "                     record=output_dir)\n",
    "    env = TrackedEnv(env)\n",
    "\n",
    "    # new_ep will keep track of whether a new episode should be started.\n",
    "    new_ep = True\n",
    "    # solutions is an array of successful gameplay sequences\n",
    "    solutions = []\n",
    "\n",
    "    model = buildmodel()\n",
    "    x_t = process_img(np.zeros((224, 320, 3), dtype=np.uint8))\n",
    "\n",
    "    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)\n",
    "    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  # 1x84x120x4\n",
    "\n",
    "    initial_state = s_t\n",
    "    if observe:\n",
    "        OBSERVE = 999999999  # we keep observing, never train\n",
    "        epsilon = FINAL_EPSILON\n",
    "        print(\"Now we load the weights\")\n",
    "        model.load_weights(\"model_final.h5\")\n",
    "        adam = Adam(lr=LEARNING_RATE)\n",
    "        model.compile(loss='mse', optimizer=adam)\n",
    "        print(\"Weights loaded successfully\")\n",
    "    else:  # we go to training mode\n",
    "        OBSERVE = OBSERVATION\n",
    "        epsilon = load_obj(\"epsilon\")\n",
    "        # model.load_weights(\"model_final.h5\")\n",
    "        adam = Adam(lr=LEARNING_RATE)\n",
    "        model.compile(loss='mse', optimizer=adam)\n",
    "\n",
    "    t = 0  # start of timesteps\n",
    "    D = deque()\n",
    "    while True:\n",
    "\n",
    "        loss = 0\n",
    "        Q_sa = 0\n",
    "        action_index = 0\n",
    "        r_t = 0  # reward at t\n",
    "        a_t = np.zeros([ACTIONS])  # action at t\n",
    "        if new_ep:\n",
    "            clear_output(wait=True)\n",
    "            print('%f%% done, reward: %f' % (env.total_steps_ever / 10000, env.record[\"Total_Score\"].mean()))\n",
    "\n",
    "            # if (solutions and\n",
    "            #         random.random() < EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS):\n",
    "            #     solutions = sorted(solutions, key=lambda x: np.mean(x[0]))\n",
    "            #     best_pair = solutions[-1]\n",
    "            #     new_rew = exploit(env, best_pair[1])\n",
    "            #     best_pair[0].append(new_rew)\n",
    "            #     print('replayed best with reward %f' % new_rew)\n",
    "            #     continue\n",
    "            # else:\n",
    "            env.reset()\n",
    "            new_ep = False\n",
    "        if random.random() <= epsilon:  # randomly explore an action\n",
    "            # print(\"----------Random Action----------\")\n",
    "            action_index = random.randrange(len(actions[:]))\n",
    "            env.actions.loc[len(env.actions)] = {'Action': action_index, 'Intention': 'Random'}\n",
    "\n",
    "        else:  # predict the output\n",
    "            # print(\"----------Predicted----------\")\n",
    "            q = model.predict(s_t)  # input a stack of 4 images, get the prediction\n",
    "            max_Q = np.argmax(q)  # choose the index with the maximum q value\n",
    "            action_index = max_Q\n",
    "            env.actions.loc[len(env.actions)] = {'Action': action_index, 'Intention': 'Predicted'}\n",
    "\n",
    "        # We reduce epsilon (the exploration parameter) gradually\n",
    "        if epsilon > FINAL_EPSILON and t > OBSERVE:\n",
    "            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE\n",
    "\n",
    "        # action_index = 2\n",
    "        # x_t1 ~ obs, terminal ~ done\n",
    "        # run the selected action and observe the next state and reward\n",
    "        # x_t1, r_t, terminal = game_state.get_state(a_t)\n",
    "        # print(t, 'doing action', action_index)\n",
    "        reward, done, obs, choice = random_move(env, Choice=action_index)\n",
    "        x_t1 = process_img(obs)\n",
    "        # show_img(obs)\n",
    "        # if t % FRAMERATE == 0:\n",
    "        #     env.render()\n",
    "        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  # 1x84x120x1\n",
    "        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)  # append the new frame to the input stack and drop the oldest one\n",
    "\n",
    "        D.append((s_t, action_index, reward, s_t1, done))\n",
    "\n",
    "        if len(D) > REPLAY_MEMORY:\n",
    "            D.popleft()\n",
    "\n",
    "        # only train if done observing\n",
    "        if t > OBSERVE:\n",
    "\n",
    "            # sample a minibatch to train on\n",
    "            minibatch = random.sample(D, BATCH)\n",
    "            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))  # 32x84x120x4\n",
    "            targets = np.zeros((inputs.shape[0], ACTIONS))  # 32x8\n",
    "\n",
    "            # Now we do the experience replay\n",
    "            for i in range(0, len(minibatch)):\n",
    "                state_t = minibatch[i][0]  # 4D stack of images\n",
    "                action_t = minibatch[i][1]  # the action index\n",
    "                reward_t = minibatch[i][2]  # reward at state_t due to action_t\n",
    "                state_t1 = minibatch[i][3]  # next state\n",
    "                terminal = minibatch[i][4]  # whether the episode ended due to the action\n",
    "\n",
    "                inputs[i:i + 1] = state_t\n",
    "\n",
    "                targets[i] = model.predict(state_t)  # predicted q values\n",
    "                Q_sa = model.predict(state_t1)  # predict q values for the next step\n",
    "\n",
    "                if terminal:\n",
    "                    targets[i, action_t] = reward_t  # if terminated, the target is just the reward\n",
    "                else:\n",
    "                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)\n",
    "\n",
    "            loss += model.train_on_batch(inputs, targets)\n",
    "            loss_df.loc[len(loss_df)] = loss\n",
    "        s_t = initial_state if done else s_t1  # reset to the initial frame if the episode terminated\n",
    "        t = t + 1\n",
    "        # env.render()\n",
    "        # print(t, env.total_reward)\n",
    "        # print(env.record)\n",
    "        state = \"\"\n",
    "        if t <= OBSERVE:\n",
    "            state = \"observe\"\n",
    "        elif t > OBSERVE and t <= OBSERVE + EXPLORE:\n",
    "            state = \"explore\"\n",
    "        else:\n",
    "            state = \"train\"\n",
    "        if t % 100 == 0:\n",
    "            print(\"T\", t, \"/ STATE\", state, \"/ ε\", round(epsilon, 3), \"/ REWARD\", round(env.total_reward), \"/ Q_MAX \", np.max(Q_sa), \"/ Loss \", loss)\n",
    "\n",
    "        if done:\n",
    "            new_ep = True\n",
    "\n",
    "        if t % 1000 == 0:\n",
    "            model.save_weights(output_dir+\"model_weights.h5\", overwrite=True)\n",
    "            # save_obj(D, output_dir+\"D\")  # saving episodes\n",
    "            save_obj(t, output_dir+\"time\")  # caching time steps\n",
    "            save_obj(epsilon, output_dir+\"epsilon\")  # cache epsilon to avoid repeated randomness in actions\n",
    "            loss_df.to_csv(output_dir+\"loss_df.csv\", index=False)\n",
    "            env.record.to_csv(output_dir+\"records.csv\", index=False)\n",
    "            env.actions.to_csv(output_dir+\"actions.csv\", index=False)\n",
    "            with open(\"model.json\", \"w\") as outfile:\n",
    "                json.dump(model.to_json(), outfile)\n",
    "        if t > 1000000:\n",
    "            exit()\n",
    "        # rew, new_ep = move_n_learn(env, 1)\n",
    "        # if not new_ep and rew <= 0:\n",
    "        #     print('backtracking due to negative reward: %f' % rew)\n",
    "        #     _, new_ep = move_n_learn(env, 70, left=True)\n",
    "        # if new_ep:\n",
    "        #     solutions.append(([max(env.reward_history)], env.best_sequence()))"
   ]
  },
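  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With `OBSERVE = OBSERVATION = 200000` and `EXPLORE = 200000`, the loop above labels roughly the first 200k timesteps \"observe\" (the replay memory fills but no training happens), the next 200k \"explore\" (training runs while epsilon is annealed toward `FINAL_EPSILON`), and everything after that \"train\"."
   ]
  },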
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "if __name__ == '__main__':\n",
    "    try:\n",
    "        main()\n",
    "    except gre.GymRemoteError as exc:\n",
    "        print('exception', exc)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, there is a function that takes the hardcoded folder name of a previous run and shows the loss over time, the game-score progress, and the distribution of moves between random and model-predicted actions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_plots():\n",
    "    rundir = 'run-20180519-14:14'\n",
    "    fig, axs = plt.subplots(ncols=1, nrows=3, figsize=(15, 15))\n",
    "    axs[0].set_title('Loss')\n",
    "    axs[1].set_title('Game Score progress')\n",
    "    loss_df = pd.read_csv(\"./\"+rundir+\"/loss_df.csv\")\n",
    "    scores_df = pd.read_csv(\"./\"+rundir+\"/records.csv\")\n",
    "    actions_df = pd.read_csv(\"./\"+rundir+\"/actions.csv\")\n",
    "    actions_df['Action'] = actions_df['Action'].astype('float')\n",
    "    loss_df['loss'] = loss_df['loss'].astype('float')\n",
    "    loss_df.plot(use_index=True, ax=axs[0]).set_yscale('log')\n",
    "\n",
    "    sns.distplot(actions_df['Action'].loc[actions_df['Intention'] == 'Predicted'])\n",
    "    sns.distplot(actions_df['Action'].loc[actions_df['Intention'] == 'Random'])\n",
    "    scores_df.plot(ax=axs[1])\n",
    "    imgg = fig.canvas.draw()\n",
    "show_plots()"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Raw Cell Format",
  "kernelspec": {
   "display_name": "reverie_agent",
   "language": "python",
   "name": "reverie_agent"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
} |