Created
September 29, 2016 19:54
-
-
Save anonoz/cf7421a29818dd5a843371ee23408cb1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "aa8439c7-cf04-49a6-84f3-2e8af08aca7f" | |
} | |
}, | |
"source": [ | |
"# Sentiment Analysis using word vector and ConvNet in Keras" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "07039836-86fc-4567-b302-ef9265af8307" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is disabled, cuDNN 5105)\n", | |
"/home/anonoz/anaconda2/envs/tensorflow/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.\n", | |
" warnings.warn(warn)\n" | |
] | |
} | |
], | |
"source": [ | |
"# CONFIGS\n", | |
"\n", | |
"# The word vector can be swapped with say GoogleNews 6B dataset\n", | |
"word_vector_bin_file = \"word2vec/model-0.bin\"\n", | |
"# word_vector_bin_file = \"word2vec/GoogleNews-vectors-negative300.bin\"\n", | |
"# word_vector_bin_file = \"model-0.bin\"\n", | |
"word_vector_dims = 100\n", | |
"\n", | |
"# in aclImdb, the longest review is 2470 words long\n", | |
"# Due to memory constraint, in this one I limit to 200 words\n", | |
"max_sentence_length = 200\n", | |
"\n", | |
"# Can easily swap with other datasets if you want\n", | |
"positive_review_txts = \"aclImdb/train/pos/*.txt\"\n", | |
"negative_review_txts = \"aclImdb/train/neg/*.txt\"\n", | |
"positive_review_vals = \"aclImdb/test/pos/*.txt\"\n", | |
"negative_review_vals = \"aclImdb/test/neg/*.txt\"\n", | |
"# positive_review_txts = \"polarity2/txt_sentoken/pos/*.txt\"\n", | |
"# negative_review_txts = \"polarity2/txt_sentoken/neg/*.txt\"\n", | |
"# positive_review_vals = \"polarity2/txt_sentoken/pos/*.txt\"\n", | |
"# negative_review_vals = \"polarity2/txt_sentoken/neg/*.txt\"\n", | |
"\n", | |
"# Test theano and graphics card\n", | |
"import theano.tensor as T\n", | |
"\n", | |
"# Notebook\n", | |
"%matplotlib inline\n", | |
"import matplotlib.pyplot as plt\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true, | |
"nbpresent": { | |
"id": "35ac46e5-404e-4d92-b9bd-073c346223a0" | |
} | |
}, | |
"source": [ | |
"## Step 1: Make X_train\n", | |
"\n", | |
"X_train data structure is a 3D array, consisting of reviews, words, and vectors:\n", | |
"```json\n", | |
"[\n", | |
" // a review\n", | |
" [\n", | |
" // a word, and its array of 100 vectors\n", | |
" [0.75, 0.64 ...],\n", | |
" ...\n", | |
" ], \n", | |
" ...\n", | |
"]\n", | |
"```\n", | |
"\n", | |
"### TODO:\n", | |
"1. Load all the reviews into memory\n", | |
"2. Normalize the text\n", | |
"3. Add words to vocab array to make word vector retrieval faster\n", | |
"3. Determine vocab size, max review length" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "bc104c55-9cb1-4d4c-a1bf-363277a69da8" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"omg this is soo funny \n", | |
"[u'omg', u'this', u'is', u'soo', u'funny']\n" | |
] | |
} | |
], | |
"source": [ | |
"from bs4 import BeautifulSoup \n", | |
"import re\n", | |
"\n", | |
"def normalise_text(text):\n", | |
" # Step 0: Clean HTML tags\n", | |
" normalised_text = BeautifulSoup(text, \"html.parser\").get_text()\n", | |
" \n", | |
" # Step 1: Lower cap\n", | |
" normalised_text = normalised_text.lower()\n", | |
" \n", | |
" # Step 2: Leave only abc behind\n", | |
" normalised_text = re.sub(\"[^a-zA-Z ]\", \" \", normalised_text)\n", | |
" \n", | |
" # Step 3: \n", | |
" return normalised_text\n", | |
"\n", | |
"# Test normalise_text\n", | |
"print(normalise_text(\"OMG <br/><br />this is soo00.... funny!!!\"))\n", | |
"\n", | |
"def raw_to_array(review_text):\n", | |
" return normalise_text(review_text).split()\n", | |
"\n", | |
"# Test both fx above\n", | |
"print(raw_to_array(\"OMG <br/><br />this is soo00.... funny!!!\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "789db32e-3bb2-49b4-8ecc-e8f9f0d67a7b" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"('highest word count: ', 2487)\n", | |
"[u'i', u'remember', u'watching', u'this', u'late', u'at', u'night', u'on', u'black', u'and', u'white', u'tv', u'long', u'before', u'a', u'live', u'action', u'version', u'was', u'so']\n", | |
"('mean of review length: ', 236.82848000000001)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[<matplotlib.lines.Line2D at 0x7fc3c8190290>]" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEACAYAAACtVTGuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGeVJREFUeJzt3X+wVdV99/H3VxEQfyCRIAoqomgwplKN+EQTc5MYNam/\n6syTGjOT5GkykzZanSczfaKZyUA7nUlNG6tpx3QmP6q2JsbYidHUCFq9Sc1EJVEDCgGEgICA+FtE\n4HL9Pn+sfb0nCJsL3HPPufe+XzNn7j7r7H3O3pvD+uy11t77RGYiSdLO7NPqFZAktTeDQpJUy6CQ\nJNUyKCRJtQwKSVItg0KSVGuXQRERkyPigYh4KiIWRMRfVeWzImJ1RDxWPc5rWOaaiFgaEYsi4pyG\n8lMiYn5ELImI65uzSZKk/hS7uo4iIiYCEzPziYg4EPgNcBHwZ8BrmXnddvNPB74PnAZMBu4HpmVm\nRsQjwBWZOS8i7gFuyMw5/b5VkqR+s8sWRWauy8wnqumNwCJgUvVy7GCRi4DbMnNbZq4AlgIzq8A5\nKDPnVfPdAly8l+svSWqy3RqjiIgpwAzgkaroioh4IiK+ExFjq7JJwKqGxdZUZZOA1Q3lq+kNHElS\nm+pzUFTdTncAV1UtixuBqZk5A1gHfKM5qyhJaqURfZkpIkZQQuLfM/MnAJm5oWGWbwN3V9NrgCMb\nXptcle2sfEef5w2oJGkPZOaOhgT2Sl9bFN8DFmbmDT0F1ZhDj0uAJ6vpu4BLI2JkRBwDHAc8mpnr\ngFciYmZEBPBp4Cc7+8DM9JHJrFmzWr4O7fJwX7gv3Bc7fnR1Jfvu27zj6122KCLiTOBTwIKIeBxI\n4CvAZRExA3gTWAF8oargF0bE7cBCoAv4Ymb2bMHlwE3AaOCezLy3X7dGktTvdhkUmflLYN8dvLTT\nSj4zvwZ8bQflvwHeszsrKElqLa/MbnMdHR2tXoW24b7o5b7o5b5ovl1ecNcKEZHtuF6S1I62bYPR\no6G7O8gWDmZLkoYpg0KSVMugkCTVMigkSbUMCklSLYNCklTLoJAk1TIoJEm1DApJGuSafX2yQSFJ\nQ0D0+/XYvQwKSVItg0KSVMugkCTVMigkSbUMCklSLYNCklTLoJAk1TIoJEm1DApJUi2DQpJUy6CQ\nJNUyKCRJtQwKSVItg0KSBjlvMy5J2iVvMy5JahmDQpJUy6CQJNUyKCRJtQwKSVItg0KSVMugkCTV\nMigkSbUMCklSrV0GRURMjogHIuKpiFgQEVdW5eMiYm5ELI6IORExtmGZayJiaUQsiohzGspPiYj5\nEbEkIq5vziZJkvpTX1oU24AvZea7gfcBl0fEu4Crgfsz8wTgAeAagIg4EfgEMB34GHBjxFsXl38L\n+FxmHg8cHxHn9uvWSJL63S6DIjPXZeYT1fRGYBEwGbgIuLma7Wbg4mr6QuC2zNyWmSuApcDMiJgI\nHJSZ86r5bmlYRpLUpnZrjCIipgAzgIeBwzJzPZQwASZUs00CVjUstqYqmwSsbihfXZVJkvZCs+8e\nO6KvM0bEgcAdwFWZuTEitl+1fl3V2bNnvzXd0dFBR0dHf769JA16nZ2ddHZ2sm0bdHc373Mi+xBF\nETEC+Cnws8y8oSpbBHRk5vqqW+nBzJweEVcDmZnXVvPdC8wCVvbMU5VfCnwwM/9yB5+XfVkvSRJs\n2QIHHwxbtwaZ2e83HO9r19P3gIU9IVG5C/hsNf0Z4CcN5ZdGxMiIOAY4Dni06p56JSJmVoPbn25Y\nRpLUpnbZooiIM4FfAAso3UsJfAV4FLgdOJLSWvhEZr5cLXMN8Dmgi9JVNbcqPxW4CRgN3JOZV+3k\nM21RSFIfNbtF0aeup4FmUEhS37VL15MkaZgyKCRJtQwKSVItg0KSVMugkCTVMigkSbUMCklSLYNC\nklTLoJAk1TIoJGmQa/aNLAwKSRrkMiH6/cYdvQwKSRrkDApJUi2DQpJUy6CQJNUyKCRJtQwKSVIt\ng0KSVMugkCTVMigkSbUMCklSLYNCklTLoJAk1TIoJEm1DApJUi2DQpJUy6CQJNUyKCRJtQwKSVIt\ng0KSVMugkCTVMigkSbUMCklSLYNCklTLoJAk1Wp5UETEdyNifUTMbyibFRGrI+Kx6nFew2vXRMTS\niFgUEec0lJ8SEfMjYklEXN//myJJw1PLgwL4N+DcHZRfl5mnVI97ASJiOvAJYDrwMeDGiLdW/1vA\n5zLzeOD4iNjRe0qSdlPLgyIzHwJe2sFLO1qti4DbMnNbZq4AlgIzI2IicFBmzqvmuwW4eM9WWZK0\nvVa3KHbmioh4IiK+ExFjq7JJwKqGedZUZZOA1Q3lq6sySdJeymzu++9pUNwITM3MGcA64Bv9t0qS\npN3R7K6nEXuyUGZuaHj6beDuanoNcGTDa5Orsp2V79Ts2bPfmu7o6KCjo2NPVlWShqzOzk46Ozt5\n4QV48cXmfU5kH9osETEFuDsz31M9n5iZ66rp/wuclpmXRcSJwK3A6ZSupfuAaZmZEfEwcCUwD/gv\n4Js9g+A7+Lzsy3pJkmDxYrjgAli6NMjMfm9b7LJFERHfBzqAQyPiGWAW8KGImAG8CawAvgCQmQsj\n4nZgIdAFfLGhxr8cuAkYDdyzs5CQJO2eZnc99alFMdBsUUhS3y1aBH/6p7B4cXNaFF6ZLUmDXMuv\no5AktTeDQpJUy6CQJNUyKCRJtQwKSVItg0KSVMugkCTVMigkSbUMCklSLYNCklTLoJAk1TIoJEm1\nDApJUi2DQpJUy6CQJNUyKCRJtQwKSVItg0KSVMugkCTVMigkSbUMCklSLYNCklTLoJAk1TIoJEm1\nDApJUq3ubth33+a9v0EhSYOcQSFJqmVQSJJqGRSSpFoGhSSpVnc3jBjRvPc3KCRpkNu2zRaFJKmG\nXU+SpFoGhSSplkEhSarV8qCIiO9GxPqImN9QNi4i5kbE4oiYExFjG167JiKWRsSiiDinofyUiJgf\nEUsi4vr+3xRJGp66ulp/1tO/AeduV3Y1cH9mngA8AFwDEBEnAp8ApgMfA26MeOtWVd8CPpeZxwPH\nR8T27ylJ2gOrV8OkSc17/10GRWY+BLy0XfFFwM3V9M3AxdX0hcBtmbktM1cAS4GZETEROCgz51Xz\n3dKwjCRpLzz9NEyd2rz339MxigmZuR4gM9cBE6ryScCqhvnWVGWTgNUN5aurMknSXti6FX7+czjt\ntOZ9Rn8NZmc/vY8kaTd84xswYwaccELzPmNPhz/WR8Rhmbm+6lZ6ripfAxzZMN/kqmxn5Ts1e/bs\nt6Y7Ojro6OjYw1WVpKHpwQc7ue66Ti64ABqqzH4XmbtuDETEFODuzHxP9fxa4MXMvDYivgyMy8yr\nq8HsW4HTKV1L9wHTMjMj4mHgSmAe8F/ANzPz3p18XvZlvSRpOPvVr+CTn4Tly2GffSAiyMx+/627\nXbYoIuL7QAdwaEQ8A8wC/h74UUT8ObCScqYTmbkwIm4HFgJdwBcbavzLgZuA0cA9OwsJSVLf/OAH\n8NnPlpBopj61KAaaLQpJ2rXTT4evfhXOP788b1aLwqCQpEGouxsOOQRWrSp/oXlB4S08JGkQWrAA\njjiiNySayaCQpEHowQfhQx8amM8yKCRpEJozB04+eWA+yzEKSRqEDjwQHnkE3v3u3jIHsyVJAKxb\nB4cfXga0G0+NdTBbkgTA/ffDBRc0//qJHgaFJA0yc+fCe987cJ9nUEjSINLdDf/933DJJQP3mQaF\nJA0i994LY8fCSScN3GcaFJI0iMyZA5ddNrCf6VlPkjRIdHfD5MnwwAMwffrbX/esJ0ka5h56qNyy\n413vGtjPNSgkaZD4+tfhiisg+r3NUM+uJ0kaBJ58Es4+u/xI0ZgxO57HridJGsZuvBH+4i92HhLN\nZItCktrca6/B0UfD/PllMHtnbFFI0jD1z/8MH/1ofUg0ky0KSWpjW7eW1sT99//hnWJ3xBaFJA1D\n3/kOTJu265BophGt+2hJUp2uLvjmN+Ff/qW162GLQpLa1D/8Q+l2+shHWrsejlFIUhtatAjOOgse\nfhiOPbZvy/gLd5I0jJx7bnl86Ut9X6ZZQeEYhSS1mfvugyVL4O67W70mhWMUktRGtmwp93P6x3+E\nkSNbvTaFXU+S1Ea+8AV49lm4667dv/mfXU+SNMTNmVO6mxYtGvg7xNYxKCSpDcybB5/6FPzwh+Wn\nTtuJYxSS1GIrV8L555ersFt9zcSOGBSS1EKbNsFFF8FVV8HFF7d6bXbMwWxJapHXX4dLLik/b/qD\nH8A+e3no7k0BJWkIeeEF6OiA8ePh1lv3PiSaqY1XTZKGppdfhg98AN73PviP/4ARbX5akUEhSQPo\nxRfhzDPhQx+CG25or9Ngd8agkKQBsmJFCYlzzim3Dh8MIQF7GRQRsSIifhsRj0fEo1XZuIiYGxGL\nI2JORIxtmP+aiFgaEYsi4py9XXlJGiwefBDOOAM+/3m47rrBExKwl2c9RcRy4NTMfKmh7Frghcz8\nekR8GRiXmVdHxInArcBpwGTgfmDajk5v8qwnSUNFd3fpYvra1+Cmm+BP/qR5n9Wut/AI3t4quQj4\nYDV9M9AJXA1cCNyWmduAFRGxFJgJPLKX6yBJbWnDBrjsMnjttd37XYl2s7djFAncFxHzIuLzVdlh\nmbkeIDPXAROq8knAqoZl11RlkjTk/OhH5Xeup0+Hhx4avCEBe9+iODMz10bEO4G5EbGYEh6N9qgP\nafbs2W9Nd3R00NHRsafrKEkD5umn4a//Gh5/vITFBz+462X2VGdnJ52dnc37gEq/XZkdEbOAjcDn\ngY7MXB8RE4EHM3N6RFwNZGZeW81/LzArM9/W9eQYhaTBZtOmMkj9T/8El18OX/kKjB49sOvQdldm\nR8SYiDiwmj4AOAdYANwFfLaa7TPAT6rpu4BLI2JkRBwDHAc8uqefL0ntoKsLrr0Wpk6FJ56ARx6B\nv/3bgQ+JZtqbrqfDgB9HRFbvc2tmzo2IXwO3R8SfAyuBTwBk5sKIuB1YCHQBX7TZIGmw6u6GO+6A\nr34VjjsO7r8fTjqp1WvVHN4UUJJ2Q1cX3Hkn/M3fwAEHlL/nntse10W06+mxkjQsbN4MN98Mf/d3\nMGlSuS7i/PPbIyCazaCQpBq//W25UO622+A97yk38WvmmUztyHs9SdJ2tmyB//zPEggf/ziMGgVz\n55bHcAsJsEUhSW957jm48Ub413+F448v92W69FIYObLVa9ZaBoWkYS0T/ud/4HvfK4PUl1wC991X\nuplUeNaTpGEnE556Cn74wzL+sP/+pfXwmc/AYYe1eu32nGc9SdJe2LwZfvEL+PGP4ac/hTffLK2H\nO++EU09t9dq1N4NC0pCUWa6UnjOnDEI/+mi5Qd+FF8I995SL44bDqa39wa4nSUPGs8+WHwj62c/K\nldKjR8PHPgYf/Wg5W+nQQ1u9hs3VrK4ng0LSoPX66+UW3g88UFoNy5bBWWeVcDj7bDjhhFav4cAy\nKCQNe889Bz//eXn88pewcCH88R/Dhz9cWg1nnjm8T2U1KCQNK93dsGhRGYD+5S/LL8Rt2AAzZ5bW\nwplnlkHoMWNavabtw6CQNGS9+WYJhccfhwULysDzr38N48fD+99fQuGMM8ovxu27b6vXtn0ZFJKG\nhExYswYeewzmzSsthl//Gt75ztKNNGNGeZxxRilT3xkUkgadri6YPx9+8xt48snSYpg/v4wjnHRS\n6UY66yw4/fTSetDeMSgkta0334QlS8rg8sKF5Y6rCxbA8uVwzDHw3vfCySeXcDj1VJgwwWsYmsGg\nkNRSmfD88yUQnnqqPJYtg6efht//vlT+f/RH5ZTUk0/unXaweeAYFJKabtOmUuk//TQsXlzGEpYt\ng6VLYd26Ms9xx5UrnE86qfxO9LRpBkK7MCgk7bXXXy9XLy9fXsLg2WdhxYoSBL//Pbz0Ehx7LBx9\ndKn8jzqqPJ86tfyq2/jxdhm1M4NCUq1MeO21UuGvXl1CYNkyWLmyPJYtg1deKXdHPfbYMnYwaVJp\nIUyZUh6TJ8M+/pzZoGVQSMPYG2/A2rUlAJ5/Hp55pnQFrVxZWgErV5aWQURpBRxxRG8r4KijShhM\nnVpCYjhfuTzUGRTSELN1K2zcWAJgwwZYtaqEwIoV8OqrpfJ//vnyd+PGUuGPH1+O+idMKCEwZQqM\nG1f+TpoE73hHizdKLWVQSG1u27ZypL9xYznaf/HFcrS/dm0ZG1i9ugTAunUlGNavhwMPLBX9kUf2\nVv5HHFHucjpxYmkB9ISBVyRrVwwKaYBs2lQq9J7K/fXXS+X+0ktles2a8ve558oR/6uvltZAV1ep\n3MePh7FjS4V/wAElBA46qJRPmACHHNLbOhg9utVbq6HEoJD6ILN06fRU6G+80XsUv3lzGcztmX7x\nxVLRb95cQuC558r0pk3lCH7//UvFP25cqdwPP7yUHXZY6eI56KDSDTRmTAmDAw7wjCC1lj+FqiEt\ns3TFbN7c23e/bh1s2VIezz5bKvAtW0ql/sILZfq113rn27SphMPIkTBqVKnYx44tR+2TJpWKfP/9\ny/SYMeX5pEmlbP/9eyv9Aw6A/fZr9R6R2octCu2Wrq5SKW/dWh7PP18q657nL70EL7/c+7ynn37r\n1lKRr11bwmDLllLZv/xymd68uRy1jx1bKvqRI3sr8VGjSp/9uHFluqeyHz26PI44olTwo0aVo327\nczRc2fWkXXrzzVIhr11b/nZ1lQHWrq4yeLppU5nuqex7jsR7yl55pVTsPcu+9lpZruf1rq7y+v77\nlyPukSPLYOyECb3Px4wp3TU9lX1P982oUb0V+UEHlekDD+ydd9QoB2ulvWVQDCEbN5Y+9M2bS0X8\nyiulct68uVTyb7zR2/2yfn1vJb1589sr91dfLUfmXV0lKPbbrxx9H3wwjBhRno8YUZ6PG1ee9zze\n+c5SWfc8Hz26dNf0VPIjR5bno0b1znPIIZ6HL7Urg6INdXeXrpc1a8rR96pVJQCeeaY8X7++VOIb\nN/YOrG7eXCr1Qw4pFfDBB5cj8lGjSgXcc8ZMYz/76NHl+Y4q9555Ro4sR+QOpkrDl0HRAhs39t4H\np+e+OKtX914Vu2pVqeiPOqr3DJhDDikV//jxZbqn/3zixDLv6NHlud0skvqbQdEEPRdI9dwHZ/ny\nMt0TDBs3loudpk0rtz84/PByj5yJE8vpkEce6dkxktqHQbGHMsvR/5IlvQGwdGnvrZMPPrjcB+eo\no0oY9Nw2uee+OB75SxosDIpd2LCh91bJTz1VfmVr+fJSNnJkuWXyCSf0hsDUqeWe+uPGNWkjJGmA\nDfugyCzjA8uXw+9+V35UZcmS3q4jKJX/lCnw7neXH1U54YRyX31vlCZpOBgyQRER5wHXA/sA383M\na3cwT65dm/z0p/Dww70thP32Ky2C448vrYFjjy3jBNOmlZaB99GXNJwNiaCIiH2AJcBHgGeBecCl\nmfm77ebLsWOTs8+Gs84qrYMTTyyDyMNNZ2cnHR0drV6NtuC+6OW+6OW+6NWsoBjoY/CZwNLMXJmZ\nXcBtwEU7mnHZMrjjDrjySvjwh4dnSED5T6DCfdHLfdHLfdF8Ax0Uk4BVDc9XV2Vvc+ihA7I+kqRd\nsFdfklRroMco/hcwOzPPq55fDeT2A9oR0X6nYknSIDAUBrP3BRZTBrPXAo8Cn8zMRQO2EpKk3TKg\nP1yUmd0RcQUwl97TYw0JSWpjbXnBnSSpfbTVYHZEnBcRv4uIJRHx5VavT7NExIqI+G1EPB4Rj1Zl\n4yJibkQsjog5ETG2Yf5rImJpRCyKiHMayk+JiPnV/rq+FduyuyLiuxGxPiLmN5T127ZHxMiIuK1a\n5lcRcdTAbd3u2cm+mBURqyPisepxXsNrQ3JfRMTkiHggIp6KiAURcWVVPuy+FzvYF39Vlbf2e5GZ\nbfGghNbTwNHAfsATwLtavV5N2tblwLjtyq4F/l81/WXg76vpE4HHKd2EU6p91NMSfAQ4rZq+Bzi3\n1dvWh21/PzADmN+MbQf+Erixmv4z4LZWb/Nu7otZwJd2MO/0obovgInAjGr6QMo45ruG4/eiZl+0\n9HvRTi2KPl+MNwQEb2/NXQTcXE3fDFxcTV9I+YfclpkrgKXAzIiYCByUmfOq+W5pWKZtZeZDwEvb\nFffntje+1x2UEyfa0k72BZTvx/YuYojui8xcl5lPVNMbgUXAZIbh92In+6LnWrOWfS/aKSj6fDHe\nEJDAfRExLyI+X5UdlpnroXxZgAlV+fb7ZU1VNomyj3oM5v01oR+3/a1lMrMbeDkiBtttIa+IiCci\n4jsN3S3DYl9ExBRKK+th+vf/xGDeF49URS37XrRTUAwnZ2bmKcDHgcsj4gOU8Gg0nM8y6M9tH2w/\nDnsjMDUzZwDrgG/043u39b6IiAMpR7hXVUfTzfw/Mdj2RUu/F+0UFGuAxkGVyVXZkJOZa6u/G4A7\nKd1u6yPiMICq2fhcNfsa4MiGxXv2y87KB6P+3Pa3Xoty3c7Bmfli81a9f2Xmhqw6j4FvU74bMMT3\nRUSMoFSM/56ZP6mKh+X3Ykf7otXfi3YKinnAcRFxdESMBC4F7mrxOvW7iBhTHS0QEQcA5wALKNv6\n2Wq2zwA9/1nuAi6tzlQ4BjgOeLRqir8SETMjIoBPNyzT7oI/PIrpz22/q3oPgP8NPNC0regff7Av\nqgqxxyXAk9X0UN8X3wMWZuYNDWXD9Xvxtn3R8u9Fq0f5txvBP48yyr8UuLrV69OkbTyGckbX45SA\nuLoqfwdwf7X9c4FDGpa5hnI2wyLgnIbyU6v3WArc0Opt6+P2f59yi/ktwDPA/wHG9de2A6OA26vy\nh4Eprd7m3dwXtwDzq+/InZR++iG9L4Azge6G/xePVXVBv/2fGAL7oqXfCy+4kyTVaqeuJ0lSGzIo\nJEm1DApJUi2DQpJUy6CQJNUyKCRJtQwKSVItg0KSVOv/A8ciqp2TdQvLAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0x7fc3d5633750>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import glob\n", | |
"\n", | |
"# highest word count shall be the convnet rows\n", | |
"highest_review_word_count = 0\n", | |
"positive_reviews = []\n", | |
"negative_reviews = []\n", | |
"positive_testrev = []\n", | |
"negative_testrev = []\n", | |
"\n", | |
"# to visualise review length\n", | |
"training_review_length = []\n", | |
"\n", | |
"for txt in glob.glob(positive_review_txts):\n", | |
" with (open(txt, 'r')) as f:\n", | |
" word_array = raw_to_array(f.read())\n", | |
" highest_review_word_count = max(highest_review_word_count, len(word_array))\n", | |
" positive_reviews.append(word_array)\n", | |
" training_review_length.append(len(word_array))\n", | |
"\n", | |
"for txt in glob.glob(negative_review_txts):\n", | |
" with (open(txt, 'r')) as f:\n", | |
" word_array = raw_to_array(f.read())\n", | |
" highest_review_word_count = max(highest_review_word_count, len(word_array))\n", | |
" negative_reviews.append(raw_to_array(f.read()))\n", | |
" training_review_length.append(len(word_array))\n", | |
" \n", | |
"# For validation purposes\n", | |
"for txt in glob.glob(positive_review_vals):\n", | |
" with (open(txt, 'r')) as f:\n", | |
" word_array = raw_to_array(f.read())\n", | |
" highest_review_word_count = max(highest_review_word_count, len(word_array))\n", | |
" positive_testrev.append(word_array)\n", | |
"\n", | |
"for txt in glob.glob(negative_review_vals):\n", | |
" with (open(txt, 'r')) as f:\n", | |
" word_array = raw_to_array(f.read())\n", | |
" highest_review_word_count = max(highest_review_word_count, len(word_array))\n", | |
" negative_testrev.append(raw_to_array(f.read()))\n", | |
" \n", | |
"\n", | |
"print('highest word count: ', highest_review_word_count)\n", | |
"print(positive_reviews[0][:20])\n", | |
"\n", | |
"# Plotting length\n", | |
"training_review_length = np.sort(np.array(training_review_length))\n", | |
"print('mean of review length: ', np.mean(training_review_length))\n", | |
"plt.plot(training_review_length)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"nbpresent": { | |
"id": "1fb917cc-0afc-424f-8b88-400a90ad6819" | |
} | |
}, | |
"source": [ | |
"## Step 2: Assign vector to vocabs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "43426664-5c77-45e8-8e50-bd112537d092" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"('Vocab size: ', 75077)\n" | |
] | |
} | |
], | |
"source": [ | |
"from collections import defaultdict\n", | |
"\n", | |
"vocab = defaultdict(float)\n", | |
"for review in positive_reviews + negative_reviews + positive_testrev + negative_testrev:\n", | |
" for word in review:\n", | |
" vocab[word] += 1\n", | |
" \n", | |
"print(\"Vocab size: \", len(vocab))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "088e0cfc-b19d-480a-87e1-73539d08efcd" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(['vocab_size', 116046], ['layer1_size', 100])\n", | |
"('vocab hit: ', 58744)\n" | |
] | |
} | |
], | |
"source": [ | |
"import sys\n", | |
"word_vecs = {}\n", | |
"vocab_hit = 0\n", | |
"\n", | |
"# Code copies from yoonkim CNN's process_data.py\n", | |
"with open(word_vector_bin_file, 'rb') as f:\n", | |
" header = f.readline()\n", | |
" vocab_size, layer1_size = map(int, header.split())\n", | |
" print(['vocab_size', vocab_size], ['layer1_size', layer1_size])\n", | |
"# word_vector_dims = layer1_size\n", | |
" binary_len = np.dtype('float32').itemsize * layer1_size\n", | |
" for line in range(vocab_size):\n", | |
" word = []\n", | |
" while True:\n", | |
" ch = f.read(1)\n", | |
" if ch == ' ':\n", | |
" word = ''.join(word).lower()\n", | |
" break\n", | |
" if ch != '\\n':\n", | |
" word.append(ch)\n", | |
" if word in vocab:\n", | |
" word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')\n", | |
" vocab_hit += 1\n", | |
" else:\n", | |
" f.read(binary_len)\n", | |
"\n", | |
"print('vocab hit: ', vocab_hit)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "170beea1-a455-4375-b6fb-4c84778b2a4a" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[ 0.20345445 -0.3446292 -0.79000551 0.50081944 0.44940072 -0.06276506\n", | |
" -0.22317918 -0.25703317 -0.12849531 -0.31195965 -0.5679732 0.15336281\n", | |
" -0.27494311 0.20641154 -0.37620348 -0.18761899 -0.02728397 0.1838602\n", | |
" 0.47868598 -0.49992049 -0.15919879 0.36610898 -0.26038209 0.11431786\n", | |
" -0.00538804 0.33008578 -0.26670018 0.08674013 -0.55320936 -0.13692944\n", | |
" 0.21666591 0.1827794 0.26853284 0.17944637 0.63986886 -0.23207539\n", | |
" 0.13950627 -0.25690758 0.8048203 -0.22281611 -0.30845243 0.40280464\n", | |
" -0.09700917 -0.10624079 -0.63829452 0.06724913 0.58268034 0.71300983\n", | |
" 0.21445699 0.07173178 -0.10021135 0.052795 0.21011689 0.55990034\n", | |
" 0.3317692 -0.1204156 0.33604714 -0.1214296 0.06541372 0.11289533\n", | |
" -0.33093852 -0.36012915 0.07092347 -0.05363626 0.05315193 -0.40072322\n", | |
" -0.37435323 0.54634482 0.04995376 0.4135814 0.11987102 0.35045838\n", | |
" -0.22226219 -0.00525038 0.23082729 0.25800541 0.19155909 -0.05843235\n", | |
" -0.24793234 0.16106944 0.21992229 -0.36523741 -0.85971153 -0.15568981\n", | |
" -0.27584159 -0.13528582 0.07193217 -0.0341942 -0.03699406 -0.10003386\n", | |
" 0.16112548 -0.27273464 0.14367266 -0.05410241 0.15156566 -0.09456853\n", | |
" 0.33454007 0.14035556 0.5199551 0.19529557]\n" | |
] | |
} | |
], | |
"source": [ | |
"print(word_vecs['awesome'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "cdb3fecf-044e-49af-a897-5a213e02698c" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"('missing vocabs: ', [u'tsukino', u'natilie', u'nordisk', u'happeningb', u'hmoney', u'sangreal', u'bedknob', u'gabreil', u'remendados', u'umte', u'bocka', u'enchelada', u'spracht', u'rumbustious', u'familiarness', u'bendar', u'macmurphy', u'donger', u'bendan', u'vibrational'])\n" | |
] | |
} | |
], | |
"source": [ | |
"# Check if there are words missing from vector \n", | |
"# (may happen if we use other word vectors than the one we trained)\n", | |
"missing_vocabs = []\n", | |
"for word in vocab:\n", | |
" if word not in word_vecs and vocab[word] >= 1:\n", | |
" word_vecs[word] = np.random.uniform(-0.25, 0.25, word_vector_dims)\n", | |
" missing_vocabs.append(word)\n", | |
"\n", | |
"print('missing vocabs: ', missing_vocabs[:20])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "3817220e-8d9c-49d3-9d64-a48c03ea2430" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"total_reviews = len(positive_reviews) + len(negative_reviews)\n", | |
"max_test = 12500\n", | |
"\n", | |
"X_train = []\n", | |
"Y_train = []\n", | |
"X_test = []\n", | |
"Y_test = []\n", | |
"\n", | |
"# Due to memory constraint, we only take vectors for first 200 words in movie reviews\n", | |
"for i, review in enumerate(positive_reviews):\n", | |
"# word_vector_array = np.ndarray(shape=(max_sentence_length, word_vector_dims), dtype='float32')\n", | |
" word_vector_array = np.full((max_sentence_length, word_vector_dims), 0, dtype='float32')\n", | |
" for j, word in enumerate(review):\n", | |
" word_vector_array[j] = word_vecs[word][:word_vector_dims]\n", | |
" if j == max_sentence_length -1:\n", | |
" break\n", | |
" X_train.append(word_vector_array)\n", | |
" Y_train.append([0, 1])\n", | |
" \n", | |
"for i, review in enumerate(negative_reviews):\n", | |
" word_vector_array = np.full((max_sentence_length, word_vector_dims), 0, dtype='float32')\n", | |
" for j, word in enumerate(review):\n", | |
" word_vector_array[j] = word_vecs[word][:word_vector_dims]\n", | |
" if j == max_sentence_length - 1:\n", | |
" break\n", | |
" X_train.append(word_vector_array)\n", | |
" Y_train.append([1, 0])\n", | |
" \n", | |
"# For validation purposes\n", | |
"for i, review in enumerate(positive_testrev):\n", | |
" word_vector_array = np.full((max_sentence_length, word_vector_dims), 0, dtype='float32')\n", | |
" for j, word in enumerate(review):\n", | |
" word_vector_array[j] = word_vecs[word][:word_vector_dims]\n", | |
" if j == max_sentence_length -1:\n", | |
" break\n", | |
" X_test.append(word_vector_array)\n", | |
" Y_test.append([0, 1])\n", | |
" if i == max_test:\n", | |
" break\n", | |
" \n", | |
"for i, review in enumerate(negative_testrev):\n", | |
" word_vector_array = np.full((max_sentence_length, word_vector_dims), 0, dtype='float32')\n", | |
" for j, word in enumerate(review):\n", | |
" word_vector_array[j] = word_vecs[word][:word_vector_dims]\n", | |
" if j == max_sentence_length - 1:\n", | |
" break\n", | |
" X_test.append(word_vector_array)\n", | |
" Y_test.append([1, 0])\n", | |
" if i == max_test:\n", | |
" break\n", | |
"\n", | |
"# print(X_train[0][0][100])\n", | |
"# print(Y_train[0].shape)\n", | |
"\n", | |
"X_train = np.array(X_train)\n", | |
"Y_train = np.array(Y_train)\n", | |
"X_test = np.array(X_test)\n", | |
"Y_test = np.array(Y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "5941a2a4-8f01-4077-919e-2f1d724e3cff" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(25000, 200, 100)\n", | |
"(25000, 2)\n" | |
] | |
} | |
], | |
"source": [ | |
"print(X_train.shape)\n", | |
"print(Y_train.shape)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"nbpresent": { | |
"id": "70003047-3373-4d72-87f2-5708a31122c3" | |
} | |
}, | |
"source": [ | |
"## Step 3: Keras\n", | |
"\n", | |
"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "3ba15d56-a8a4-4d7a-8c7f-9465f81ac6a4" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Using Theano backend.\n" | |
] | |
} | |
], | |
"source": [ | |
"from keras.models import Model, Sequential\n", | |
"from keras.layers.convolutional import Convolution1D, Convolution2D\n", | |
"from keras.layers.pooling import MaxPooling1D, MaxPooling2D\n", | |
"from keras.layers import Merge, Dense, Dropout, Activation, Input, Flatten\n", | |
"from keras.optimizers import SGD" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "7bf80616-2457-4d9d-b3c9-dc7050365dfa" | |
}, | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Based on the paper, there are filters of various sizes\n", | |
"filters = 150\n", | |
"epochs = 5\n", | |
"\n", | |
"layer1_filter_sizes = [3,4,5]\n", | |
"layer1_convs = []\n", | |
"\n", | |
"graph_in = Input(shape=(max_sentence_length, word_vector_dims))\n", | |
"\n", | |
"for filter_size in layer1_filter_sizes:\n", | |
" conv = Convolution1D(filters,\n", | |
" filter_size,\n", | |
" border_mode = 'valid',\n", | |
" activation='relu',\n", | |
" subsample_length=1)(graph_in)\n", | |
" pool = MaxPooling1D(pool_length=2)(conv)\n", | |
" flatten = Flatten()(pool)\n", | |
" layer1_convs.append(flatten)\n", | |
"\n", | |
"# Merge the conv\n", | |
"merged = Merge(mode='concat')(layer1_convs)\n", | |
"graph = Model(input=graph_in, output=merged)\n", | |
"\n", | |
"final_model = Sequential()\n", | |
"final_model.add(graph)\n", | |
"# final_model.add(Dense(32))\n", | |
"# final_model.add(Activation('relu'))\n", | |
"# final_model.add(Dropout(0.25))\n", | |
"final_model.add(Dense(16))\n", | |
"final_model.add(Activation('relu'))\n", | |
"final_model.add(Dropout(0.5))\n", | |
"final_model.add(Dense(2))\n", | |
"final_model.add(Activation('softmax'))\n", | |
"\n", | |
"sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)\n", | |
"final_model.compile(loss='binary_crossentropy',\n", | |
" optimizer='rmsprop',\n", | |
" metrics=['accuracy'])\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"nbpresent": { | |
"id": "5a605fa8-fc27-44b6-b98f-51ab208c9930" | |
} | |
}, | |
"source": [ | |
"## Step 4: TRAIN THIS" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false, | |
"nbpresent": { | |
"id": "082122aa-9d01-40d0-bac7-8476a8e2098f" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Train on 25000 samples, validate on 25000 samples\n", | |
"Epoch 1/5\n", | |
"25000/25000 [==============================] - 22s - loss: 0.0756 - acc: 0.9823 - val_loss: 1.1921e-07 - val_acc: 1.0000\n", | |
"Epoch 2/5\n", | |
"25000/25000 [==============================] - 22s - loss: 0.0534 - acc: 0.9834 - val_loss: 1.1921e-07 - val_acc: 1.0000\n", | |
"Epoch 3/5\n", | |
"25000/25000 [==============================] - 22s - loss: 0.0528 - acc: 0.9827 - val_loss: 1.1921e-07 - val_acc: 1.0000\n", | |
"Epoch 4/5\n", | |
"25000/25000 [==============================] - 23s - loss: 0.0495 - acc: 0.9842 - val_loss: 1.1921e-07 - val_acc: 1.0000\n", | |
"Epoch 5/5\n", | |
"25000/25000 [==============================] - 22s - loss: 0.0453 - acc: 0.9861 - val_loss: 1.1921e-07 - val_acc: 1.0000\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"<keras.callbacks.History at 0x7fc2cad44e10>" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"final_model.fit(X_train, \n", | |
" Y_train, \n", | |
" batch_size=32,\n", | |
" nb_epoch=epochs,\n", | |
" validation_data=[X_test, Y_test])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**!!** Something is wrong. The validation accuracy is too high." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"final_model.save('jarvis.h5')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step 5: Custom Predict\n", | |
"\n", | |
"Will implement for small bit of text prediction later.\n", | |
"\n", | |
"Somehow the predictions are dead wrong here." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def test_predict(text):\n", | |
" word_array = raw_to_array(text)\n", | |
" word_vec_array = np.full(fill_value=0.0,\n", | |
" shape=(1, max_sentence_length, word_vector_dims),\n", | |
" dtype='float32')\n", | |
" for i, word in enumerate(word_array):\n", | |
" if word in word_vecs:\n", | |
" word_vec_array[0][i] = np.array(word_vecs[word][:word_vector_dims])\n", | |
" else:\n", | |
" word_vec_array[0][i] = np.random.uniform(-0.25, 0.25, word_vector_dims) \n", | |
" if i + 1 >= max_sentence_length:\n", | |
" break\n", | |
"# return final_model.predict(word_vec_array, batch_size=1)\n", | |
" prediction = final_model.predict(word_vec_array, batch_size=1)[0]\n", | |
" if prediction[1] > prediction[0]:\n", | |
" return ['movie review is positive', prediction]\n", | |
" else:\n", | |
" return ['movie review is negative', prediction]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive',\n", | |
" array([ 5.67092002e-06, 9.99994278e-01], dtype=float32)]" | |
] | |
}, | |
"execution_count": 63, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict(\"OMG this warcraft film adaptation was so awesome\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is negative', array([ 0.95606565, 0.04393429], dtype=float32)]" | |
] | |
}, | |
"execution_count": 64, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict(\"gosh this is just bad\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive',\n", | |
" array([ 1.83029895e-14, 1.00000000e+00], dtype=float32)]" | |
] | |
}, | |
"execution_count": 65, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict(\"train to busan was one of the most value for money movie one will ever pay for\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive',\n", | |
" array([ 4.78235685e-04, 9.99521732e-01], dtype=float32)]" | |
] | |
}, | |
"execution_count": 50, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict(\"Blackhat is not only disappointing, its embarrassing\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive', array([ 0., 1.], dtype=float32)]" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('''\n", | |
"Suffers from inconsistencies, both technical and story wise. They change the shooting styles, cameras, fps, warmth/cold - for no apparent reasons at all. Feels like it's not clear what this movie \"wants to be\". The main character is supposed to be a \"super-hacker\" but doesn't do anything \"super hack-y\", just wanders around, shooting people, and nails the female protagonist. Doesn't have many hacking-scenes for a \"hackers movie\", has tons of boring gun-scenes instead, from some reason. The motivation of the villain was, not interesting. References many other \"movie-cliches\" (not in a good way). Severely lacks humor. The few jokes in it are really cheesy (yeah, it's not a comedy , but comic reliefs are important). Many of the audience members left the theater in the middle or before the end\n", | |
"''')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive',\n", | |
" array([ 2.26467456e-10, 1.00000000e+00], dtype=float32)]" | |
] | |
}, | |
"execution_count": 52, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('explosive summer flick that will keep you on the couch for hours')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is negative', array([ 0.61056298, 0.38943696], dtype=float32)]" | |
] | |
}, | |
"execution_count": 53, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('why would anyone watch this?')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive', array([ 0.0224589 , 0.97754115], dtype=float32)]" | |
] | |
}, | |
"execution_count": 54, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('some people actually walked out from cinema')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive',\n", | |
" array([ 7.43133866e-09, 1.00000000e+00], dtype=float32)]" | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('this is definitely the best flick from christopher nolan yet!')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is negative', array([ 0.9827314 , 0.01726856], dtype=float32)]" | |
] | |
}, | |
"execution_count": 56, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('i dug my eyes out')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive',\n", | |
" array([ 1.04985540e-08, 1.00000000e+00], dtype=float32)]" | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('seriously this is the one you must watch this year')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is negative',\n", | |
" array([ 9.99516845e-01, 4.83212119e-04], dtype=float32)]" | |
] | |
}, | |
"execution_count": 62, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('touching love story indeed')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive',\n", | |
" array([ 3.33922645e-09, 1.00000000e+00], dtype=float32)]" | |
] | |
}, | |
"execution_count": 67, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('would love to lie on the grassfield and watch this with her again')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['movie review is positive',\n", | |
" array([ 2.70553046e-12, 1.00000000e+00], dtype=float32)]" | |
] | |
}, | |
"execution_count": 69, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"test_predict('I bet there are more productive things to do than watching this film')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [tensorflow]", | |
"language": "python", | |
"name": "Python [tensorflow]" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
}, | |
"nbpresent": { | |
"slides": { | |
"37e74d51-8e3e-45ef-a6e1-8a70764d8838": { | |
"id": "37e74d51-8e3e-45ef-a6e1-8a70764d8838", | |
"prev": "ba25d42f-178f-4bff-878f-237edd09c5f5", | |
"regions": { | |
"02d29844-ddc7-4301-9e0f-8cf2188ea9a7": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "cdb3fecf-044e-49af-a897-5a213e02698c", | |
"part": "whole" | |
}, | |
"id": "02d29844-ddc7-4301-9e0f-8cf2188ea9a7" | |
} | |
}, | |
"theme": null | |
}, | |
"3961229a-9df7-4080-8adb-51386f2bc3bd": { | |
"id": "3961229a-9df7-4080-8adb-51386f2bc3bd", | |
"prev": "53407f4b-429c-4df1-9d5d-c26d568e5a4f", | |
"regions": { | |
"62e52504-c547-4256-809d-50995400803a": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "789db32e-3bb2-49b4-8ecc-e8f9f0d67a7b", | |
"part": "whole" | |
}, | |
"id": "62e52504-c547-4256-809d-50995400803a" | |
} | |
} | |
}, | |
"53407f4b-429c-4df1-9d5d-c26d568e5a4f": { | |
"id": "53407f4b-429c-4df1-9d5d-c26d568e5a4f", | |
"prev": "af0c81d8-7d10-422d-a598-1cb99dee2e2c", | |
"regions": { | |
"3222cf87-ef46-4651-8cf2-80a78e9eca3b": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "bc104c55-9cb1-4d4c-a1bf-363277a69da8", | |
"part": "whole" | |
}, | |
"id": "3222cf87-ef46-4651-8cf2-80a78e9eca3b" | |
} | |
} | |
}, | |
"64728ebe-592c-4d5b-baa4-dc089e2704fb": { | |
"id": "64728ebe-592c-4d5b-baa4-dc089e2704fb", | |
"prev": "99bda4b6-a416-4ffc-9fa9-77a771dd27ec", | |
"regions": { | |
"8f248628-70ae-48f5-9bd1-eb1cfdf6e2f6": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "07039836-86fc-4567-b302-ef9265af8307", | |
"part": "whole" | |
}, | |
"id": "8f248628-70ae-48f5-9bd1-eb1cfdf6e2f6" | |
} | |
} | |
}, | |
"875c94af-0074-4c3e-8c45-e90c2ed94612": { | |
"id": "875c94af-0074-4c3e-8c45-e90c2ed94612", | |
"prev": "fb703c45-7b3e-47cb-a3cb-90072b339ac7", | |
"regions": { | |
"1dd617cb-85a2-4e96-8f9d-a180b24413b1": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "088e0cfc-b19d-480a-87e1-73539d08efcd", | |
"part": "whole" | |
}, | |
"id": "1dd617cb-85a2-4e96-8f9d-a180b24413b1" | |
} | |
} | |
}, | |
"8ce9e396-9e36-48d8-8dad-652e0874f01c": { | |
"id": "8ce9e396-9e36-48d8-8dad-652e0874f01c", | |
"prev": "eeb7b456-444b-44b5-a9ce-47c172546952", | |
"regions": { | |
"851f0ea5-8a8c-4a2b-9105-1d56ba2ef2d7": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "082122aa-9d01-40d0-bac7-8476a8e2098f", | |
"part": "whole" | |
}, | |
"id": "851f0ea5-8a8c-4a2b-9105-1d56ba2ef2d7" | |
} | |
} | |
}, | |
"905771fb-bf2d-42a7-aec2-5da33f44c325": { | |
"id": "905771fb-bf2d-42a7-aec2-5da33f44c325", | |
"prev": "8ce9e396-9e36-48d8-8dad-652e0874f01c", | |
"regions": { | |
"63519927-4a11-499b-b1a4-ac6dbd9e8bb0": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "3a8186e0-2504-4f64-9086-fe7ab528f2e3", | |
"part": "whole" | |
}, | |
"id": "63519927-4a11-499b-b1a4-ac6dbd9e8bb0" | |
} | |
} | |
}, | |
"91908d9f-ed0e-4588-a13b-308c73734071": { | |
"id": "91908d9f-ed0e-4588-a13b-308c73734071", | |
"prev": "905771fb-bf2d-42a7-aec2-5da33f44c325", | |
"regions": { | |
"52d4555d-d45e-4115-98e4-21d8340c2257": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "60e00169-ee44-4c0c-9c55-7121d4c8332f", | |
"part": "whole" | |
}, | |
"id": "52d4555d-d45e-4115-98e4-21d8340c2257" | |
} | |
} | |
}, | |
"99bda4b6-a416-4ffc-9fa9-77a771dd27ec": { | |
"id": "99bda4b6-a416-4ffc-9fa9-77a771dd27ec", | |
"prev": null, | |
"regions": { | |
"3e966305-d8cd-4860-8194-9e4460aebbfe": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.0833250996187788, | |
"y": 0.10830039485643014 | |
}, | |
"id": "3e966305-d8cd-4860-8194-9e4460aebbfe" | |
}, | |
"81e1e2bd-e729-4e37-8913-bf2162a507bd": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.03863636659710592, | |
"y": 0.09407114653112136 | |
}, | |
"content": { | |
"cell": "aa8439c7-cf04-49a6-84f3-2e8af08aca7f", | |
"part": "whole" | |
}, | |
"id": "81e1e2bd-e729-4e37-8913-bf2162a507bd" | |
} | |
} | |
}, | |
"9aeefc9a-ead7-4fff-b68f-12374636e8c3": { | |
"id": "9aeefc9a-ead7-4fff-b68f-12374636e8c3", | |
"prev": "dc88abf0-a5b8-43d9-b06f-0f6172443154", | |
"regions": { | |
"11975b9d-2c03-4d62-b3f8-273c4a929ac5": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "5941a2a4-8f01-4077-919e-2f1d724e3cff", | |
"part": "whole" | |
}, | |
"id": "11975b9d-2c03-4d62-b3f8-273c4a929ac5" | |
} | |
} | |
}, | |
"a4bcb0ed-33c6-45a4-a6b6-6fda4109f64f": { | |
"id": "a4bcb0ed-33c6-45a4-a6b6-6fda4109f64f", | |
"prev": "3961229a-9df7-4080-8adb-51386f2bc3bd", | |
"regions": { | |
"2cbebc16-1a38-4f7c-ad9a-ad220db57799": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "1fb917cc-0afc-424f-8b88-400a90ad6819", | |
"part": "whole" | |
}, | |
"id": "2cbebc16-1a38-4f7c-ad9a-ad220db57799" | |
} | |
} | |
}, | |
"af0c81d8-7d10-422d-a598-1cb99dee2e2c": { | |
"id": "af0c81d8-7d10-422d-a598-1cb99dee2e2c", | |
"prev": "64728ebe-592c-4d5b-baa4-dc089e2704fb", | |
"regions": { | |
"934404bd-cff4-4c2d-a5c2-093f7660a160": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "35ac46e5-404e-4d92-b9bd-073c346223a0", | |
"part": "whole" | |
}, | |
"id": "934404bd-cff4-4c2d-a5c2-093f7660a160" | |
} | |
} | |
}, | |
"ba25d42f-178f-4bff-878f-237edd09c5f5": { | |
"id": "ba25d42f-178f-4bff-878f-237edd09c5f5", | |
"prev": "875c94af-0074-4c3e-8c45-e90c2ed94612", | |
"regions": { | |
"39ec72be-7f2a-49bc-a862-d0ddaf0741fa": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "170beea1-a455-4375-b6fb-4c84778b2a4a", | |
"part": "whole" | |
}, | |
"id": "39ec72be-7f2a-49bc-a862-d0ddaf0741fa" | |
} | |
} | |
}, | |
"bd7a242b-b4ee-4781-abd6-45dc7d6604fb": { | |
"id": "bd7a242b-b4ee-4781-abd6-45dc7d6604fb", | |
"prev": "91908d9f-ed0e-4588-a13b-308c73734071", | |
"regions": { | |
"dd046f4e-2e3d-444a-9d20-5929545381d9": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "a1b9ac65-1c0d-436c-8c4c-6aec5c59e4e9", | |
"part": "whole" | |
}, | |
"id": "dd046f4e-2e3d-444a-9d20-5929545381d9" | |
} | |
} | |
}, | |
"c08bcf99-f04e-494d-bc58-e666d4ec92ba": { | |
"id": "c08bcf99-f04e-494d-bc58-e666d4ec92ba", | |
"prev": "d5f1666e-7e38-4606-8790-1cd4b70da45e", | |
"regions": { | |
"b561ca37-037f-454c-a483-740986aa86d9": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "3ba15d56-a8a4-4d7a-8c7f-9465f81ac6a4", | |
"part": "whole" | |
}, | |
"id": "b561ca37-037f-454c-a483-740986aa86d9" | |
} | |
} | |
}, | |
"cff711ff-48f0-4015-b6af-b53eef839e16": { | |
"id": "cff711ff-48f0-4015-b6af-b53eef839e16", | |
"prev": "bd7a242b-b4ee-4781-abd6-45dc7d6604fb", | |
"regions": { | |
"e90e7eda-38c0-49b8-b941-042573be21a2": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "041f0afc-f01c-42ae-8937-a81d76ccc74b", | |
"part": "whole" | |
}, | |
"id": "e90e7eda-38c0-49b8-b941-042573be21a2" | |
} | |
} | |
}, | |
"d393470b-f821-447b-b90a-8a873f5615c6": { | |
"id": "d393470b-f821-447b-b90a-8a873f5615c6", | |
"prev": "c08bcf99-f04e-494d-bc58-e666d4ec92ba", | |
"regions": { | |
"010c3eef-e5a8-4833-b61d-5d9497a35284": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "7bf80616-2457-4d9d-b3c9-dc7050365dfa", | |
"part": "whole" | |
}, | |
"id": "010c3eef-e5a8-4833-b61d-5d9497a35284" | |
} | |
} | |
}, | |
"d5f1666e-7e38-4606-8790-1cd4b70da45e": { | |
"id": "d5f1666e-7e38-4606-8790-1cd4b70da45e", | |
"prev": "9aeefc9a-ead7-4fff-b68f-12374636e8c3", | |
"regions": { | |
"5683aaa1-f7d4-49a9-a332-b8e6b72e355c": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "70003047-3373-4d72-87f2-5708a31122c3", | |
"part": "whole" | |
}, | |
"id": "5683aaa1-f7d4-49a9-a332-b8e6b72e355c" | |
} | |
} | |
}, | |
"dc88abf0-a5b8-43d9-b06f-0f6172443154": { | |
"id": "dc88abf0-a5b8-43d9-b06f-0f6172443154", | |
"prev": "37e74d51-8e3e-45ef-a6e1-8a70764d8838", | |
"regions": { | |
"e0e421c5-9d16-4e11-991f-3010779f786f": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "3817220e-8d9c-49d3-9d64-a48c03ea2430", | |
"part": "whole" | |
}, | |
"id": "e0e421c5-9d16-4e11-991f-3010779f786f" | |
} | |
} | |
}, | |
"eeb7b456-444b-44b5-a9ce-47c172546952": { | |
"id": "eeb7b456-444b-44b5-a9ce-47c172546952", | |
"prev": "d393470b-f821-447b-b90a-8a873f5615c6", | |
"regions": { | |
"c89b5a8c-d9fc-4756-b536-0340b41e8d41": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "5a605fa8-fc27-44b6-b98f-51ab208c9930", | |
"part": "whole" | |
}, | |
"id": "c89b5a8c-d9fc-4756-b536-0340b41e8d41" | |
} | |
} | |
}, | |
"fb703c45-7b3e-47cb-a3cb-90072b339ac7": { | |
"id": "fb703c45-7b3e-47cb-a3cb-90072b339ac7", | |
"prev": "a4bcb0ed-33c6-45a4-a6b6-6fda4109f64f", | |
"regions": { | |
"fe6d4f25-6a25-4a89-981d-509609414a42": { | |
"attrs": { | |
"height": 0.8, | |
"width": 0.8, | |
"x": 0.1, | |
"y": 0.1 | |
}, | |
"content": { | |
"cell": "43426664-5c77-45e8-8e50-bd112537d092", | |
"part": "whole" | |
}, | |
"id": "fe6d4f25-6a25-4a89-981d-509609414a42" | |
} | |
} | |
} | |
}, | |
"themes": {} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment