{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "from random import shuffle, random\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import urllib.request\n",
    "import zipfile\n",
    "import lxml.etree\n",
    "\n",
    "# Download the dataset if it's not already there: this may take a minute as it is 75MB\n",
    "if not os.path.isfile('ted_en-20160408.zip'):\n",
    "    urllib.request.urlretrieve(\"https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip\", filename=\"ted_en-20160408.zip\")\n",
    "\n",
    "# For now, we're only interested in the subtitle text, so let's extract that from the XML:\n",
    "with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:\n",
    "    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))\n",
    "input_text = '\\n'.join(doc.xpath('//content/text()'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Each child of the XML root is one <file> element, i.e. one talk.\n",
    "talks = list(doc.getroot().iterchildren())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Maps a comma-separated keyword string to one of 8 class labels,\n",
    "# e.g. 'technology, design' -> 'ToD'.\n",
    "def get_class(keywords):\n",
    "    keywords = [s.strip().lower() for s in keywords.split(',')]\n",
    "    ted = [\n",
    "        ('technology', 'T'),\n",
    "        ('entertainment', 'E'),\n",
    "        ('design', 'D')\n",
    "    ]\n",
    "    return ''.join(y if x in keywords else 'o' for x, y in ted)"
   ]
  },
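  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `get_class` (illustrative, not run; the keyword strings are made up):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Hypothetical keyword strings -- the T/E/D flags are set independently:\n",
    "print(get_class('technology, design'))      # -> 'ToD'\n",
    "print(get_class('Entertainment'))           # -> 'oEo'\n",
    "print(get_class('science, global issues'))  # -> 'ooo'"
   ]
  },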
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# returns all the words in a string, treating non-alphanumeric characters as separators\n",
    "def get_tokens(s):\n",
    "    tokens = re.sub(r\"[^a-z0-9]+\", \" \", s.lower()).split()\n",
    "    return tokens"
   ]
  },
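  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For example (illustrative, not run) -- note that the apostrophe splits \"it's\" into two tokens:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "get_tokens(\"Hello, World! It's 2016.\")  # -> ['hello', 'world', 'it', 's', '2016']"
   ]
  },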
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# returns a map from word to frequency\n",
    "def get_frequencies(lst):\n",
    "    result = defaultdict(int)\n",
    "    for word in lst:\n",
    "        result[word] += 1\n",
    "    return result"
   ]
  },
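  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(`collections.Counter` would do the same job; a tiny illustration, not run:)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "get_frequencies(['a', 'b', 'a'])  # -> defaultdict(int, {'a': 2, 'b': 1})"
   ]
  },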
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Extract list of all sentences from TED talk corpus.\n",
    "## Maybe we should use Wiki instead?\n",
    "input_text = '\\n'.join(doc.xpath('//content/text()'))\n",
    "# Strip parenthesized annotations such as (Laughter) or (Applause)\n",
    "input_text_noparens = re.sub(r'\\([^)]*\\)', '', input_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Drop short speaker prefixes like 'Narrator:' (up to 20 chars before a colon),\n",
    "# then split what remains of each line into sentences on '.'\n",
    "sentences_strings_ted = []\n",
    "for line in input_text_noparens.split('\\n'):\n",
    "    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)\n",
    "    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sentences_ted = []\n",
    "for sent_str in sentences_strings_ted:\n",
    "    sentences_ted.append(get_tokens(sent_str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "SPECIAL_TOKEN = '*'  # a non-alphanumeric token, guaranteed not to occur in the dictionary\n",
    "\n",
    "## Collect all words so we can compute corpus-wide frequencies;\n",
    "## below, we randomly map a few low-frequency words to the special token.\n",
    "all_words = []\n",
    "for sentence in sentences_ted:\n",
    "    all_words.extend(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_freqs = get_frequencies(all_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# With probability p, replace each word occurring <= threshold times with\n",
    "# SPECIAL_TOKEN, so the model also learns a vector for rare/unknown words.\n",
    "threshold = 2\n",
    "p = 0.1\n",
    "# print('threshold:', threshold)\n",
    "# print('p:', p)\n",
    "cnt = 0\n",
    "for s in sentences_ted:\n",
    "    for i in range(len(s)):\n",
    "        if all_freqs[s[i]] <= threshold and random() < p:\n",
    "            cnt += 1\n",
    "            s[i] = SPECIAL_TOKEN\n",
    "# print('cnt:', cnt)\n",
    "# print('total:', len(all_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Get a Word2Vec model, for embedding our text strings.\n",
    "from gensim.models import Word2Vec\n",
    "# gensim < 4 API: the 'size' argument was later renamed 'vector_size'.\n",
    "model_ted = Word2Vec(sentences_ted, size=100, window=5, min_count=5, workers=4)"
   ]
  },
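  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(Optional, not run) With the model trained, we can sanity-check the embeddings with a nearest-neighbour query, using the old gensim (< 4) API assumed throughout this notebook:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# gensim < 4 API (gensim 4+ moved these methods onto model_ted.wv).\n",
    "# 'computer' is just an example query; assumes it survived min_count filtering.\n",
    "model_ted.most_similar('computer', topn=5)"
   ]
  },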
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Look up a word's embedding, falling back to the rare-word token for OOV words.\n",
    "# ('w in model_ted' and 'model_ted[w]' are gensim < 4 API; gensim 4+ uses model_ted.wv.)\n",
    "def embed_word(w):\n",
    "    if w not in model_ted:\n",
    "        w = SPECIAL_TOKEN\n",
    "    return model_ted[w]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "''' Simple bag-of-means embedding model\n",
    "\n",
    "Given a list of words W,\n",
    "x = 1/N * sum(x_w for w in W), where N = len(W)\n",
    "'''\n",
    "def bag_of_means(text):\n",
    "    W = get_tokens(text)\n",
    "    return sum(embed_word(w) for w in W) / len(W)  # assumes at least one token"
   ]
  },
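  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(Illustrative, not run) `bag_of_means` maps any string to a single vector with the same dimensionality as the word vectors:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "v = bag_of_means('machine learning is fun')\n",
    "v.shape  # -> (100,)"
   ]
  },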
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "LABELS = ['ooo', 'ooD', 'oEo', 'oED', 'Too', 'ToD', 'TEo', 'TED']\n",
    "''' processes a talk\n",
    "\n",
    "returns (<embedding> (1D np.array [dim=100]), <label> (int))\n",
    "'''\n",
    "def process_talk(talk):\n",
    "    text = talk.cssselect('content')[0].text\n",
    "    label = get_class(talk.cssselect('keywords')[0].text)\n",
    "    return bag_of_means(text), LABELS.index(label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "processed_talks = [process_talk(t) for t in talks]\n",
    "all_inputs, all_labels = zip(*processed_talks)\n",
    "all_inputs, all_labels = np.array(all_inputs), np.array(all_labels)\n",
    "\n",
    "NUM_TRAIN = 1585\n",
    "NUM_VALID = 250\n",
    "NUM_TEST = 250\n",
    "assert NUM_TRAIN + NUM_VALID + NUM_TEST == len(processed_talks)\n",
    "train_set = all_inputs[0:NUM_TRAIN], all_labels[0:NUM_TRAIN]\n",
    "valid_set = all_inputs[NUM_TRAIN:NUM_TRAIN+NUM_VALID], all_labels[NUM_TRAIN:NUM_TRAIN+NUM_VALID]\n",
    "test_set = all_inputs[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST], all_labels[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random  # NB: rebinds the name 'random' (the function imported in cell 1) to the module\n",
    "\n",
    "TRAINING_BATCH_SIZE = 50\n",
    "''' extracts training data from train_set\n",
    "\n",
    "returns a list of (input, label) pairs, one per batch of talks.\n",
    "input: 2D torch.Tensor [dimensions 50x100]\n",
    "label: 1D torch.LongTensor [dimension 50]\n",
    "'''\n",
    "def make_training_data():\n",
    "    train_inputs, train_labels = train_set\n",
    "    training_data = []\n",
    "    ar = list(range(NUM_TRAIN))\n",
    "    random.shuffle(ar)  # shuffle it each time to make new batches\n",
    "    for i in range(0, NUM_TRAIN, TRAINING_BATCH_SIZE):  # the last batch has only 1585 % 50 = 35 talks, but that's ok\n",
    "        section = ar[i:i+TRAINING_BATCH_SIZE]\n",
    "        inputs, labels = train_inputs[section], train_labels[section]\n",
    "        # inputs: np.array of shape (batch, 100); labels: np.array of ints in 0..7\n",
    "        # we do NOT want one-hot vectors for the labels -- CrossEntropyLoss takes class indices\n",
    "        inputs = torch.Tensor(np.array(inputs))\n",
    "        labels = torch.LongTensor(labels)\n",
    "        training_data.append((inputs, labels))\n",
    "    return training_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "from torch.autograd import Variable\n",
    "\n",
    "class Net(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(Net, self).__init__()\n",
    "        self.lin1 = nn.Linear(100, 32)  # model_ted was created as size=100\n",
    "        self.lin2 = nn.Linear(32, 16)\n",
    "        self.lin3 = nn.Linear(16, 8)  # there are 8 labels\n",
    "    def forward(self, x):\n",
    "        x = F.relu(self.lin1(x))\n",
    "        x = F.relu(self.lin2(x))\n",
    "        # NB: nn.CrossEntropyLoss (used below) expects raw logits and applies\n",
    "        # log-softmax itself; the extra softmax here squashes the outputs and\n",
    "        # is a likely reason training stalls (see the results further down).\n",
    "        x = F.softmax(self.lin3(x))\n",
    "        return x\n",
    "\n",
    "net = Net()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import torch.optim as optim\n",
    "optimizer = optim.Adam(net.parameters(), lr=0.01)\n",
    "criterion = nn.CrossEntropyLoss()  # loss = -log(p_y) (cross entropy criterion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "average loss: 1.759\n",
      "average loss: 1.660\n",
      "average loss: 1.662\n",
      "average loss: 1.659\n",
      "average loss: 1.661\n",
      "Finished Training\n"
     ]
    }
   ],
   "source": [
    "for epoch in range(5):  # loop over the dataset multiple times\n",
    "    running_loss = 0.0\n",
    "    training_data = make_training_data()\n",
    "    for data in training_data:\n",
    "        inputs, labels = data\n",
    "\n",
    "        # wrap them in Variables (pytorch 0.x API)\n",
    "        inputs, labels = Variable(inputs), Variable(labels)\n",
    "\n",
    "        # zero the parameter gradients\n",
    "        optimizer.zero_grad()\n",
    "\n",
    "        # forward + backward + optimize\n",
    "        outputs = net(inputs)\n",
    "        loss = criterion(outputs, labels)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        running_loss += loss.data[0]  # pytorch 0.x; use loss.item() in later versions\n",
    "    # print statistics\n",
    "    print('average loss: %.3f' % (1. * running_loss / len(training_data)))\n",
    "print('Finished Training')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Compare per-class counts: first printed line = true-label counts,\n",
    "# second = predicted-label counts.\n",
    "def inspect(inputs, labels):\n",
    "    inputs = Variable(torch.Tensor(inputs))\n",
    "    labels = torch.LongTensor(labels)\n",
    "    outputs = net(inputs)\n",
    "    _, predicted = torch.max(outputs.data, 1)\n",
    "    actual_counts = [(predicted == i).sum() for i in range(8)]\n",
    "    expected_counts = [(labels == i).sum() for i in range(8)]\n",
    "    print(expected_counts)\n",
    "    print(actual_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[972, 109, 99, 10, 281, 79, 19, 16]\n",
      "[1585, 0, 0, 0, 0, 0, 0, 0]\n"
     ]
    }
   ],
   "source": [
    "# Clearly our model is flawed -- it merely predicts 'ooo' (the majority class) for everything.\n",
    "# The extra softmax in Net.forward (see the note there) is a likely culprit.\n",
    "inspect(train_set[0], train_set[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def judge(inputs, labels):\n",
    "    inputs = Variable(torch.Tensor(inputs))\n",
    "    labels = torch.LongTensor(labels)\n",
    "    outputs = net(inputs)\n",
    "    _, predicted = torch.max(outputs.data, 1)  # prediction = arg max_y' (p_y')\n",
    "    total = predicted.size(0)\n",
    "    correct = (predicted == labels).sum()\n",
    "    print('Accuracy: %d/%d = %d %%' % (correct, total, 100. * correct / total))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 972/1585 = 61 %\n"
     ]
    }
   ],
   "source": [
    "judge(train_set[0], train_set[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 74/250 = 29 %\n"
     ]
    }
   ],
   "source": [
    "judge(test_set[0], test_set[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 88/250 = 35 %\n"
     ]
    }
   ],
   "source": [
    "judge(valid_set[0], valid_set[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[74, 26, 41, 6, 58, 26, 9, 10]\n",
      "[250, 0, 0, 0, 0, 0, 0, 0]\n"
     ]
    }
   ],
   "source": [
    "inspect(test_set[0], test_set[1])"
   ]
  },
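  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A sketch of one possible fix (not run here): drop the softmax from `forward` so the network emits raw logits, which is what `nn.CrossEntropyLoss` expects. The optimizer and training loop can stay as they are; whether this alone lifts validation/test accuracy is left to experiment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sketch only -- same architecture as Net above, but emitting raw logits.\n",
    "class NetLogits(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NetLogits, self).__init__()\n",
    "        self.lin1 = nn.Linear(100, 32)\n",
    "        self.lin2 = nn.Linear(32, 16)\n",
    "        self.lin3 = nn.Linear(16, 8)\n",
    "    def forward(self, x):\n",
    "        x = F.relu(self.lin1(x))\n",
    "        x = F.relu(self.lin2(x))\n",
    "        return self.lin3(x)  # raw logits; CrossEntropyLoss applies log-softmax itself"
   ]
  }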
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}