{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "from random import shuffle, random\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "import urllib.request\n",
    "import zipfile\n",
    "import lxml.etree\n",
    "\n",
    "# Download the dataset if it's not already there: this may take a minute as it is 75MB\n",
    "if not os.path.isfile('ted_en-20160408.zip'):\n",
    "    urllib.request.urlretrieve(\"https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip\", filename=\"ted_en-20160408.zip\")\n",
    "\n",
    "# For now, we're only interested in the subtitle text, so let's extract that from the XML:\n",
    "with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:\n",
    "    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))\n",
    "input_text = '\\n'.join(doc.xpath('//content/text()'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Each child of the XML root is one <file> element, i.e. one talk.\n",
    "talks = list(doc.getroot().iterchildren())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Maps a comma-separated keyword string to one of 8 class labels,\n",
    "# e.g. 'technology, design' -> 'ToD'.\n",
    "def get_class(keywords):\n",
    "    keywords = [s.strip().lower() for s in keywords.split(',')]\n",
    "    ted = [\n",
    "        ('technology', 'T'),\n",
    "        ('entertainment', 'E'),\n",
    "        ('design', 'D')\n",
    "    ]\n",
    "    return ''.join(y if x in keywords else 'o' for x, y in ted)"
   ]
  },
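  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `get_class` (illustrative, not run; the keyword strings are made up):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Hypothetical keyword strings -- the T/E/D flags are set independently:\n",
    "print(get_class('technology, design'))      # -> 'ToD'\n",
    "print(get_class('Entertainment'))           # -> 'oEo'\n",
    "print(get_class('science, global issues'))  # -> 'ooo'"
   ]
  },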
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# returns all the words in a string, treating non-alphanumeric characters as separators\n",
    "def get_tokens(s):\n",
    "    tokens = re.sub(r\"[^a-z0-9]+\", \" \", s.lower()).split()\n",
    "    return tokens"
   ]
  },
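  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For example (illustrative, not run) -- note that the apostrophe splits \"it's\" into two tokens:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "get_tokens(\"Hello, World! It's 2016.\")  # -> ['hello', 'world', 'it', 's', '2016']"
   ]
  },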
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# returns a map from word to frequency\n",
    "def get_frequencies(lst):\n",
    "    result = defaultdict(int)\n",
    "    for word in lst:\n",
    "        result[word] += 1\n",
    "    return result"
   ]
  },
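  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(`collections.Counter` would do the same job; a tiny illustration, not run:)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "get_frequencies(['a', 'b', 'a'])  # -> defaultdict(int, {'a': 2, 'b': 1})"
   ]
  },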
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Extract list of all sentences from TED talk corpus.\n",
    "## Maybe we should use Wiki instead?\n",
    "input_text = '\\n'.join(doc.xpath('//content/text()'))\n",
    "# Strip parenthesized annotations such as (Laughter) or (Applause)\n",
    "input_text_noparens = re.sub(r'\\([^)]*\\)', '', input_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Drop short speaker prefixes like 'Narrator:' (up to 20 chars before a colon),\n",
    "# then split what remains of each line into sentences on '.'\n",
    "sentences_strings_ted = []\n",
    "for line in input_text_noparens.split('\\n'):\n",
    "    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)\n",
    "    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sentences_ted = []\n",
    "for sent_str in sentences_strings_ted:\n",
    "    sentences_ted.append(get_tokens(sent_str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "SPECIAL_TOKEN = '*'  # a non-alphanumeric token, guaranteed not to occur in the dictionary\n",
    "\n",
    "## Collect all words so we can compute corpus-wide frequencies;\n",
    "## below, we randomly map a few low-frequency words to the special token.\n",
    "all_words = []\n",
    "for sentence in sentences_ted:\n",
    "    all_words.extend(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_freqs = get_frequencies(all_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# With probability p, replace each word occurring <= threshold times with\n",
    "# SPECIAL_TOKEN, so the model also learns a vector for rare/unknown words.\n",
    "threshold = 2\n",
    "p = 0.1\n",
    "# print('threshold:', threshold)\n",
    "# print('p:', p)\n",
    "cnt = 0\n",
    "for s in sentences_ted:\n",
    "    for i in range(len(s)):\n",
    "        if all_freqs[s[i]] <= threshold and random() < p:\n",
    "            cnt += 1\n",
    "            s[i] = SPECIAL_TOKEN\n",
    "# print('cnt:', cnt)\n",
    "# print('total:', len(all_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Get a Word2Vec model, for embedding our text strings.\n",
    "from gensim.models import Word2Vec\n",
    "# gensim < 4 API: the 'size' argument was later renamed 'vector_size'.\n",
    "model_ted = Word2Vec(sentences_ted, size=100, window=5, min_count=5, workers=4)"
   ]
  },
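  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(Optional, not run) With the model trained, we can sanity-check the embeddings with a nearest-neighbour query, using the old gensim (< 4) API assumed throughout this notebook:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# gensim < 4 API (gensim 4+ moved these methods onto model_ted.wv).\n",
    "# 'computer' is just an example query; assumes it survived min_count filtering.\n",
    "model_ted.most_similar('computer', topn=5)"
   ]
  },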
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Look up a word's embedding, falling back to the rare-word token for OOV words.\n",
    "# ('w in model_ted' and 'model_ted[w]' are gensim < 4 API; gensim 4+ uses model_ted.wv.)\n",
    "def embed_word(w):\n",
    "    if w not in model_ted:\n",
    "        w = SPECIAL_TOKEN\n",
    "    return model_ted[w]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "''' Simple bag-of-means embedding model\n",
    "\n",
    "Given a list of words W,\n",
    "x = 1/N * sum(x_w for w in W), where N = len(W)\n",
    "'''\n",
    "def bag_of_means(text):\n",
    "    W = get_tokens(text)\n",
    "    return sum(embed_word(w) for w in W) / len(W)  # assumes at least one token"
   ]
  },
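  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(Illustrative, not run) `bag_of_means` maps any string to a single vector with the same dimensionality as the word vectors:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "v = bag_of_means('machine learning is fun')\n",
    "v.shape  # -> (100,)"
   ]
  },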
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "LABELS = ['ooo', 'ooD', 'oEo', 'oED', 'Too', 'ToD', 'TEo', 'TED']\n",
    "''' processes a talk\n",
    "\n",
    "returns (<embedding> (1D np.array [dim=100]), <label> (int))\n",
    "'''\n",
    "def process_talk(talk):\n",
    "    text = talk.cssselect('content')[0].text\n",
    "    label = get_class(talk.cssselect('keywords')[0].text)\n",
    "    return bag_of_means(text), LABELS.index(label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "processed_talks = [process_talk(t) for t in talks]\n",
    "all_inputs, all_labels = zip(*processed_talks)\n",
    "all_inputs, all_labels = np.array(all_inputs), np.array(all_labels)\n",
    "\n",
    "NUM_TRAIN = 1585\n",
    "NUM_VALID = 250\n",
    "NUM_TEST = 250\n",
    "assert NUM_TRAIN + NUM_VALID + NUM_TEST == len(processed_talks)\n",
    "train_set = all_inputs[0:NUM_TRAIN], all_labels[0:NUM_TRAIN]\n",
    "valid_set = all_inputs[NUM_TRAIN:NUM_TRAIN+NUM_VALID], all_labels[NUM_TRAIN:NUM_TRAIN+NUM_VALID]\n",
    "test_set = all_inputs[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST], all_labels[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random  # NB: rebinds the name 'random' (the function imported in cell 1) to the module\n",
    "\n",
    "TRAINING_BATCH_SIZE = 50\n",
    "''' extracts training data from train_set\n",
    "\n",
    "returns a list of (input, label) pairs, one per batch of talks.\n",
    "input: 2D torch.Tensor [dimensions 50x100]\n",
    "label: 1D torch.LongTensor [dimension 50]\n",
    "'''\n",
    "def make_training_data():\n",
    "    train_inputs, train_labels = train_set\n",
    "    training_data = []\n",
    "    ar = list(range(NUM_TRAIN))\n",
    "    random.shuffle(ar)  # shuffle it each time to make new batches\n",
    "    for i in range(0, NUM_TRAIN, TRAINING_BATCH_SIZE):  # the last batch has only 1585 % 50 = 35 talks, but that's ok\n",
    "        section = ar[i:i+TRAINING_BATCH_SIZE]\n",
    "        inputs, labels = train_inputs[section], train_labels[section]\n",
    "        # inputs: np.array of shape (batch, 100); labels: np.array of ints in 0..7\n",
    "        # we do NOT want one-hot vectors for the labels -- CrossEntropyLoss takes class indices\n",
    "        inputs = torch.Tensor(np.array(inputs))\n",
    "        labels = torch.LongTensor(labels)\n",
    "        training_data.append((inputs, labels))\n",
    "    return training_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "from torch.autograd import Variable\n",
    "\n",
    "class Net(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(Net, self).__init__()\n",
    "        self.lin1 = nn.Linear(100, 32)  # model_ted was created as size=100\n",
    "        self.lin2 = nn.Linear(32, 16)\n",
    "        self.lin3 = nn.Linear(16, 8)  # there are 8 labels\n",
    "    def forward(self, x):\n",
    "        x = F.relu(self.lin1(x))\n",
    "        x = F.relu(self.lin2(x))\n",
    "        # NB: nn.CrossEntropyLoss (used below) expects raw logits and applies\n",
    "        # log-softmax itself; the extra softmax here squashes the outputs and\n",
    "        # is a likely reason training stalls (see the results further down).\n",
    "        x = F.softmax(self.lin3(x))\n",
    "        return x\n",
    "\n",
    "net = Net()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import torch.optim as optim\n",
    "optimizer = optim.Adam(net.parameters(), lr=0.01)\n",
    "criterion = nn.CrossEntropyLoss()  # loss = -log(p_y) (cross entropy criterion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "average loss: 1.759\n",
      "average loss: 1.660\n",
      "average loss: 1.662\n",
      "average loss: 1.659\n",
      "average loss: 1.661\n",
      "Finished Training\n"
     ]
    }
   ],
   "source": [
    "for epoch in range(5):  # loop over the dataset multiple times\n",
    "    running_loss = 0.0\n",
    "    training_data = make_training_data()\n",
    "    for data in training_data:\n",
    "        inputs, labels = data\n",
    "\n",
    "        # wrap them in Variables (pytorch 0.x API)\n",
    "        inputs, labels = Variable(inputs), Variable(labels)\n",
    "\n",
    "        # zero the parameter gradients\n",
    "        optimizer.zero_grad()\n",
    "\n",
    "        # forward + backward + optimize\n",
    "        outputs = net(inputs)\n",
    "        loss = criterion(outputs, labels)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        running_loss += loss.data[0]  # pytorch 0.x; use loss.item() in later versions\n",
    "    # print statistics\n",
    "    print('average loss: %.3f' % (1. * running_loss / len(training_data)))\n",
    "print('Finished Training')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Compare per-class counts: first printed line = true-label counts,\n",
    "# second = predicted-label counts.\n",
    "def inspect(inputs, labels):\n",
    "    inputs = Variable(torch.Tensor(inputs))\n",
    "    labels = torch.LongTensor(labels)\n",
    "    outputs = net(inputs)\n",
    "    _, predicted = torch.max(outputs.data, 1)\n",
    "    actual_counts = [(predicted == i).sum() for i in range(8)]\n",
    "    expected_counts = [(labels == i).sum() for i in range(8)]\n",
    "    print(expected_counts)\n",
    "    print(actual_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[972, 109, 99, 10, 281, 79, 19, 16]\n",
      "[1585, 0, 0, 0, 0, 0, 0, 0]\n"
     ]
    }
   ],
   "source": [
    "# Clearly our model is flawed -- it merely predicts 'ooo' (the majority class) for everything.\n",
    "# The extra softmax in Net.forward (see the note there) is a likely culprit.\n",
    "inspect(train_set[0], train_set[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def judge(inputs, labels):\n",
    "    inputs = Variable(torch.Tensor(inputs))\n",
    "    labels = torch.LongTensor(labels)\n",
    "    outputs = net(inputs)\n",
    "    _, predicted = torch.max(outputs.data, 1)  # prediction = arg max_y' (p_y')\n",
    "    total = predicted.size(0)\n",
    "    correct = (predicted == labels).sum()\n",
    "    print('Accuracy: %d/%d = %d %%' % (correct, total, 100. * correct / total))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 972/1585 = 61 %\n"
     ]
    }
   ],
   "source": [
    "judge(train_set[0], train_set[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 74/250 = 29 %\n"
     ]
    }
   ],
   "source": [
    "judge(test_set[0], test_set[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 88/250 = 35 %\n"
     ]
    }
   ],
   "source": [
    "judge(valid_set[0], valid_set[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[74, 26, 41, 6, 58, 26, 9, 10]\n",
      "[250, 0, 0, 0, 0, 0, 0, 0]\n"
     ]
    }
   ],
   "source": [
    "inspect(test_set[0], test_set[1])"
   ]
  },
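  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A sketch of one possible fix (not run here): drop the softmax from `forward` so the network emits raw logits, which is what `nn.CrossEntropyLoss` expects. The optimizer and training loop can stay as they are; whether this alone lifts validation/test accuracy is left to experiment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sketch only -- same architecture as Net above, but emitting raw logits.\n",
    "class NetLogits(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NetLogits, self).__init__()\n",
    "        self.lin1 = nn.Linear(100, 32)\n",
    "        self.lin2 = nn.Linear(32, 16)\n",
    "        self.lin3 = nn.Linear(16, 8)\n",
    "    def forward(self, x):\n",
    "        x = F.relu(self.lin1(x))\n",
    "        x = F.relu(self.lin2(x))\n",
    "        return self.lin3(x)  # raw logits; CrossEntropyLoss applies log-softmax itself"
   ]
  }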
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}