{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import os\n",
"from random import shuffle, random\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import re\n",
"import urllib.request\n",
"import zipfile\n",
"import lxml.etree\n",
"\n",
"# Download the dataset if it's not already there: this may take a minute as it is 75MB\n",
"if not os.path.isfile('ted_en-20160408.zip'):\n",
" urllib.request.urlretrieve(\"https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip\", filename=\"ted_en-20160408.zip\")\n",
"\n",
"# For now, we're only interested in the subtitle text, so let's extract that from the XML:\n",
"with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:\n",
" doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))\n",
"input_text = '\\n'.join(doc.xpath('//content/text()'))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"talks = list((doc.getroot().iterchildren()))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_class(keywords):\n",
" keywords = [s.strip().lower() for s in keywords.split(',')]\n",
" ted = [\n",
" ('technology', 'T'),\n",
" ('entertainment', 'E'),\n",
" ('design', 'D')\n",
" ]\n",
" return ''.join(y if x in keywords else 'o' for x, y in ted)"
]
},
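{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative check of the labelling scheme (added sketch, example keywords assumed):\n",
"# each talk is mapped to a 3-character code over {T, E, D, o}, one position per TED category.\n",
"assert get_class('technology, design, culture') == 'ToD'\n",
"assert get_class('business, global issues') == 'ooo'"
]
},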
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# returns all the words in a string, treating non-alphanumeric characters as separators\n",
"def get_tokens(s):\n",
" tokens = re.sub(r\"[^a-z0-9]+\", \" \", s.lower()).split()\n",
" return tokens"
]
},
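{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative example of the tokenizer (added sketch): case and punctuation are\n",
"# stripped, digits are kept, so a sentence becomes a flat list of lowercase tokens.\n",
"assert get_tokens(\"Hello, World! It's 2017.\") == ['hello', 'world', 'it', 's', '2017']"
]
},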
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# returns a map from word to frequency\n",
"def get_frequencies(lst):\n",
" result = defaultdict(int)\n",
" for word in lst:\n",
" result[word] += 1\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Extract list of all sentences from TED talk corpus.\n",
"## Maybe we should use Wiki instead?\n",
"input_text = '\\n'.join(doc.xpath('//content/text()'))\n",
"input_text_noparens = re.sub(r'\\([^)]*\\)', '', input_text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sentences_strings_ted = []\n",
"for line in input_text_noparens.split('\\n'):\n",
" m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)\n",
" sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)"
]
},
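{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative example (added sketch, example line assumed): the regex above strips an\n",
"# optional short (<= 20 character) speaker prefix before a colon; the remainder is then\n",
"# split on periods to get sentence strings.\n",
"demo_match = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', 'Narrator: It was raining. We went home.')\n",
"assert demo_match.groupdict()['postcolon'] == ' It was raining. We went home.'"
]
},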
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sentences_ted = []\n",
"for sent_str in sentences_strings_ted:\n",
" sentences_ted.append(get_tokens(sent_str))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"SPECIAL_TOKEN = '*' # a non-alphanumeric token, guaranteed to not occur in the dictionary\n",
"\n",
"## Randomly map a few low-frequency words to the special token\n",
"all_words = []\n",
"for sentence in sentences_ted:\n",
" all_words.extend(sentence)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"all_freqs = get_frequencies(all_words)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"threshold = 2\n",
"p = 0.1\n",
"# print ('threshold:', threshold)\n",
"# print ('p:', p)\n",
"cnt = 0\n",
"for s in sentences_ted:\n",
" for i in range(len(s)):\n",
" if all_freqs[s[i]] <= threshold and random() < p:\n",
" cnt += 1\n",
" s[i] = SPECIAL_TOKEN\n",
"# print ('cnt:', cnt)\n",
"# print ('total:', len(all_words))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## Get a Word2Vec model, for embedding our text strings.\n",
"from gensim.models import Word2Vec\n",
"model_ted = Word2Vec(sentences_ted, size=100, window=5, min_count=5, workers=4)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def embed_word(w):\n",
" if w not in model_ted:\n",
" w = SPECIAL_TOKEN\n",
" return model_ted[w]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"''' Simple bag of means embedding model\n",
"\n",
"Given a list of words W,\n",
"x = 1/N * sum(x_w for w in W), where N = len(W)\n",
"'''\n",
"def bag_of_means(text):\n",
" W = get_tokens(text)\n",
" return sum(embed_word(w) for w in W) / len(W)"
]
},
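{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative sanity check (added sketch; assumes the Word2Vec model above was trained\n",
"# with size=100): a bag-of-means embedding is a single 100-dimensional vector,\n",
"# regardless of how long the input text is.\n",
"assert bag_of_means('machine learning and design').shape == (100,)"
]
},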
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"LABELS = ['ooo', 'ooD', 'oEo', 'oED', 'Too', 'ToD', 'TEo', 'TED']\n",
"''' processes a talk\n",
"\n",
"returns (<embedding> (1D np.array [dim=100]), <label> (int))\n",
"'''\n",
"def process_talk(talk):\n",
" text = talk.cssselect('content')[0].text\n",
" label = get_class(talk.cssselect('keywords')[0].text)\n",
" return bag_of_means(text), LABELS.index(label)"
]
},
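{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative mapping from keywords to an integer class id (added sketch, example\n",
"# keywords assumed): a talk tagged 'technology' and 'entertainment' gets code 'TEo',\n",
"# which LABELS indexes as class 6.\n",
"assert LABELS.index(get_class('technology, entertainment')) == 6"
]
},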
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"processed_talks = [process_talk(t) for t in talks]\n",
"all_inputs, all_labels = zip(*processed_talks)\n",
"all_inputs, all_labels = np.array(all_inputs), np.array(all_labels)\n",
"\n",
"NUM_TRAIN = 1585\n",
"NUM_VALID = 250\n",
"NUM_TEST = 250\n",
"assert(NUM_TRAIN+NUM_VALID+NUM_TEST == len(processed_talks))\n",
"train_set = all_inputs[0:NUM_TRAIN], all_labels[0:NUM_TRAIN]\n",
"valid_set = all_inputs[NUM_TRAIN:NUM_TRAIN+NUM_VALID], all_labels[NUM_TRAIN:NUM_TRAIN+NUM_VALID]\n",
"test_set = all_inputs[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST], all_labels[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST]\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import random\n",
"\n",
"TRAINING_BATCH_SIZE = 50\n",
"''' extracts training data from train_set\n",
"\n",
"returns list of input, label pairs.\n",
"each input, label pair corresponds to a batch of 50 talks.\n",
"input: 2D torch.Tensor [dimensions 50x100]\n",
"label: 1D torch.Tensor [dimension 50]\n",
"'''\n",
"def make_training_data():\n",
" train_inputs, train_labels = train_set\n",
" training_data = []\n",
" ar = list(range(NUM_TRAIN))\n",
" random.shuffle(ar) # shuffle it each time to make new batches\n",
" for i in range(0, NUM_TRAIN, TRAINING_BATCH_SIZE): # this will leave out the last 35 talks, but that's ok\n",
" section = ar[i:i+50]\n",
" inputs, labels = train_inputs[section], train_labels[section]\n",
" # inputs: list (size=50) of np.arrays (of dim 100)\n",
" # labels: list (size=50) of integers (between 0...7)\n",
" \n",
" # we do NOT want one-hot vectors for the labels -- they should just be np.arrays\n",
" inputs = torch.Tensor(np.array(inputs))\n",
" labels = torch.LongTensor(labels)\n",
" training_data.append((inputs, labels))\n",
" return training_data\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"from torch.autograd import Variable\n",
"\n",
"class Net(nn.Module):\n",
" def __init__(self):\n",
" super(Net, self).__init__()\n",
" self.lin1 = nn.Linear(100, 32) # model_ted was created as size=100\n",
" self.lin2 = nn.Linear(32, 16)\n",
" self.lin3 = nn.Linear(16, 8) # there are 8 labels\n",
" def forward(self, x):\n",
" x = F.relu(self.lin1(x))\n",
" x = F.relu(self.lin2(x))\n",
" x = F.softmax(self.lin3(x))\n",
" return x\n",
"\n",
"net = Net()"
]
},
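{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Note on the loss pairing used below: nn.CrossEntropyLoss already applies log-softmax\n",
"# internally, so feeding it the softmax outputs of forward() above effectively applies\n",
"# softmax twice, which flattens gradients and can stall training.\n",
"# A minimal alternative sketch (not used in this run) would return raw logits instead:\n",
"#\n",
"# class LogitNet(nn.Module):\n",
"#     def __init__(self):\n",
"#         super(LogitNet, self).__init__()\n",
"#         self.lin1 = nn.Linear(100, 32)\n",
"#         self.lin2 = nn.Linear(32, 16)\n",
"#         self.lin3 = nn.Linear(16, 8)\n",
"#     def forward(self, x):\n",
"#         x = F.relu(self.lin1(x))\n",
"#         x = F.relu(self.lin2(x))\n",
"#         return self.lin3(x)  # raw logits; CrossEntropyLoss handles the softmax"
]
},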
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import torch.optim as optim\n",
"optimizer = optim.Adam(net.parameters(), lr = 0.01)\n",
"criterion = nn.CrossEntropyLoss() # loss = -log(p_y) (cross entropy criterion)"
]
},
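{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative shape check for one training batch (added sketch; assumes the cells\n",
"# above have run, so torch is imported and the embeddings exist): a full batch is a\n",
"# 50x100 float Tensor of inputs plus 50 integer labels.\n",
"example_inputs, example_labels = make_training_data()[0]\n",
"assert example_inputs.size() == torch.Size([50, 100])\n",
"assert example_labels.size() == torch.Size([50])"
]
},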
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"average loss: 1.759\n",
"average loss: 1.660\n",
"average loss: 1.662\n",
"average loss: 1.659\n",
"average loss: 1.661\n",
"Finished Training\n"
]
}
],
"source": [
"for epoch in range(5): # loop over the dataset multiple times\n",
" running_loss = 0.0\n",
" training_data = make_training_data()\n",
" for data in training_data:\n",
" inputs, labels = data\n",
"\n",
" # wrap them in a variable\n",
" inputs, labels = Variable(inputs), Variable(labels)\n",
" \n",
" # zero the parameter gradients\n",
" optimizer.zero_grad()\n",
" \n",
" # forward + backward + optimize\n",
" outputs = net(inputs)\n",
" loss = criterion(outputs, labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
" \n",
" running_loss += loss.data[0]\n",
" # print statistics\n",
" print('average loss: %.3f' % (1. * running_loss / len(training_data)))\n",
"print('Finished Training')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def inspect(inputs, labels):\n",
" inputs = Variable(torch.Tensor(inputs))\n",
" labels = torch.LongTensor(labels)\n",
" outputs = net(inputs)\n",
" _, predicted = torch.max(outputs.data, 1)\n",
" actual_counts = [(predicted == i).sum() for i in range(8)]\n",
" expected_counts = [(labels == i).sum() for i in range(8)]\n",
" print(expected_counts)\n",
" print(actual_counts)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[972, 109, 99, 10, 281, 79, 19, 16]\n",
"[1585, 0, 0, 0, 0, 0, 0, 0]\n"
]
}
],
"source": [
"# clearly our model is flawed -- it's merely predicting 'ooo' for everything.\n",
"inspect(train_set[0], train_set[1])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def judge(inputs, labels):\n",
" inputs = Variable(torch.Tensor(inputs))\n",
" labels = torch.LongTensor(labels)\n",
" outputs = net(inputs)\n",
" _, predicted = torch.max(outputs.data, 1) # prediction = arg max_y’ (p_y’)\n",
" total = predicted.size(0)\n",
" correct = (predicted == labels).sum()\n",
" print('Accuracy: %d/%d = %.2d %%' % (correct, total, 100.*correct/total))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 972/1585 = 61 %\n"
]
}
],
"source": [
"judge(train_set[0], train_set[1])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 74/250 = 29 %\n"
]
}
],
"source": [
"judge(test_set[0], test_set[1])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 88/250 = 35 %\n"
]
}
],
"source": [
"judge(valid_set[0], valid_set[1])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[74, 26, 41, 6, 58, 26, 9, 10]\n",
"[250, 0, 0, 0, 0, 0, 0, 0]\n"
]
}
],
"source": [
"inspect(test_set[0], test_set[1])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}