{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# PyCon Russia 2017\n",
"\n",
"Talk at [PyCon Russia](http://pycon.ru/2017/program/content/menshih/) by Ivan Menshikh and Lev Konstantinovskiy. Slides at SpeakerDeck (link TBA)."
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Not exercise code\n",
"import logging, gensim, bz2\n",
"import numpy as np\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Term-document matrix\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:49:30,604 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n",
"2017-07-15 22:49:30,606 : INFO : built Dictionary(10 unique tokens: ['пушистый', 'кот', 'и', 'бегал', 'котенок']...) from 4 documents (total 17 corpus positions)\n"
]
}
],
"source": [
"# pre-supplied code\n",
"texts = [u\"Пушистый котенок мурлыкал.\",\n",
"u\"Пушистый кот мурлыкал и мяукал.\",\n",
"u\"Пушистый котенок мяукал.\",\n",
"u\"Громкий пушистый пудель бегал и лаял.\"\n",
"]\n",
"\n",
"tokenized_texts = [list(gensim.utils.tokenize(t, to_lower=True)) for t in texts]\n",
"dictionary = gensim.corpora.Dictionary(tokenized_texts)\n",
"corpus = [dictionary.doc2bow(text) for text in tokenized_texts]"
]
},
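{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `Dictionary` assigns every token an integer id. A quick look at the mapping (a small sketch, not part of the original talk):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# token -> integer id mapping built from the four documents\n",
"dictionary.token2id"
]
},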
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[[(0, 1), (1, 1), (2, 1)],\n",
" [(0, 1), (2, 1), (3, 1), (4, 1), (5, 1)],\n",
" [(0, 1), (1, 1), (5, 1)],\n",
" [(0, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1)]]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# term-doc matrix\n",
"corpus"
]
},
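{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each row above is one document as a sparse bag-of-words vector: a list of `(token_id, count)` pairs. As a sanity check, a minimal sketch (not from the original talk) that decodes the ids back into tokens:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# map every (token_id, count) pair back to its readable token\n",
"[[(dictionary[token_id], count) for token_id, count in doc] for doc in corpus]"
]
},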
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LSI"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:49:30,647 : INFO : using serial LSI version on this node\n",
"2017-07-15 22:49:30,650 : INFO : updating model with new documents\n",
"2017-07-15 22:49:30,651 : INFO : using 100 extra samples and 3 power iterations\n",
"2017-07-15 22:49:30,654 : INFO : 1st phase: constructing (10, 102) action matrix\n",
"2017-07-15 22:49:30,658 : INFO : PROGRESS: at document #0\n",
"2017-07-15 22:49:30,679 : INFO : running power iteration #1\n",
"2017-07-15 22:49:30,681 : INFO : PROGRESS: at document #0/4\n",
"2017-07-15 22:49:30,695 : INFO : running power iteration #2\n",
"2017-07-15 22:49:30,697 : INFO : PROGRESS: at document #0/4\n",
"2017-07-15 22:49:30,699 : INFO : running power iteration #3\n",
"2017-07-15 22:49:30,700 : INFO : PROGRESS: at document #0/4\n",
"2017-07-15 22:49:30,703 : INFO : 2nd phase: constructing (10, 10) covariance matrix\n",
"2017-07-15 22:49:30,706 : INFO : PROGRESS: at document #0/4\n",
"2017-07-15 22:49:30,709 : INFO : running dense decomposition on (10, 10) covariance matrix\n",
"2017-07-15 22:49:30,714 : INFO : computing the final decomposition\n",
"2017-07-15 22:49:30,716 : INFO : keeping 2 factors (discarding 18.304% of energy spectrum)\n"
]
}
],
"source": [
"lsi = gensim.models.lsimodel.LsiModel(corpus=corpus, id2word=dictionary, num_topics=2, onepass=False, power_iters=3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There are actually three matrices involved here. The slides simplify the decomposition to two matrices, but the full SVD uses three; the matrix S holds the singular values.\n",
"\n",
"`X = U*S*V^T`"
]
},
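{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a cross-check (a sketch, not part of the original talk), the same decomposition can be computed exactly with numpy: build the dense term-document matrix `X` and compare numpy's singular values with `lsi.projection.s`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# dense term-document matrix X (terms x documents), then an exact SVD\n",
"X = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary))\n",
"U, s, Vt = np.linalg.svd(X, full_matrices=False)\n",
"s[:2]  # the two largest singular values, cf. lsi.projection.s"
]
},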
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.63650543, 0.09726715],\n",
" [ 0.2559478 , 0.34331361],\n",
" [ 0.32372211, 0.31212204],\n",
" [ 0.19574821, 0.14046524],\n",
" [ 0.38055763, -0.24604647],\n",
" [ 0.32372211, 0.31212204],\n",
" [ 0.18480941, -0.3865117 ],\n",
" [ 0.18480941, -0.3865117 ],\n",
" [ 0.18480941, -0.3865117 ],\n",
" [ 0.18480941, -0.3865117 ]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lsi.projection.u"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.3945107 , 0.35945313],\n",
" [ 0.60344153, 0.29413731],\n",
" [ 0.3945107 , 0.35945313],\n",
" [ 0.56972002, -0.80936403]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"V = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s\n",
"V"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1.21617532, 0.75270277],\n",
" [ 1.86025548, 0.61593002],\n",
" [ 1.21617532, 0.75270277],\n",
" [ 1.75630069, -1.69482613]])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# singular values multiplied by docs x dimensions\n",
"lsi.projection.s * V"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.84731544, 0.5696905 , 0.62863797, 0.34379272, 0.27762494,\n",
" 0.62863797, -0.06616778, -0.06616778, -0.06616778, -0.06616778],\n",
" [ 1.24397247, 0.68758546, 0.79445117, 0.45065844, 0.55638701,\n",
" 0.79445117, 0.10572856, 0.10572856, 0.10572856, 0.10572856],\n",
" [ 0.84731544, 0.5696905 , 0.62863797, 0.34379272, 0.27762494,\n",
" 0.62863797, -0.06616778, -0.06616778, -0.06616778, -0.06616778],\n",
" [ 0.95304402, -0.13233558, 0.03956078, 0.10572857, 1.0853796 ,\n",
" 0.03956078, 0.97965103, 0.97965103, 0.97965103, 0.97965103]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we almost get the corpus bag of words back :)\n",
"np.dot((lsi.projection.u * lsi.projection.s), V.T).T"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[[(0, 1), (1, 1), (2, 1)],\n",
" [(0, 1), (2, 1), (3, 1), (4, 1), (5, 1)],\n",
" [(0, 1), (1, 1), (5, 1)],\n",
" [(0, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1)]]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"corpus"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LDA"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:49:30,834 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n",
"2017-07-15 22:49:30,841 : INFO : built Dictionary(12 unique tokens: ['и', 'кусался', 'котенок', 'громкий', 'мяукал']...) from 4 documents (total 26 corpus positions)\n"
]
}
],
"source": [
"# NB: the last three strings below are separated only by whitespace, so Python's\n",
"# implicit string concatenation merges them into a single document - that is why\n",
"# the Dictionary log above reports 4 documents rather than 6\n",
"texts = [u\"Пушистый котенок мурлыкал.\",\n",
"u\"Пушистый кот мурлыкал и мяукал.\",\n",
"u\"Пушистый котенок мяукал.\",\n",
"u\"Громкий пушистый пудель бегал и лаял.\"\n",
"u\"Большой пудель лаял и кусался.\"\n",
"u\"Громкий большой пудель бегал.\"\n",
"]\n",
"\n",
"tokenized_texts = [list(gensim.utils.tokenize(t, to_lower=True)) for t in texts]\n",
"dictionary = gensim.corpora.Dictionary(tokenized_texts)\n",
"corpus = [dictionary.doc2bow(text) for text in tokenized_texts]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:49:30,865 : INFO : using symmetric alpha at 0.5\n",
"2017-07-15 22:49:30,867 : INFO : using symmetric eta at 0.08333333333333333\n",
"2017-07-15 22:49:30,871 : INFO : using serial LDA version on this node\n",
"2017-07-15 22:49:30,875 : INFO : running online LDA training, 2 topics, 5 passes over the supplied corpus of 4 documents, updating every 6000 documents, evaluating every ~4 documents, iterating 50x with a convergence threshold of 0.001000\n",
"2017-07-15 22:49:30,878 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n",
"2017-07-15 22:49:30,891 : INFO : training LDA model using 3 processes\n",
"2017-07-15 22:49:30,941 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,004 : INFO : topic #0 (0.500): 0.127*\"и\" + 0.118*\"пушистый\" + 0.108*\"пудель\" + 0.087*\"мяукал\" + 0.079*\"лаял\" + 0.077*\"большой\" + 0.077*\"бегал\" + 0.076*\"мурлыкал\" + 0.076*\"громкий\" + 0.062*\"кот\"\n",
"2017-07-15 22:49:31,006 : INFO : topic #1 (0.500): 0.148*\"пушистый\" + 0.103*\"котенок\" + 0.102*\"пудель\" + 0.083*\"громкий\" + 0.082*\"мурлыкал\" + 0.082*\"бегал\" + 0.081*\"большой\" + 0.079*\"и\" + 0.079*\"лаял\" + 0.070*\"мяукал\"\n",
"2017-07-15 22:49:31,009 : INFO : topic diff=0.344070, rho=1.000000\n",
"2017-07-15 22:49:31,029 : INFO : -3.118 per-word bound, 8.7 perplexity estimate based on a held-out corpus of 4 documents with 26 words\n",
"2017-07-15 22:49:31,032 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,113 : INFO : topic #0 (0.500): 0.131*\"и\" + 0.125*\"пудель\" + 0.104*\"пушистый\" + 0.090*\"лаял\" + 0.089*\"большой\" + 0.089*\"бегал\" + 0.089*\"громкий\" + 0.070*\"мяукал\" + 0.064*\"мурлыкал\" + 0.058*\"кот\"\n",
"2017-07-15 22:49:31,115 : INFO : topic #1 (0.500): 0.180*\"пушистый\" + 0.159*\"котенок\" + 0.106*\"мурлыкал\" + 0.094*\"мяукал\" + 0.070*\"пудель\" + 0.061*\"громкий\" + 0.061*\"бегал\" + 0.061*\"большой\" + 0.059*\"и\" + 0.059*\"лаял\"\n",
"2017-07-15 22:49:31,116 : INFO : topic diff=0.344751, rho=0.706753\n",
"2017-07-15 22:49:31,129 : INFO : -2.970 per-word bound, 7.8 perplexity estimate based on a held-out corpus of 4 documents with 26 words\n",
"2017-07-15 22:49:31,147 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,162 : INFO : topic #0 (0.500): 0.134*\"и\" + 0.133*\"пудель\" + 0.095*\"лаял\" + 0.095*\"пушистый\" + 0.095*\"большой\" + 0.095*\"бегал\" + 0.095*\"громкий\" + 0.060*\"мяукал\" + 0.058*\"кусался\" + 0.057*\"кот\"\n",
"2017-07-15 22:49:31,167 : INFO : topic #1 (0.500): 0.199*\"пушистый\" + 0.178*\"котенок\" + 0.123*\"мурлыкал\" + 0.114*\"мяукал\" + 0.053*\"пудель\" + 0.051*\"и\" + 0.049*\"громкий\" + 0.049*\"бегал\" + 0.049*\"большой\" + 0.048*\"лаял\"\n",
"2017-07-15 22:49:31,171 : INFO : topic diff=0.214595, rho=0.577158\n",
"2017-07-15 22:49:31,182 : INFO : -2.916 per-word bound, 7.5 perplexity estimate based on a held-out corpus of 4 documents with 26 words\n",
"2017-07-15 22:49:31,187 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,198 : INFO : PROGRESS: pass 4, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,207 : INFO : topic #0 (0.500): 0.140*\"пудель\" + 0.138*\"и\" + 0.100*\"лаял\" + 0.100*\"большой\" + 0.100*\"бегал\" + 0.100*\"громкий\" + 0.088*\"пушистый\" + 0.061*\"кусался\" + 0.054*\"кот\" + 0.050*\"мяукал\"\n",
"2017-07-15 22:49:31,217 : INFO : topic #1 (0.500): 0.208*\"пушистый\" + 0.175*\"котенок\" + 0.136*\"мурлыкал\" + 0.130*\"мяукал\" + 0.050*\"кот\" + 0.049*\"и\" + 0.045*\"пудель\" + 0.043*\"громкий\" + 0.042*\"бегал\" + 0.042*\"большой\"\n",
"2017-07-15 22:49:31,222 : INFO : topic diff=0.146445, rho=0.447124\n",
"2017-07-15 22:49:31,236 : INFO : -2.881 per-word bound, 7.4 perplexity estimate based on a held-out corpus of 4 documents with 26 words\n"
]
}
],
"source": [
"# pre-supplied code\n",
"lda = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=2, passes=5, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.140*\"пудель\" + 0.138*\"и\" + 0.100*\"лаял\" + 0.100*\"большой\" + 0.100*\"бегал\" + 0.100*\"громкий\" + 0.088*\"пушистый\" + 0.061*\"кусался\" + 0.054*\"кот\" + 0.050*\"мяукал\"'),\n",
" (1,\n",
" '0.208*\"пушистый\" + 0.175*\"котенок\" + 0.136*\"мурлыкал\" + 0.130*\"мяукал\" + 0.050*\"кот\" + 0.049*\"и\" + 0.045*\"пудель\" + 0.043*\"громкий\" + 0.042*\"бегал\" + 0.042*\"большой\"')]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lda.show_topics()"
]
},
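{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each document also gets its own mixture over the two topics. A small sketch (not part of the original talk) using `get_document_topics`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# per-document topic mixtures theta_d over the two topics\n",
"[lda.get_document_topics(bow) for bow in corpus]"
]
},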
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# This is one sample way to color words; as mentioned before, there are many ways to do it.\n",
"\n",
"def color_words(model, doc):\n",
"    import matplotlib.pyplot as plt\n",
"\n",
"    # make the document into a bag of words\n",
"    filtered_doc = model.id2word.doc2bow(doc)\n",
"    # get the per-word topic assignments\n",
"    doc_topics, word_topics, phi_values = model.get_document_topics(filtered_doc, per_word_topics=True)\n",
"    top_word_topic = dict([(word, topics[0]) for word, topics in word_topics])\n",
"\n",
"    # color-topic matching\n",
"    topic_colors = {1: 'red', 0: 'blue'}\n",
"\n",
"    # set up the figure to plot\n",
"    fig = plt.figure()\n",
"    ax = fig.add_axes([0, 0, 1, 1])\n",
"\n",
"    # a sort of hack to make sure the words are well spaced out\n",
"    word_pos = 1 / len(doc)\n",
"\n",
"    # use matplotlib to plot words\n",
"    for word in doc:\n",
"        color = 'black'\n",
"        if word in model.id2word.token2id:\n",
"            word_id = model.id2word.token2id[word]\n",
"            if word_id in top_word_topic:\n",
"                color = topic_colors[top_word_topic[word_id]]\n",
"\n",
"        ax.text(word_pos, 0.8, word,\n",
"                horizontalalignment='center',\n",
"                verticalalignment='center',\n",
"                fontsize=20, color=color,  # choose just the most likely topic\n",
"                transform=ax.transAxes)\n",
"        word_pos += 0.2  # to move the word for the next iter\n",
"\n",
"    ax.set_axis_off()\n",
"    plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"new_doc = \"\"\"У меня живут кот и пудель: старшему кот Афину 2 года 3 месяца, а пудель Мисти 8 месяцев. \n",
"Первым в наш дом попал Мисти тогда ему было не больше 2 месяцев. Потом попал кот когда ему было 6 месяцев.\n",
"Он много мурлыкал, был пушистый и добрый. В это время у нас уже жил пудель Мисти и он много на кот лаял и кусался.\n",
"\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"color_words(lda, list(gensim.utils.tokenize(new_doc, to_lower=True)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Doc2vec"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from gensim.models.doc2vec import TaggedDocument, Doc2Vec"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[TaggedDocument(words=['пушистый', 'котенок', 'мурлыкал'], tags=[0]),\n",
" TaggedDocument(words=['пушистый', 'кот', 'мурлыкал', 'и', 'мяукал'], tags=[1]),\n",
" TaggedDocument(words=['пушистый', 'котенок', 'мяукал'], tags=[2]),\n",
" TaggedDocument(words=['громкий', 'пушистый', 'пудель', 'бегал', 'и', 'лаял', 'большой', 'пудель', 'лаял', 'и', 'кусался', 'громкий', 'большой', 'пудель', 'бегал'], tags=[3])]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_texts)]\n",
"train_corpus"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:57:25,100 : WARNING : consider setting layer size to a multiple of 4 for greater performance\n",
"2017-07-15 22:57:25,103 : INFO : collecting all words and their counts\n",
"2017-07-15 22:57:25,104 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags\n",
"2017-07-15 22:57:25,105 : INFO : collected 12 word types and 4 unique tags from a corpus of 4 examples and 26 words\n",
"2017-07-15 22:57:25,106 : INFO : Loading a fresh vocabulary\n",
"2017-07-15 22:57:25,107 : INFO : min_count=1 retains 12 unique words (100% of original 12, drops 0)\n",
"2017-07-15 22:57:25,108 : INFO : min_count=1 leaves 26 word corpus (100% of original 26, drops 0)\n",
"2017-07-15 22:57:25,109 : INFO : deleting the raw counts dictionary of 12 items\n",
"2017-07-15 22:57:25,110 : INFO : sample=0.001 downsamples 12 most-common words\n",
"2017-07-15 22:57:25,111 : INFO : downsampling leaves estimated 3 word corpus (12.0% of prior 26)\n",
"2017-07-15 22:57:25,112 : INFO : estimated required memory for 12 words and 3 dimensions: 6336 bytes\n",
"2017-07-15 22:57:25,113 : INFO : resetting layer weights\n",
"2017-07-15 22:57:25,114 : INFO : training model with 3 workers on 12 vocabulary and 3 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n",
"2017-07-15 22:57:25,118 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
"2017-07-15 22:57:25,119 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2017-07-15 22:57:25,127 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2017-07-15 22:57:25,128 : INFO : training on 2600 raw words (713 effective words) took 0.0s, 65986 effective words/s\n",
"2017-07-15 22:57:25,129 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n",
"2017-07-15 22:57:25,130 : WARNING : supplied example count (400) did not equal expected count (300)\n"
]
},
{
"data": {
"text/plain": [
"713"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v = Doc2Vec(size=3, min_count=1)\n",
"d2v.build_vocab(train_corpus)\n",
"# total_examples should equal the number of tagged documents (4 here); the\n",
"# original call passed 3, which triggered the 'supplied example count' warning above\n",
"d2v.train(train_corpus, total_examples=len(train_corpus), epochs=100)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.15008539, 0.1174393 , -0.18301599], dtype=float32)"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v.docvecs[0]"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(2, 0.9999999403953552),\n",
" (1, 0.8357436656951904),\n",
" (0, 0.35743752121925354),\n",
" (3, 0.23696193099021912)]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v.docvecs.most_similar(positive=[d2v.docvecs[2]])"
]
},
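{
"cell_type": "markdown",
"metadata": {},
"source": [
"Unseen documents can be embedded into the same space with `infer_vector`. A sketch (not part of the original talk):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# infer a vector for a new document and find the closest training documents\n",
"new_vec = d2v.infer_vector(['пушистый', 'кот', 'мяукал'])\n",
"d2v.docvecs.most_similar(positive=[new_vec])"
]
},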
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:59:19,093 : INFO : precomputing L2-norms of word weight vectors\n"
]
},
{
"data": {
"text/plain": [
"[('мяукал', 0.9649645090103149),\n",
" ('большой', 0.8888713121414185),\n",
" ('пушистый', 0.8822481632232666),\n",
" ('кот', 0.6684380769729614),\n",
" ('и', 0.5814030170440674),\n",
" ('бегал', 0.046333640813827515),\n",
" ('лаял', -0.005524665117263794),\n",
" ('мурлыкал', -0.34848976135253906),\n",
" ('кусался', -0.46476083993911743),\n",
" ('пудель', -0.5001183748245239)]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v.most_similar('котенок')"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('кусался', 0.9931982755661011),\n",
" ('громкий', 0.8554766178131104),\n",
" ('и', 0.3850504159927368),\n",
" ('бегал', 0.32188186049461365),\n",
" ('мурлыкал', 0.17445510625839233),\n",
" ('лаял', -0.09391497075557709),\n",
" ('пушистый', -0.15622930228710175),\n",
" ('котенок', -0.5001183748245239),\n",
" ('мяукал', -0.6953105926513672),\n",
" ('большой', -0.8411158323287964)]"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v.most_similar('пудель')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TensorBoard visualization\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 23:53:17,181 : INFO : storing 4x3 projection weights into doc_tensor.w2v\n"
]
}
],
"source": [
"d2v.save_word2vec_format('doc_tensor.w2v', doctag_vec=True, word_vec=False)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-16 00:06:53,842 : INFO : storing 12x3 projection weights into word_tensor.w2v\n"
]
}
],
"source": [
"d2v.save_word2vec_format('word_tensor.w2v', doctag_vec=False, word_vec=True)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/lev/Dropbox/raretech/online_courses/datacamp_topicmod/courses-topic-modeling-with-gensim/notebooks\r\n"
]
}
],
"source": [
"# follow https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Tensorboard_visualizations.ipynb\n",
"!python ../../gensim/scripts/word2vec2tensor.py -i word_tensor.w2v -o cats"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See the [TensorBoard visualization](http://projector.tensorflow.org/?config=https://gist.githubusercontent.com/tmylk/ce68f87365d4df2a5c6da4c375016f71/raw/31302d6a1c5edc9a7594871777c9353e3a113ad6/tb_config.json)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "airpub",
"language": "python",
"name": "airpub"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
} |