{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# PyCon Russia 2017\n",
"\n",
" Talk at [PyCon Russia](http://pycon.ru/2017/program/content/menshih/) by Ivan Menshih and Lev Konstantinovskiy. Slides at speakerdeck(link TBA)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Not exercise code\n",
"import logging, gensim, bz2\n",
"import numpy as np\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Term-document matrix\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:49:30,604 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n",
"2017-07-15 22:49:30,606 : INFO : built Dictionary(10 unique tokens: ['пушистый', 'кот', 'и', 'бегал', 'котенок']...) from 4 documents (total 17 corpus positions)\n"
]
}
],
"source": [
"# pre-supplied code\n",
"texts = [ u\"Пушистый котенок мурлыкал.\" ,\n",
"u\"Пушистый кот мурлыкал и мяукал.\",\n",
"u\"Пушистый котенок мяукал.\",\n",
"u\"Громкий пушистый пудель бегал и лаял.\"\n",
" ]\n",
"\n",
"tokenized_texts = [list(gensim.utils.tokenize(t, to_lower=True)) for t in texts]\n",
"dictionary = gensim.corpora.Dictionary(tokenized_texts)\n",
"corpus = [dictionary.doc2bow(text) for text in tokenized_texts]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[[(0, 1), (1, 1), (2, 1)],\n",
" [(0, 1), (2, 1), (3, 1), (4, 1), (5, 1)],\n",
" [(0, 1), (1, 1), (5, 1)],\n",
" [(0, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1)]]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# term-doc matrix\n",
"corpus"
]
},
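{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each document is encoded as a list of `(token_id, count)` pairs. As a quick check (a small sketch using the `dictionary` built above), the ids can be mapped back to tokens:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# the token -> id mapping behind the bag-of-words encoding\n",
"print(dictionary.token2id)\n",
"# decode the first document back into (token, count) pairs\n",
"print([(dictionary[token_id], count) for token_id, count in corpus[0]])"
]
},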
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LSI"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:49:30,647 : INFO : using serial LSI version on this node\n",
"2017-07-15 22:49:30,650 : INFO : updating model with new documents\n",
"2017-07-15 22:49:30,651 : INFO : using 100 extra samples and 3 power iterations\n",
"2017-07-15 22:49:30,654 : INFO : 1st phase: constructing (10, 102) action matrix\n",
"2017-07-15 22:49:30,658 : INFO : PROGRESS: at document #0\n",
"2017-07-15 22:49:30,679 : INFO : running power iteration #1\n",
"2017-07-15 22:49:30,681 : INFO : PROGRESS: at document #0/4\n",
"2017-07-15 22:49:30,695 : INFO : running power iteration #2\n",
"2017-07-15 22:49:30,697 : INFO : PROGRESS: at document #0/4\n",
"2017-07-15 22:49:30,699 : INFO : running power iteration #3\n",
"2017-07-15 22:49:30,700 : INFO : PROGRESS: at document #0/4\n",
"2017-07-15 22:49:30,703 : INFO : 2nd phase: constructing (10, 10) covariance matrix\n",
"2017-07-15 22:49:30,706 : INFO : PROGRESS: at document #0/4\n",
"2017-07-15 22:49:30,709 : INFO : running dense decomposition on (10, 10) covariance matrix\n",
"2017-07-15 22:49:30,714 : INFO : computing the final decomposition\n",
"2017-07-15 22:49:30,716 : INFO : keeping 2 factors (discarding 18.304% of energy spectrum)\n"
]
}
],
"source": [
"lsi = gensim.models.lsimodel.LsiModel(corpus=corpus, id2word=dictionary,num_topics=2, onepass=False, power_iters=3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It's actually 3 matrices involved here. The slides simplify it to two matrices but there are actually 3 matrices involved here. The matrix S contains the singular values.\n",
"\n",
"`X=U*S*V^T`"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.63650543, 0.09726715],\n",
" [ 0.2559478 , 0.34331361],\n",
" [ 0.32372211, 0.31212204],\n",
" [ 0.19574821, 0.14046524],\n",
" [ 0.38055763, -0.24604647],\n",
" [ 0.32372211, 0.31212204],\n",
" [ 0.18480941, -0.3865117 ],\n",
" [ 0.18480941, -0.3865117 ],\n",
" [ 0.18480941, -0.3865117 ],\n",
" [ 0.18480941, -0.3865117 ]])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lsi.projection.u"
]
},
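{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the singular values (the diagonal of `S`) are stored in `lsi.projection.s`, and `show_topics` prints each LSI dimension as a weighted combination of terms (a small sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# the singular values (the diagonal of S)\n",
"print(lsi.projection.s)\n",
"# each LSI dimension as a weighted combination of terms\n",
"lsi.show_topics(num_topics=2)"
]
},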
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.3945107 , 0.35945313],\n",
" [ 0.60344153, 0.29413731],\n",
" [ 0.3945107 , 0.35945313],\n",
" [ 0.56972002, -0.80936403]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"V = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s\n",
"V"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 1.21617532, 0.75270277],\n",
" [ 1.86025548, 0.61593002],\n",
" [ 1.21617532, 0.75270277],\n",
" [ 1.75630069, -1.69482613]])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# singular values multiplied by docs x dimensions \n",
"lsi.projection.s * V"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.84731544, 0.5696905 , 0.62863797, 0.34379272, 0.27762494,\n",
" 0.62863797, -0.06616778, -0.06616778, -0.06616778, -0.06616778],\n",
" [ 1.24397247, 0.68758546, 0.79445117, 0.45065844, 0.55638701,\n",
" 0.79445117, 0.10572856, 0.10572856, 0.10572856, 0.10572856],\n",
" [ 0.84731544, 0.5696905 , 0.62863797, 0.34379272, 0.27762494,\n",
" 0.62863797, -0.06616778, -0.06616778, -0.06616778, -0.06616778],\n",
" [ 0.95304402, -0.13233558, 0.03956078, 0.10572857, 1.0853796 ,\n",
" 0.03956078, 0.97965103, 0.97965103, 0.97965103, 0.97965103]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we almost get the corpus bag of words back :)\n",
"np.dot((lsi.projection.u * lsi.projection.s ),V.T).T"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[[(0, 1), (1, 1), (2, 1)],\n",
" [(0, 1), (2, 1), (3, 1), (4, 1), (5, 1)],\n",
" [(0, 1), (1, 1), (5, 1)],\n",
" [(0, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1)]]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"corpus"
]
},
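{
"cell_type": "markdown",
"metadata": {},
"source": [
"To quantify how close the rank-2 approximation is, one can compare it against the dense term-document matrix (a sketch; `corpus2dense` is the same helper used above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# dense docs x terms matrix from the bag-of-words corpus\n",
"bow_dense = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary)).T\n",
"# rank-2 approximation (U * S * V^T)^T, as computed above\n",
"approx = np.dot(lsi.projection.u * lsi.projection.s, V.T).T\n",
"# Frobenius norm of the reconstruction error\n",
"np.linalg.norm(bow_dense - approx)"
]
},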
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LDA"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:49:30,834 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n",
"2017-07-15 22:49:30,841 : INFO : built Dictionary(12 unique tokens: ['и', 'кусался', 'котенок', 'громкий', 'мяукал']...) from 4 documents (total 26 corpus positions)\n"
]
}
],
"source": [
"texts = [ u\"Пушистый котенок мурлыкал.\" ,\n",
"u\"Пушистый кот мурлыкал и мяукал.\",\n",
"u\"Пушистый котенок мяукал.\",\n",
"u\"Громкий пушистый пудель бегал и лаял.\"\n",
"u\"Большой пудель лаял и кусался.\"\n",
"u\"Громкий большой пудель бегал.\"\n",
" ]\n",
"\n",
"tokenized_texts = [list(gensim.utils.tokenize(t, to_lower=True)) for t in texts]\n",
"dictionary = gensim.corpora.Dictionary(tokenized_texts)\n",
"corpus = [dictionary.doc2bow(text) for text in tokenized_texts]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:49:30,865 : INFO : using symmetric alpha at 0.5\n",
"2017-07-15 22:49:30,867 : INFO : using symmetric eta at 0.08333333333333333\n",
"2017-07-15 22:49:30,871 : INFO : using serial LDA version on this node\n",
"2017-07-15 22:49:30,875 : INFO : running online LDA training, 2 topics, 5 passes over the supplied corpus of 4 documents, updating every 6000 documents, evaluating every ~4 documents, iterating 50x with a convergence threshold of 0.001000\n",
"2017-07-15 22:49:30,878 : WARNING : too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n",
"2017-07-15 22:49:30,891 : INFO : training LDA model using 3 processes\n",
"2017-07-15 22:49:30,941 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,004 : INFO : topic #0 (0.500): 0.127*\"и\" + 0.118*\"пушистый\" + 0.108*\"пудель\" + 0.087*\"мяукал\" + 0.079*\"лаял\" + 0.077*\"большой\" + 0.077*\"бегал\" + 0.076*\"мурлыкал\" + 0.076*\"громкий\" + 0.062*\"кот\"\n",
"2017-07-15 22:49:31,006 : INFO : topic #1 (0.500): 0.148*\"пушистый\" + 0.103*\"котенок\" + 0.102*\"пудель\" + 0.083*\"громкий\" + 0.082*\"мурлыкал\" + 0.082*\"бегал\" + 0.081*\"большой\" + 0.079*\"и\" + 0.079*\"лаял\" + 0.070*\"мяукал\"\n",
"2017-07-15 22:49:31,009 : INFO : topic diff=0.344070, rho=1.000000\n",
"2017-07-15 22:49:31,029 : INFO : -3.118 per-word bound, 8.7 perplexity estimate based on a held-out corpus of 4 documents with 26 words\n",
"2017-07-15 22:49:31,032 : INFO : PROGRESS: pass 1, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,113 : INFO : topic #0 (0.500): 0.131*\"и\" + 0.125*\"пудель\" + 0.104*\"пушистый\" + 0.090*\"лаял\" + 0.089*\"большой\" + 0.089*\"бегал\" + 0.089*\"громкий\" + 0.070*\"мяукал\" + 0.064*\"мурлыкал\" + 0.058*\"кот\"\n",
"2017-07-15 22:49:31,115 : INFO : topic #1 (0.500): 0.180*\"пушистый\" + 0.159*\"котенок\" + 0.106*\"мурлыкал\" + 0.094*\"мяукал\" + 0.070*\"пудель\" + 0.061*\"громкий\" + 0.061*\"бегал\" + 0.061*\"большой\" + 0.059*\"и\" + 0.059*\"лаял\"\n",
"2017-07-15 22:49:31,116 : INFO : topic diff=0.344751, rho=0.706753\n",
"2017-07-15 22:49:31,129 : INFO : -2.970 per-word bound, 7.8 perplexity estimate based on a held-out corpus of 4 documents with 26 words\n",
"2017-07-15 22:49:31,147 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,162 : INFO : topic #0 (0.500): 0.134*\"и\" + 0.133*\"пудель\" + 0.095*\"лаял\" + 0.095*\"пушистый\" + 0.095*\"большой\" + 0.095*\"бегал\" + 0.095*\"громкий\" + 0.060*\"мяукал\" + 0.058*\"кусался\" + 0.057*\"кот\"\n",
"2017-07-15 22:49:31,167 : INFO : topic #1 (0.500): 0.199*\"пушистый\" + 0.178*\"котенок\" + 0.123*\"мурлыкал\" + 0.114*\"мяукал\" + 0.053*\"пудель\" + 0.051*\"и\" + 0.049*\"громкий\" + 0.049*\"бегал\" + 0.049*\"большой\" + 0.048*\"лаял\"\n",
"2017-07-15 22:49:31,171 : INFO : topic diff=0.214595, rho=0.577158\n",
"2017-07-15 22:49:31,182 : INFO : -2.916 per-word bound, 7.5 perplexity estimate based on a held-out corpus of 4 documents with 26 words\n",
"2017-07-15 22:49:31,187 : INFO : PROGRESS: pass 3, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,198 : INFO : PROGRESS: pass 4, dispatched chunk #0 = documents up to #4/4, outstanding queue size 1\n",
"2017-07-15 22:49:31,207 : INFO : topic #0 (0.500): 0.140*\"пудель\" + 0.138*\"и\" + 0.100*\"лаял\" + 0.100*\"большой\" + 0.100*\"бегал\" + 0.100*\"громкий\" + 0.088*\"пушистый\" + 0.061*\"кусался\" + 0.054*\"кот\" + 0.050*\"мяукал\"\n",
"2017-07-15 22:49:31,217 : INFO : topic #1 (0.500): 0.208*\"пушистый\" + 0.175*\"котенок\" + 0.136*\"мурлыкал\" + 0.130*\"мяукал\" + 0.050*\"кот\" + 0.049*\"и\" + 0.045*\"пудель\" + 0.043*\"громкий\" + 0.042*\"бегал\" + 0.042*\"большой\"\n",
"2017-07-15 22:49:31,222 : INFO : topic diff=0.146445, rho=0.447124\n",
"2017-07-15 22:49:31,236 : INFO : -2.881 per-word bound, 7.4 perplexity estimate based on a held-out corpus of 4 documents with 26 words\n"
]
}
],
"source": [
"# pre-supplied code\n",
"lda = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=2, passes=5, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" '0.140*\"пудель\" + 0.138*\"и\" + 0.100*\"лаял\" + 0.100*\"большой\" + 0.100*\"бегал\" + 0.100*\"громкий\" + 0.088*\"пушистый\" + 0.061*\"кусался\" + 0.054*\"кот\" + 0.050*\"мяукал\"'),\n",
" (1,\n",
" '0.208*\"пушистый\" + 0.175*\"котенок\" + 0.136*\"мурлыкал\" + 0.130*\"мяукал\" + 0.050*\"кот\" + 0.049*\"и\" + 0.045*\"пудель\" + 0.043*\"громкий\" + 0.042*\"бегал\" + 0.042*\"большой\"')]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lda.show_topics()"
]
},
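{
"cell_type": "markdown",
"metadata": {},
"source": [
"Besides the per-topic word distributions, LDA assigns a topic mixture to every document; a small sketch over the training corpus:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# infer the topic mixture of each training document\n",
"for i, bow in enumerate(corpus):\n",
"    print(i, lda.get_document_topics(bow))"
]
},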
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# this is a sample method to color words. Like mentioned before, there are many ways to do this.\n",
"\n",
"def color_words(model, doc):\n",
" import matplotlib.pyplot as plt\n",
" import matplotlib.patches as patches\n",
" \n",
" # make into bag of words\n",
" filtered_doc = model.id2word.doc2bow(doc)\n",
" # get word_topics\n",
" doc_topics, word_topics, phi_values = model.get_document_topics(filtered_doc, per_word_topics=True)\n",
" top_word_topic = dict([(word, topics[0]) for word, topics in word_topics])\n",
"\n",
" # color-topic matching\n",
" topic_colors = { 1:'red', 0:'blue'}\n",
" \n",
" # set up fig to plot\n",
" fig = plt.figure()\n",
" ax = fig.add_axes([0,0,1,1])\n",
"\n",
" # a sort of hack to make sure the words are well spaced out.\n",
" word_pos = 1/len(doc)\n",
" \n",
" # use matplotlib to plot words\n",
" for word in doc:\n",
" color = 'black' \n",
" if word in model.id2word.token2id: \n",
" word_id = model.id2word.token2id[word]\n",
" if word_id in top_word_topic:\n",
" color=topic_colors[top_word_topic[word_id]]\n",
"\n",
" ax.text(word_pos, 0.8, word,\n",
" horizontalalignment='center',\n",
" verticalalignment='center',\n",
" fontsize=20, color=color, # choose just the most likely topic\n",
" transform=ax.transAxes)\n",
" word_pos += 0.2 # to move the word for the next iter\n",
"\n",
" ax.set_axis_off()\n",
" plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"new_doc = \"\"\"У меня живут кот и пудель: старшему кот Афину 2 года 3 месяца, а пудель Мисти 8 месяцев. \n",
"Первым в наш дом попал Мисти тогда ему было не больше 2 месяцев. Потом попал кот когда ему было 6 месяцев.\n",
"Он много мурлыкал, был пушистый и добрый. В это время у нас уже жил пудель Мисти и он много на кот лаял и кусался.\n",
"\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"color_words(lda, list(gensim.utils.tokenize(new_doc, to_lower=True)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Doc2vec"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from gensim.models.doc2vec import TaggedDocument, Doc2Vec"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[TaggedDocument(words=['пушистый', 'котенок', 'мурлыкал'], tags=[0]),\n",
" TaggedDocument(words=['пушистый', 'кот', 'мурлыкал', 'и', 'мяукал'], tags=[1]),\n",
" TaggedDocument(words=['пушистый', 'котенок', 'мяукал'], tags=[2]),\n",
" TaggedDocument(words=['громкий', 'пушистый', 'пудель', 'бегал', 'и', 'лаял', 'большой', 'пудель', 'лаял', 'и', 'кусался', 'громкий', 'большой', 'пудель', 'бегал'], tags=[3])]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_texts)] \n",
"train_corpus"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:57:25,100 : WARNING : consider setting layer size to a multiple of 4 for greater performance\n",
"2017-07-15 22:57:25,103 : INFO : collecting all words and their counts\n",
"2017-07-15 22:57:25,104 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags\n",
"2017-07-15 22:57:25,105 : INFO : collected 12 word types and 4 unique tags from a corpus of 4 examples and 26 words\n",
"2017-07-15 22:57:25,106 : INFO : Loading a fresh vocabulary\n",
"2017-07-15 22:57:25,107 : INFO : min_count=1 retains 12 unique words (100% of original 12, drops 0)\n",
"2017-07-15 22:57:25,108 : INFO : min_count=1 leaves 26 word corpus (100% of original 26, drops 0)\n",
"2017-07-15 22:57:25,109 : INFO : deleting the raw counts dictionary of 12 items\n",
"2017-07-15 22:57:25,110 : INFO : sample=0.001 downsamples 12 most-common words\n",
"2017-07-15 22:57:25,111 : INFO : downsampling leaves estimated 3 word corpus (12.0% of prior 26)\n",
"2017-07-15 22:57:25,112 : INFO : estimated required memory for 12 words and 3 dimensions: 6336 bytes\n",
"2017-07-15 22:57:25,113 : INFO : resetting layer weights\n",
"2017-07-15 22:57:25,114 : INFO : training model with 3 workers on 12 vocabulary and 3 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n",
"2017-07-15 22:57:25,118 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
"2017-07-15 22:57:25,119 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2017-07-15 22:57:25,127 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2017-07-15 22:57:25,128 : INFO : training on 2600 raw words (713 effective words) took 0.0s, 65986 effective words/s\n",
"2017-07-15 22:57:25,129 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n",
"2017-07-15 22:57:25,130 : WARNING : supplied example count (400) did not equal expected count (300)\n"
]
},
{
"data": {
"text/plain": [
"713"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v = Doc2Vec(size=3, min_count=1)\n",
"d2v.build_vocab(train_corpus)\n",
"d2v.train(train_corpus, total_examples=3, epochs=100)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.15008539, 0.1174393 , -0.18301599], dtype=float32)"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v.docvecs[0]"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(2, 0.9999999403953552),\n",
" (1, 0.8357436656951904),\n",
" (0, 0.35743752121925354),\n",
" (3, 0.23696193099021912)]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v.docvecs.most_similar(positive=[d2v.docvecs[2]])"
]
},
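{
"cell_type": "markdown",
"metadata": {},
"source": [
"A trained Doc2Vec model can also infer a vector for an unseen document via `infer_vector` (a small sketch; the sentence here is made up from the training vocabulary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"new_tokens = list(gensim.utils.tokenize(u'пушистый котенок мяукал', to_lower=True))\n",
"inferred = d2v.infer_vector(new_tokens)\n",
"# nearest training documents to the inferred vector\n",
"d2v.docvecs.most_similar(positive=[inferred])"
]
},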
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 22:59:19,093 : INFO : precomputing L2-norms of word weight vectors\n"
]
},
{
"data": {
"text/plain": [
"[('мяукал', 0.9649645090103149),\n",
" ('большой', 0.8888713121414185),\n",
" ('пушистый', 0.8822481632232666),\n",
" ('кот', 0.6684380769729614),\n",
" ('и', 0.5814030170440674),\n",
" ('бегал', 0.046333640813827515),\n",
" ('лаял', -0.005524665117263794),\n",
" ('мурлыкал', -0.34848976135253906),\n",
" ('кусался', -0.46476083993911743),\n",
" ('пудель', -0.5001183748245239)]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v.most_similar('котенок')"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[('кусался', 0.9931982755661011),\n",
" ('громкий', 0.8554766178131104),\n",
" ('и', 0.3850504159927368),\n",
" ('бегал', 0.32188186049461365),\n",
" ('мурлыкал', 0.17445510625839233),\n",
" ('лаял', -0.09391497075557709),\n",
" ('пушистый', -0.15622930228710175),\n",
" ('котенок', -0.5001183748245239),\n",
" ('мяукал', -0.6953105926513672),\n",
" ('большой', -0.8411158323287964)]"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d2v.most_similar('пудель')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tensorboard viz\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-15 23:53:17,181 : INFO : storing 4x3 projection weights into doc_tensor.w2v\n"
]
}
],
"source": [
"d2v.save_word2vec_format('doc_tensor.w2v', doctag_vec=True, word_vec=False)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-16 00:06:53,842 : INFO : storing 12x3 projection weights into word_tensor.w2v\n"
]
}
],
"source": [
"d2v.save_word2vec_format('word_tensor.w2v', doctag_vec=False, word_vec=True)"
]
},
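{
"cell_type": "markdown",
"metadata": {},
"source": [
"The saved file is a standard word2vec text format, so it can be loaded back with `KeyedVectors` (a sketch, e.g. to double-check the export):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from gensim.models import KeyedVectors\n",
"wv = KeyedVectors.load_word2vec_format('word_tensor.w2v')\n",
"wv.most_similar(u'котенок')"
]
},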
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/lev/Dropbox/raretech/online_courses/datacamp_topicmod/courses-topic-modeling-with-gensim/notebooks\r\n"
]
}
],
"source": [
"# follow https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Tensorboard_visualizations.ipynb\n",
"!python ../../gensim/scripts/word2vec2tensor.py -i word_tensor.w2v -o cats"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See [tensorboard viz](http://projector.tensorflow.org/?config=https://gist.githubusercontent.com/tmylk/ce68f87365d4df2a5c6da4c375016f71/raw/31302d6a1c5edc9a7594871777c9353e3a113ad6/tb_config.json)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "airpub",
"language": "python",
"name": "airpub"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}