{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: gensim in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.7.1)\n",
"Requirement already satisfied: smart-open>=1.7.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.8.0)\n",
"Requirement already satisfied: six>=1.5.0 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from gensim) (1.12.0)\n",
"Requirement already satisfied: numpy>=1.11.3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.16.0)\n",
"Requirement already satisfied: scipy>=0.18.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.2.0)\n",
"Requirement already satisfied: boto>=2.32 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.49.0)\n",
"Requirement already satisfied: bz2file in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (0.98)\n",
"Requirement already satisfied: requests in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.21.0)\n",
"Requirement already satisfied: boto3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (1.9.90)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2018.11.29)\n",
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (3.0.4)\n",
"Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2.8)\n",
"Requirement already satisfied: urllib3<1.25,>=1.21.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (1.24.1)\n",
"Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.9.3)\n",
"Requirement already satisfied: botocore<1.13.0,>=1.12.90 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (1.12.90)\n",
"Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.2.0)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (2.7.5)\n",
"Requirement already satisfied: docutils>=0.10 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (0.14)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.4)\n",
"Requirement already satisfied: six in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from nltk) (1.12.0)\n",
"Requirement already satisfied: singledispatch in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from nltk) (3.4.0.3)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (0.23.4)\n",
"Requirement already satisfied: pytz>=2011k in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2018.9)\n",
"Requirement already satisfied: python-dateutil>=2.5.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2.7.5)\n",
"Requirement already satisfied: numpy>=1.9.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (1.16.0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting stemmer\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"  Could not find a version that satisfies the requirement stemmer (from versions: )\n",
"No matching distribution found for stemmer\n",
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
}
],
"source": [
"import sys\n",
"!{sys.executable} -m pip install gensim\n",
"!{sys.executable} -m pip install nltk\n",
"!{sys.executable} -m pip install pandas\n",
| "!{sys.executable} -m pip install stemmer" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);\n", | |
| "data_text = data[['headline_text']]\n", | |
| "data_text['index'] = data_text.index\n", | |
| "documents = data_text" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "1103663\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>headline_text</th>\n", | |
| " <th>index</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>aba decides against community broadcasting lic...</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>act fire witnesses must be aware of defamation</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>a g calls for infrastructure protection summit</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>air nz staff in aust strike for pay rise</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>air nz strike to affect australian travellers</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " headline_text index\n", | |
| "0 aba decides against community broadcasting lic... 0\n", | |
| "1 act fire witnesses must be aware of defamation 1\n", | |
| "2 a g calls for infrastructure protection summit 2\n", | |
| "3 air nz staff in aust strike for pay rise 3\n", | |
| "4 air nz strike to affect australian travellers 4" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "print(len(documents))\n", | |
| "documents.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[nltk_data] Downloading package wordnet to\n", | |
| "[nltk_data] C:\\Users\\Daniil_Ekzarian\\AppData\\Roaming\\nltk_data...\n", | |
| "[nltk_data] Package wordnet is already up-to-date!\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import gensim\n", | |
| "from gensim.utils import simple_preprocess\n", | |
| "from gensim.parsing.preprocessing import STOPWORDS\n", | |
| "from nltk.stem import WordNetLemmatizer, SnowballStemmer\n", | |
| "from nltk.stem.porter import *\n", | |
| "import numpy as np\n", | |
| "np.random.seed(2018)\n", | |
| "import nltk\n", | |
| "nltk.download('wordnet')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def lemmatize_stemming(text):\n", | |
| " stemmer = nltk.stem.PorterStemmer()\n", | |
| " return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))\n", | |
| "def preprocess(text):\n", | |
| " result = []\n", | |
| " for token in gensim.utils.simple_preprocess(text):\n", | |
| " if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:\n", | |
| " result.append(lemmatize_stemming(token))\n", | |
| " return result" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "original document: \n", | |
| "['rain', 'helps', 'dampen', 'bushfires']\n", | |
| "\n", | |
| "\n", | |
| " tokenized and lemmatized document: \n", | |
| "['rain', 'help', 'dampen', 'bushfir']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "doc_sample = documents[documents['index'] == 4310].values[0][0]\n", | |
| "print('original document: ')\n", | |
| "words = []\n", | |
| "for word in doc_sample.split(' '):\n", | |
| " words.append(word)\n", | |
| "print(words)\n", | |
| "print('\\n\\n tokenized and lemmatized document: ')\n", | |
| "print(preprocess(doc_sample))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 [decid, commun, broadcast, licenc]\n", | |
| "1 [wit, awar, defam]\n", | |
| "2 [call, infrastructur, protect, summit]\n", | |
| "3 [staff, aust, strike, rise]\n", | |
| "4 [strike, affect, australian, travel]\n", | |
| "5 [ambiti, olsson, win, tripl, jump]\n", | |
| "6 [antic, delight, record, break, barca]\n", | |
| "7 [aussi, qualifi, stosur, wast, memphi, match]\n", | |
| "8 [aust, address, secur, council, iraq]\n", | |
| "9 [australia, lock, timet]\n", | |
| "Name: headline_text, dtype: object" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "processed_docs = documents['headline_text'].map(preprocess)\n", | |
| "processed_docs[:10]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dictionary = gensim.corpora.Dictionary(processed_docs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(76, 1), (112, 1), (483, 1), (4021, 1)]" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n", | |
| "bow_corpus[4310]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Word 76 (\"bushfir\") appears 1 time.\n", | |
| "Word 112 (\"help\") appears 1 time.\n", | |
| "Word 483 (\"rain\") appears 1 time.\n", | |
| "Word 4021 (\"dampen\") appears 1 time.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "bow_doc_4310 = bow_corpus[4310]\n", | |
| "for i in range(len(bow_doc_4310)):\n", | |
| " print(\"Word {} (\\\"{}\\\") appears {} time.\".format(bow_doc_4310[i][0], \n", | |
| " dictionary[bow_doc_4310[i][0]], \n", | |
| "bow_doc_4310[i][1]))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[(0, 0.5903603121911333),\n", | |
| " (1, 0.3852450692300274),\n", | |
| " (2, 0.4974556050119205),\n", | |
| " (3, 0.505567858418396)]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from gensim import corpora, models\n", | |
| "tfidf = models.TfidfModel(bow_corpus)\n", | |
| "corpus_tfidf = tfidf[bow_corpus]\n", | |
| "from pprint import pprint\n", | |
| "for doc in corpus_tfidf:\n", | |
| " pprint(doc)\n", | |
| " break" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Topic: 0 \n", | |
| "Words: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\" + 0.013*\"australia\" + 0.013*\"driver\" + 0.011*\"prison\" + 0.011*\"releas\" + 0.011*\"china\"\n", | |
| "Topic: 1 \n", | |
| "Words: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\" + 0.013*\"live\" + 0.013*\"interview\" + 0.012*\"council\" + 0.011*\"call\" + 0.010*\"rural\"\n", | |
| "Topic: 2 \n", | |
| "Words: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\" + 0.012*\"john\" + 0.012*\"announc\" + 0.010*\"unit\" + 0.009*\"beat\" + 0.009*\"celebr\"\n", | |
| "Topic: 3 \n", | |
| "Words: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\" + 0.011*\"vote\" + 0.010*\"game\" + 0.010*\"port\" + 0.009*\"refuge\" + 0.009*\"save\"\n", | |
| "Topic: 4 \n", | |
| "Words: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\" + 0.014*\"abus\" + 0.014*\"busi\" + 0.013*\"guilti\" + 0.012*\"farm\" + 0.011*\"find\"\n", | |
| "Topic: 5 \n", | |
| "Words: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\" + 0.018*\"die\" + 0.016*\"shoot\" + 0.015*\"rise\" + 0.015*\"woman\" + 0.014*\"coast\"\n", | |
| "Topic: 6 \n", | |
| "Words: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\" + 0.017*\"accus\" + 0.016*\"child\" + 0.015*\"alleg\" + 0.014*\"peopl\" + 0.014*\"trial\"\n", | |
| "Topic: 7 \n", | |
| "Words: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\" + 0.014*\"break\" + 0.014*\"life\" + 0.013*\"take\" + 0.013*\"student\" + 0.013*\"protest\"\n", | |
| "Topic: 8 \n", | |
| "Words: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\" + 0.017*\"elect\" + 0.014*\"help\" + 0.014*\"west\" + 0.011*\"victoria\" + 0.011*\"flood\"\n", | |
| "Topic: 9 \n", | |
| "Words: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\" + 0.012*\"say\" + 0.011*\"polit\" + 0.010*\"elect\" + 0.010*\"hobart\" + 0.010*\"violenc\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for idx, topic in lda_model.print_topics(-1):\n", | |
| " print('Topic: {} \\nWords: {}'.format(idx, topic))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Topic: 0 Word: 0.008*\"care\" + 0.007*\"island\" + 0.006*\"fish\" + 0.006*\"nurs\" + 0.006*\"product\" + 0.005*\"grain\" + 0.005*\"harvest\" + 0.005*\"age\" + 0.005*\"townsvil\" + 0.005*\"allegedli\"\n", | |
| "Topic: 1 Word: 0.017*\"polic\" + 0.017*\"charg\" + 0.014*\"murder\" + 0.012*\"court\" + 0.011*\"woman\" + 0.011*\"crash\" + 0.010*\"shoot\" + 0.010*\"kill\" + 0.010*\"death\" + 0.009*\"jail\"\n", | |
| "Topic: 2 Word: 0.015*\"interview\" + 0.009*\"australia\" + 0.008*\"leagu\" + 0.007*\"world\" + 0.007*\"final\" + 0.007*\"monday\" + 0.006*\"open\" + 0.006*\"thursday\" + 0.006*\"smith\" + 0.005*\"zealand\"\n", | |
| "Topic: 3 Word: 0.011*\"guilti\" + 0.009*\"abus\" + 0.009*\"child\" + 0.008*\"plead\" + 0.008*\"climat\" + 0.008*\"octob\" + 0.007*\"peter\" + 0.006*\"toni\" + 0.006*\"mount\" + 0.006*\"capit\"\n", | |
| "Topic: 4 Word: 0.015*\"market\" + 0.011*\"share\" + 0.010*\"price\" + 0.010*\"rise\" + 0.009*\"turnbul\" + 0.008*\"grandstand\" + 0.007*\"fall\" + 0.007*\"australian\" + 0.007*\"dollar\" + 0.006*\"michael\"\n", | |
| "Topic: 5 Word: 0.014*\"rural\" + 0.013*\"news\" + 0.010*\"coast\" + 0.009*\"weather\" + 0.007*\"gold\" + 0.007*\"search\" + 0.006*\"nation\" + 0.006*\"rugbi\" + 0.006*\"mental\" + 0.006*\"drum\"\n", | |
| "Topic: 6 Word: 0.009*\"abbott\" + 0.008*\"korea\" + 0.007*\"march\" + 0.006*\"polit\" + 0.006*\"syria\" + 0.006*\"islam\" + 0.006*\"say\" + 0.006*\"north\" + 0.006*\"russia\" + 0.006*\"protest\"\n", | |
| "Topic: 7 Word: 0.025*\"countri\" + 0.023*\"hour\" + 0.022*\"trump\" + 0.012*\"podcast\" + 0.011*\"donald\" + 0.008*\"commiss\" + 0.008*\"royal\" + 0.007*\"christma\" + 0.006*\"hunter\" + 0.005*\"rail\"\n", | |
| "Topic: 8 Word: 0.013*\"elect\" + 0.013*\"govern\" + 0.008*\"labor\" + 0.007*\"senat\" + 0.006*\"say\" + 0.006*\"liber\" + 0.006*\"marriag\" + 0.006*\"feder\" + 0.006*\"juli\" + 0.005*\"septemb\"\n", | |
| "Topic: 9 Word: 0.011*\"queensland\" + 0.009*\"west\" + 0.008*\"sport\" + 0.008*\"john\" + 0.007*\"david\" + 0.007*\"friday\" + 0.007*\"farm\" + 0.006*\"wednesday\" + 0.006*\"north\" + 0.006*\"decemb\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)\n", | |
| "for idx, topic in lda_model_tfidf.print_topics(-1):\n", | |
| " print('Topic: {} Word: {}'.format(idx, topic))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['rain', 'help', 'dampen', 'bushfir']" | |
| ] | |
| }, | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "processed_docs[4310]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "Score: 0.420369952917099\t \n", | |
| "Topic: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\" + 0.017*\"elect\" + 0.014*\"help\" + 0.014*\"west\" + 0.011*\"victoria\" + 0.011*\"flood\"\n", | |
| "\n", | |
| "Score: 0.22018268704414368\t \n", | |
| "Topic: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\" + 0.012*\"john\" + 0.012*\"announc\" + 0.010*\"unit\" + 0.009*\"beat\" + 0.009*\"celebr\"\n", | |
| "\n", | |
| "Score: 0.21931657195091248\t \n", | |
| "Topic: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\" + 0.011*\"vote\" + 0.010*\"game\" + 0.010*\"port\" + 0.009*\"refuge\" + 0.009*\"save\"\n", | |
| "\n", | |
| "Score: 0.020019859075546265\t \n", | |
| "Topic: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\" + 0.014*\"abus\" + 0.014*\"busi\" + 0.013*\"guilti\" + 0.012*\"farm\" + 0.011*\"find\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\" + 0.013*\"australia\" + 0.013*\"driver\" + 0.011*\"prison\" + 0.011*\"releas\" + 0.011*\"china\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\" + 0.013*\"live\" + 0.013*\"interview\" + 0.012*\"council\" + 0.011*\"call\" + 0.010*\"rural\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\" + 0.018*\"die\" + 0.016*\"shoot\" + 0.015*\"rise\" + 0.015*\"woman\" + 0.014*\"coast\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\" + 0.017*\"accus\" + 0.016*\"child\" + 0.015*\"alleg\" + 0.014*\"peopl\" + 0.014*\"trial\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\" + 0.014*\"break\" + 0.014*\"life\" + 0.013*\"take\" + 0.013*\"student\" + 0.013*\"protest\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\" + 0.012*\"say\" + 0.011*\"polit\" + 0.010*\"elect\" + 0.010*\"hobart\" + 0.010*\"violenc\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n", | |
| " print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model.print_topic(index, 10)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "Score: 0.8199620842933655\t \n", | |
| "Topic: 0.014*\"rural\" + 0.013*\"news\" + 0.010*\"coast\" + 0.009*\"weather\" + 0.007*\"gold\" + 0.007*\"search\" + 0.006*\"nation\" + 0.006*\"rugbi\" + 0.006*\"mental\" + 0.006*\"drum\"\n", | |
| "\n", | |
| "Score: 0.02000523917376995\t \n", | |
| "Topic: 0.011*\"queensland\" + 0.009*\"west\" + 0.008*\"sport\" + 0.008*\"john\" + 0.007*\"david\" + 0.007*\"friday\" + 0.007*\"farm\" + 0.006*\"wednesday\" + 0.006*\"north\" + 0.006*\"decemb\"\n", | |
| "\n", | |
| "Score: 0.020004820078611374\t \n", | |
| "Topic: 0.011*\"guilti\" + 0.009*\"abus\" + 0.009*\"child\" + 0.008*\"plead\" + 0.008*\"climat\" + 0.008*\"octob\" + 0.007*\"peter\" + 0.006*\"toni\" + 0.006*\"mount\" + 0.006*\"capit\"\n", | |
| "\n", | |
| "Score: 0.020004622638225555\t \n", | |
| "Topic: 0.008*\"care\" + 0.007*\"island\" + 0.006*\"fish\" + 0.006*\"nurs\" + 0.006*\"product\" + 0.005*\"grain\" + 0.005*\"harvest\" + 0.005*\"age\" + 0.005*\"townsvil\" + 0.005*\"allegedli\"\n", | |
| "\n", | |
| "Score: 0.02000461332499981\t \n", | |
| "Topic: 0.015*\"market\" + 0.011*\"share\" + 0.010*\"price\" + 0.010*\"rise\" + 0.009*\"turnbul\" + 0.008*\"grandstand\" + 0.007*\"fall\" + 0.007*\"australian\" + 0.007*\"dollar\" + 0.006*\"michael\"\n", | |
| "\n", | |
| "Score: 0.02000422589480877\t \n", | |
| "Topic: 0.025*\"countri\" + 0.023*\"hour\" + 0.022*\"trump\" + 0.012*\"podcast\" + 0.011*\"donald\" + 0.008*\"commiss\" + 0.008*\"royal\" + 0.007*\"christma\" + 0.006*\"hunter\" + 0.005*\"rail\"\n", | |
| "\n", | |
| "Score: 0.020003804937005043\t \n", | |
| "Topic: 0.013*\"elect\" + 0.013*\"govern\" + 0.008*\"labor\" + 0.007*\"senat\" + 0.006*\"say\" + 0.006*\"liber\" + 0.006*\"marriag\" + 0.006*\"feder\" + 0.006*\"juli\" + 0.005*\"septemb\"\n", | |
| "\n", | |
| "Score: 0.020003732293844223\t \n", | |
| "Topic: 0.017*\"polic\" + 0.017*\"charg\" + 0.014*\"murder\" + 0.012*\"court\" + 0.011*\"woman\" + 0.011*\"crash\" + 0.010*\"shoot\" + 0.010*\"kill\" + 0.010*\"death\" + 0.009*\"jail\"\n", | |
| "\n", | |
| "Score: 0.020003456622362137\t \n", | |
| "Topic: 0.015*\"interview\" + 0.009*\"australia\" + 0.008*\"leagu\" + 0.007*\"world\" + 0.007*\"final\" + 0.007*\"monday\" + 0.006*\"open\" + 0.006*\"thursday\" + 0.006*\"smith\" + 0.005*\"zealand\"\n", | |
| "\n", | |
| "Score: 0.020003436133265495\t \n", | |
| "Topic: 0.009*\"abbott\" + 0.008*\"korea\" + 0.007*\"march\" + 0.006*\"polit\" + 0.006*\"syria\" + 0.006*\"islam\" + 0.006*\"say\" + 0.006*\"north\" + 0.006*\"russia\" + 0.006*\"protest\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n", | |
| " print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model_tfidf.print_topic(index, 10)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Score: 0.34995949268341064\t Topic: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\"\n", | |
| "Score: 0.18341293931007385\t Topic: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\"\n", | |
| "Score: 0.18331679701805115\t Topic: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\"\n", | |
| "Score: 0.18324868381023407\t Topic: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\"\n", | |
| "Score: 0.01667937822639942\t Topic: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\"\n", | |
| "Score: 0.016678614541888237\t Topic: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\"\n", | |
| "Score: 0.016676034778356552\t Topic: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\"\n", | |
| "Score: 0.016676034778356552\t Topic: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\"\n", | |
| "Score: 0.016676034778356552\t Topic: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\"\n", | |
| "Score: 0.016676034778356552\t Topic: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "unseen_document = 'How a Pentagon deal became an identity crisis for Google'\n", | |
| "bow_vector = dictionary.doc2bow(preprocess(unseen_document))\n", | |
| "for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):\n", | |
| " print(\"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 5)))" | |
| ] | |
| }, | |
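{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional follow-up (not executed in this run): persist the trained artifacts with gensim's save/load API\n",
"# so they can be reused without retraining; the file names below are placeholders\n",
"dictionary.save('headlines.dict')\n",
"lda_model.save('headlines_lda.model')\n",
"lda_model_tfidf.save('headlines_lda_tfidf.model')\n",
"# lda_model = gensim.models.LdaMulticore.load('headlines_lda.model')"
]
},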
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}