Skip to content

Instantly share code, notes, and snippets.

@reflash
Created February 8, 2019 14:06
Show Gist options
  • Save reflash/5dfd99b65f919aad9dfc00946299b53f to your computer and use it in GitHub Desktop.
Save reflash/5dfd99b65f919aad9dfc00946299b53f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: gensim in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.7.1)\n",
"Requirement already satisfied: smart-open>=1.7.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.8.0)\n",
"Requirement already satisfied: six>=1.5.0 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from gensim) (1.12.0)\n",
"Requirement already satisfied: numpy>=1.11.3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.16.0)\n",
"Requirement already satisfied: scipy>=0.18.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.2.0)\n",
"Requirement already satisfied: boto>=2.32 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.49.0)\n",
"Requirement already satisfied: bz2file in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (0.98)\n",
"Requirement already satisfied: requests in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.21.0)\n",
"Requirement already satisfied: boto3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (1.9.90)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2018.11.29)\n",
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (3.0.4)\n",
"Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2.8)\n",
"Requirement already satisfied: urllib3<1.25,>=1.21.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (1.24.1)\n",
"Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.9.3)\n",
"Requirement already satisfied: botocore<1.13.0,>=1.12.90 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (1.12.90)\n",
"Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.2.0)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (2.7.5)\n",
"Requirement already satisfied: docutils>=0.10 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (0.14)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.4)\n",
"Requirement already satisfied: six in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from nltk) (1.12.0)\n",
"Requirement already satisfied: singledispatch in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from nltk) (3.4.0.3)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (0.23.4)\n",
"Requirement already satisfied: pytz>=2011k in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2018.9)\n",
"Requirement already satisfied: python-dateutil>=2.5.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2.7.5)\n",
"Requirement already satisfied: numpy>=1.9.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (1.16.0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting stemmer\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" Could not find a version that satisfies the requirement stemmer (from versions: )\n",
"No matching distribution found for stemmer\n",
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
}
],
"source": [
"import sys\n",
"!{sys.executable} -m pip install gensim\n",
"!{sys.executable} -m pip install nltk\n",
"!{sys.executable} -m pip install pandas\n",
"# NOTE: no 'stemmer' package exists on PyPI (install above fails); PorterStemmer is provided by nltk"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)\n",
"data_text = data[['headline_text']].copy()  # copy: avoid SettingWithCopyWarning on the column assignment below\n",
"data_text['index'] = data_text.index\n",
"documents = data_text"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1103663\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>headline_text</th>\n",
" <th>index</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>aba decides against community broadcasting lic...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>act fire witnesses must be aware of defamation</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>a g calls for infrastructure protection summit</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>air nz staff in aust strike for pay rise</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>air nz strike to affect australian travellers</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" headline_text index\n",
"0 aba decides against community broadcasting lic... 0\n",
"1 act fire witnesses must be aware of defamation 1\n",
"2 a g calls for infrastructure protection summit 2\n",
"3 air nz staff in aust strike for pay rise 3\n",
"4 air nz strike to affect australian travellers 4"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(len(documents))\n",
"documents.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] C:\\Users\\Daniil_Ekzarian\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import gensim\n",
"from gensim.utils import simple_preprocess\n",
"from gensim.parsing.preprocessing import STOPWORDS\n",
"from nltk.stem import WordNetLemmatizer, SnowballStemmer\n",
"from nltk.stem.porter import PorterStemmer\n",
"import numpy as np\n",
"np.random.seed(2018)\n",
"import nltk\n",
"nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"stemmer = nltk.stem.PorterStemmer()  # construct once; building a stemmer per token is wasteful\n",
"lemmatizer = WordNetLemmatizer()\n",
"def lemmatize_stemming(text):\n",
"    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))\n",
"def preprocess(text):\n",
" result = []\n",
" for token in gensim.utils.simple_preprocess(text):\n",
" if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:\n",
" result.append(lemmatize_stemming(token))\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"original document: \n",
"['rain', 'helps', 'dampen', 'bushfires']\n",
"\n",
"\n",
" tokenized and lemmatized document: \n",
"['rain', 'help', 'dampen', 'bushfir']\n"
]
}
],
"source": [
"doc_sample = documents[documents['index'] == 4310].values[0][0]\n",
"print('original document: ')\n",
"words = doc_sample.split(' ')\n",
"print(words)\n",
"print('\\n\\n tokenized and lemmatized document: ')\n",
"print(preprocess(doc_sample))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [decid, commun, broadcast, licenc]\n",
"1 [wit, awar, defam]\n",
"2 [call, infrastructur, protect, summit]\n",
"3 [staff, aust, strike, rise]\n",
"4 [strike, affect, australian, travel]\n",
"5 [ambiti, olsson, win, tripl, jump]\n",
"6 [antic, delight, record, break, barca]\n",
"7 [aussi, qualifi, stosur, wast, memphi, match]\n",
"8 [aust, address, secur, council, iraq]\n",
"9 [australia, lock, timet]\n",
"Name: headline_text, dtype: object"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processed_docs = documents['headline_text'].map(preprocess)\n",
"processed_docs[:10]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"dictionary = gensim.corpora.Dictionary(processed_docs)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(76, 1), (112, 1), (483, 1), (4021, 1)]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n",
"bow_corpus[4310]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Word 76 (\"bushfir\") appears 1 time.\n",
"Word 112 (\"help\") appears 1 time.\n",
"Word 483 (\"rain\") appears 1 time.\n",
"Word 4021 (\"dampen\") appears 1 time.\n"
]
}
],
"source": [
"bow_doc_4310 = bow_corpus[4310]\n",
"for token_id, count in bow_doc_4310:\n",
"    print(\"Word {} (\\\"{}\\\") appears {} time.\".format(token_id, dictionary[token_id], count))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.5903603121911333),\n",
" (1, 0.3852450692300274),\n",
" (2, 0.4974556050119205),\n",
" (3, 0.505567858418396)]\n"
]
}
],
"source": [
"from gensim import corpora, models\n",
"tfidf = models.TfidfModel(bow_corpus)\n",
"corpus_tfidf = tfidf[bow_corpus]\n",
"from pprint import pprint\n",
"for doc in corpus_tfidf:\n",
" pprint(doc)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Topic: 0 \n",
"Words: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\" + 0.013*\"australia\" + 0.013*\"driver\" + 0.011*\"prison\" + 0.011*\"releas\" + 0.011*\"china\"\n",
"Topic: 1 \n",
"Words: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\" + 0.013*\"live\" + 0.013*\"interview\" + 0.012*\"council\" + 0.011*\"call\" + 0.010*\"rural\"\n",
"Topic: 2 \n",
"Words: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\" + 0.012*\"john\" + 0.012*\"announc\" + 0.010*\"unit\" + 0.009*\"beat\" + 0.009*\"celebr\"\n",
"Topic: 3 \n",
"Words: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\" + 0.011*\"vote\" + 0.010*\"game\" + 0.010*\"port\" + 0.009*\"refuge\" + 0.009*\"save\"\n",
"Topic: 4 \n",
"Words: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\" + 0.014*\"abus\" + 0.014*\"busi\" + 0.013*\"guilti\" + 0.012*\"farm\" + 0.011*\"find\"\n",
"Topic: 5 \n",
"Words: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\" + 0.018*\"die\" + 0.016*\"shoot\" + 0.015*\"rise\" + 0.015*\"woman\" + 0.014*\"coast\"\n",
"Topic: 6 \n",
"Words: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\" + 0.017*\"accus\" + 0.016*\"child\" + 0.015*\"alleg\" + 0.014*\"peopl\" + 0.014*\"trial\"\n",
"Topic: 7 \n",
"Words: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\" + 0.014*\"break\" + 0.014*\"life\" + 0.013*\"take\" + 0.013*\"student\" + 0.013*\"protest\"\n",
"Topic: 8 \n",
"Words: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\" + 0.017*\"elect\" + 0.014*\"help\" + 0.014*\"west\" + 0.011*\"victoria\" + 0.011*\"flood\"\n",
"Topic: 9 \n",
"Words: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\" + 0.012*\"say\" + 0.011*\"polit\" + 0.010*\"elect\" + 0.010*\"hobart\" + 0.010*\"violenc\"\n"
]
}
],
"source": [
"for idx, topic in lda_model.print_topics(-1):\n",
" print('Topic: {} \\nWords: {}'.format(idx, topic))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Topic: 0 Word: 0.008*\"care\" + 0.007*\"island\" + 0.006*\"fish\" + 0.006*\"nurs\" + 0.006*\"product\" + 0.005*\"grain\" + 0.005*\"harvest\" + 0.005*\"age\" + 0.005*\"townsvil\" + 0.005*\"allegedli\"\n",
"Topic: 1 Word: 0.017*\"polic\" + 0.017*\"charg\" + 0.014*\"murder\" + 0.012*\"court\" + 0.011*\"woman\" + 0.011*\"crash\" + 0.010*\"shoot\" + 0.010*\"kill\" + 0.010*\"death\" + 0.009*\"jail\"\n",
"Topic: 2 Word: 0.015*\"interview\" + 0.009*\"australia\" + 0.008*\"leagu\" + 0.007*\"world\" + 0.007*\"final\" + 0.007*\"monday\" + 0.006*\"open\" + 0.006*\"thursday\" + 0.006*\"smith\" + 0.005*\"zealand\"\n",
"Topic: 3 Word: 0.011*\"guilti\" + 0.009*\"abus\" + 0.009*\"child\" + 0.008*\"plead\" + 0.008*\"climat\" + 0.008*\"octob\" + 0.007*\"peter\" + 0.006*\"toni\" + 0.006*\"mount\" + 0.006*\"capit\"\n",
"Topic: 4 Word: 0.015*\"market\" + 0.011*\"share\" + 0.010*\"price\" + 0.010*\"rise\" + 0.009*\"turnbul\" + 0.008*\"grandstand\" + 0.007*\"fall\" + 0.007*\"australian\" + 0.007*\"dollar\" + 0.006*\"michael\"\n",
"Topic: 5 Word: 0.014*\"rural\" + 0.013*\"news\" + 0.010*\"coast\" + 0.009*\"weather\" + 0.007*\"gold\" + 0.007*\"search\" + 0.006*\"nation\" + 0.006*\"rugbi\" + 0.006*\"mental\" + 0.006*\"drum\"\n",
"Topic: 6 Word: 0.009*\"abbott\" + 0.008*\"korea\" + 0.007*\"march\" + 0.006*\"polit\" + 0.006*\"syria\" + 0.006*\"islam\" + 0.006*\"say\" + 0.006*\"north\" + 0.006*\"russia\" + 0.006*\"protest\"\n",
"Topic: 7 Word: 0.025*\"countri\" + 0.023*\"hour\" + 0.022*\"trump\" + 0.012*\"podcast\" + 0.011*\"donald\" + 0.008*\"commiss\" + 0.008*\"royal\" + 0.007*\"christma\" + 0.006*\"hunter\" + 0.005*\"rail\"\n",
"Topic: 8 Word: 0.013*\"elect\" + 0.013*\"govern\" + 0.008*\"labor\" + 0.007*\"senat\" + 0.006*\"say\" + 0.006*\"liber\" + 0.006*\"marriag\" + 0.006*\"feder\" + 0.006*\"juli\" + 0.005*\"septemb\"\n",
"Topic: 9 Word: 0.011*\"queensland\" + 0.009*\"west\" + 0.008*\"sport\" + 0.008*\"john\" + 0.007*\"david\" + 0.007*\"friday\" + 0.007*\"farm\" + 0.006*\"wednesday\" + 0.006*\"north\" + 0.006*\"decemb\"\n"
]
}
],
"source": [
"lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)\n",
"for idx, topic in lda_model_tfidf.print_topics(-1):\n",
" print('Topic: {} Word: {}'.format(idx, topic))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['rain', 'help', 'dampen', 'bushfir']"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processed_docs[4310]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Score: 0.420369952917099\t \n",
"Topic: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\" + 0.017*\"elect\" + 0.014*\"help\" + 0.014*\"west\" + 0.011*\"victoria\" + 0.011*\"flood\"\n",
"\n",
"Score: 0.22018268704414368\t \n",
"Topic: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\" + 0.012*\"john\" + 0.012*\"announc\" + 0.010*\"unit\" + 0.009*\"beat\" + 0.009*\"celebr\"\n",
"\n",
"Score: 0.21931657195091248\t \n",
"Topic: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\" + 0.011*\"vote\" + 0.010*\"game\" + 0.010*\"port\" + 0.009*\"refuge\" + 0.009*\"save\"\n",
"\n",
"Score: 0.020019859075546265\t \n",
"Topic: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\" + 0.014*\"abus\" + 0.014*\"busi\" + 0.013*\"guilti\" + 0.012*\"farm\" + 0.011*\"find\"\n",
"\n",
"Score: 0.020018484443426132\t \n",
"Topic: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\" + 0.013*\"australia\" + 0.013*\"driver\" + 0.011*\"prison\" + 0.011*\"releas\" + 0.011*\"china\"\n",
"\n",
"Score: 0.020018484443426132\t \n",
"Topic: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\" + 0.013*\"live\" + 0.013*\"interview\" + 0.012*\"council\" + 0.011*\"call\" + 0.010*\"rural\"\n",
"\n",
"Score: 0.020018484443426132\t \n",
"Topic: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\" + 0.018*\"die\" + 0.016*\"shoot\" + 0.015*\"rise\" + 0.015*\"woman\" + 0.014*\"coast\"\n",
"\n",
"Score: 0.020018484443426132\t \n",
"Topic: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\" + 0.017*\"accus\" + 0.016*\"child\" + 0.015*\"alleg\" + 0.014*\"peopl\" + 0.014*\"trial\"\n",
"\n",
"Score: 0.020018484443426132\t \n",
"Topic: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\" + 0.014*\"break\" + 0.014*\"life\" + 0.013*\"take\" + 0.013*\"student\" + 0.013*\"protest\"\n",
"\n",
"Score: 0.020018484443426132\t \n",
"Topic: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\" + 0.012*\"say\" + 0.011*\"polit\" + 0.010*\"elect\" + 0.010*\"hobart\" + 0.010*\"violenc\"\n"
]
}
],
"source": [
"for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: tup[1], reverse=True):\n",
" print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model.print_topic(index, 10)))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Score: 0.8199620842933655\t \n",
"Topic: 0.014*\"rural\" + 0.013*\"news\" + 0.010*\"coast\" + 0.009*\"weather\" + 0.007*\"gold\" + 0.007*\"search\" + 0.006*\"nation\" + 0.006*\"rugbi\" + 0.006*\"mental\" + 0.006*\"drum\"\n",
"\n",
"Score: 0.02000523917376995\t \n",
"Topic: 0.011*\"queensland\" + 0.009*\"west\" + 0.008*\"sport\" + 0.008*\"john\" + 0.007*\"david\" + 0.007*\"friday\" + 0.007*\"farm\" + 0.006*\"wednesday\" + 0.006*\"north\" + 0.006*\"decemb\"\n",
"\n",
"Score: 0.020004820078611374\t \n",
"Topic: 0.011*\"guilti\" + 0.009*\"abus\" + 0.009*\"child\" + 0.008*\"plead\" + 0.008*\"climat\" + 0.008*\"octob\" + 0.007*\"peter\" + 0.006*\"toni\" + 0.006*\"mount\" + 0.006*\"capit\"\n",
"\n",
"Score: 0.020004622638225555\t \n",
"Topic: 0.008*\"care\" + 0.007*\"island\" + 0.006*\"fish\" + 0.006*\"nurs\" + 0.006*\"product\" + 0.005*\"grain\" + 0.005*\"harvest\" + 0.005*\"age\" + 0.005*\"townsvil\" + 0.005*\"allegedli\"\n",
"\n",
"Score: 0.02000461332499981\t \n",
"Topic: 0.015*\"market\" + 0.011*\"share\" + 0.010*\"price\" + 0.010*\"rise\" + 0.009*\"turnbul\" + 0.008*\"grandstand\" + 0.007*\"fall\" + 0.007*\"australian\" + 0.007*\"dollar\" + 0.006*\"michael\"\n",
"\n",
"Score: 0.02000422589480877\t \n",
"Topic: 0.025*\"countri\" + 0.023*\"hour\" + 0.022*\"trump\" + 0.012*\"podcast\" + 0.011*\"donald\" + 0.008*\"commiss\" + 0.008*\"royal\" + 0.007*\"christma\" + 0.006*\"hunter\" + 0.005*\"rail\"\n",
"\n",
"Score: 0.020003804937005043\t \n",
"Topic: 0.013*\"elect\" + 0.013*\"govern\" + 0.008*\"labor\" + 0.007*\"senat\" + 0.006*\"say\" + 0.006*\"liber\" + 0.006*\"marriag\" + 0.006*\"feder\" + 0.006*\"juli\" + 0.005*\"septemb\"\n",
"\n",
"Score: 0.020003732293844223\t \n",
"Topic: 0.017*\"polic\" + 0.017*\"charg\" + 0.014*\"murder\" + 0.012*\"court\" + 0.011*\"woman\" + 0.011*\"crash\" + 0.010*\"shoot\" + 0.010*\"kill\" + 0.010*\"death\" + 0.009*\"jail\"\n",
"\n",
"Score: 0.020003456622362137\t \n",
"Topic: 0.015*\"interview\" + 0.009*\"australia\" + 0.008*\"leagu\" + 0.007*\"world\" + 0.007*\"final\" + 0.007*\"monday\" + 0.006*\"open\" + 0.006*\"thursday\" + 0.006*\"smith\" + 0.005*\"zealand\"\n",
"\n",
"Score: 0.020003436133265495\t \n",
"Topic: 0.009*\"abbott\" + 0.008*\"korea\" + 0.007*\"march\" + 0.006*\"polit\" + 0.006*\"syria\" + 0.006*\"islam\" + 0.006*\"say\" + 0.006*\"north\" + 0.006*\"russia\" + 0.006*\"protest\"\n"
]
}
],
"source": [
"# NOTE(review): lda_model_tfidf was trained on corpus_tfidf; passing corpus_tfidf[4310] here seems intended — confirm\n",
"for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: tup[1], reverse=True):\n",
" print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model_tfidf.print_topic(index, 10)))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Score: 0.34995949268341064\t Topic: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\"\n",
"Score: 0.18341293931007385\t Topic: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\"\n",
"Score: 0.18331679701805115\t Topic: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\"\n",
"Score: 0.18324868381023407\t Topic: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\"\n",
"Score: 0.01667937822639942\t Topic: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\"\n",
"Score: 0.016678614541888237\t Topic: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\"\n",
"Score: 0.016676034778356552\t Topic: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\"\n",
"Score: 0.016676034778356552\t Topic: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\"\n",
"Score: 0.016676034778356552\t Topic: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\"\n",
"Score: 0.016676034778356552\t Topic: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\"\n"
]
}
],
"source": [
"unseen_document = 'How a Pentagon deal became an identity crisis for Google'\n",
"bow_vector = dictionary.doc2bow(preprocess(unseen_document))\n",
"for index, score in sorted(lda_model[bow_vector], key=lambda tup: tup[1], reverse=True):\n",
" print(\"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 5)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment