{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: gensim in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.7.1)\n",
"Requirement already satisfied: smart-open>=1.7.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.8.0)\n",
"Requirement already satisfied: six>=1.5.0 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from gensim) (1.12.0)\n",
"Requirement already satisfied: numpy>=1.11.3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.16.0)\n",
"Requirement already satisfied: scipy>=0.18.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from gensim) (1.2.0)\n",
"Requirement already satisfied: boto>=2.32 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.49.0)\n",
"Requirement already satisfied: bz2file in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (0.98)\n",
"Requirement already satisfied: requests in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (2.21.0)\n",
"Requirement already satisfied: boto3 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from smart-open>=1.7.0->gensim) (1.9.90)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2018.11.29)\n",
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (3.0.4)\n",
"Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (2.8)\n",
"Requirement already satisfied: urllib3<1.25,>=1.21.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from requests->smart-open>=1.7.0->gensim) (1.24.1)\n",
"Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.9.3)\n",
"Requirement already satisfied: botocore<1.13.0,>=1.12.90 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (1.12.90)\n",
"Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from boto3->smart-open>=1.7.0->gensim) (0.2.0)\n",
"Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (2.7.5)\n",
"Requirement already satisfied: docutils>=0.10 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from botocore<1.13.0,>=1.12.90->boto3->smart-open>=1.7.0->gensim) (0.14)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (3.4)\n",
"Requirement already satisfied: six in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from nltk) (1.12.0)\n",
"Requirement already satisfied: singledispatch in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from nltk) (3.4.0.3)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (0.23.4)\n",
"Requirement already satisfied: pytz>=2011k in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2018.9)\n",
"Requirement already satisfied: python-dateutil>=2.5.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (2.7.5)\n",
"Requirement already satisfied: numpy>=1.9.0 in c:\\users\\daniil_ekzarian\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages (from pandas) (1.16.0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\daniil_ekzarian\\appdata\\roaming\\python\\python37\\site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting stemmer\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"  Could not find a version that satisfies the requirement stemmer (from versions: )\n",
"No matching distribution found for stemmer\n",
"You are using pip version 18.1, however version 19.0.1 is available.\n",
"You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
]
}
],
"source": [
"import sys\n",
"!{sys.executable} -m pip install gensim\n",
"!{sys.executable} -m pip install nltk\n",
"!{sys.executable} -m pip install pandas\n",
| "!{sys.executable} -m pip install stemmer" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);\n", | |
| "data_text = data[['headline_text']]\n", | |
| "data_text['index'] = data_text.index\n", | |
| "documents = data_text" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "1103663\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>headline_text</th>\n", | |
| " <th>index</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>aba decides against community broadcasting lic...</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>act fire witnesses must be aware of defamation</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>a g calls for infrastructure protection summit</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>air nz staff in aust strike for pay rise</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>air nz strike to affect australian travellers</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " headline_text index\n", | |
| "0 aba decides against community broadcasting lic... 0\n", | |
| "1 act fire witnesses must be aware of defamation 1\n", | |
| "2 a g calls for infrastructure protection summit 2\n", | |
| "3 air nz staff in aust strike for pay rise 3\n", | |
| "4 air nz strike to affect australian travellers 4" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "print(len(documents))\n", | |
| "documents.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[nltk_data] Downloading package wordnet to\n", | |
| "[nltk_data] C:\\Users\\Daniil_Ekzarian\\AppData\\Roaming\\nltk_data...\n", | |
| "[nltk_data] Package wordnet is already up-to-date!\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import gensim\n", | |
| "from gensim.utils import simple_preprocess\n", | |
| "from gensim.parsing.preprocessing import STOPWORDS\n", | |
| "from nltk.stem import WordNetLemmatizer, SnowballStemmer\n", | |
| "from nltk.stem.porter import *\n", | |
| "import numpy as np\n", | |
| "np.random.seed(2018)\n", | |
| "import nltk\n", | |
| "nltk.download('wordnet')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def lemmatize_stemming(text):\n", | |
| " stemmer = nltk.stem.PorterStemmer()\n", | |
| " return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))\n", | |
| "def preprocess(text):\n", | |
| " result = []\n", | |
| " for token in gensim.utils.simple_preprocess(text):\n", | |
| " if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:\n", | |
| " result.append(lemmatize_stemming(token))\n", | |
| " return result" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "original document: \n", | |
| "['rain', 'helps', 'dampen', 'bushfires']\n", | |
| "\n", | |
| "\n", | |
| " tokenized and lemmatized document: \n", | |
| "['rain', 'help', 'dampen', 'bushfir']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "doc_sample = documents[documents['index'] == 4310].values[0][0]\n", | |
| "print('original document: ')\n", | |
| "words = []\n", | |
| "for word in doc_sample.split(' '):\n", | |
| " words.append(word)\n", | |
| "print(words)\n", | |
| "print('\\n\\n tokenized and lemmatized document: ')\n", | |
| "print(preprocess(doc_sample))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 [decid, commun, broadcast, licenc]\n", | |
| "1 [wit, awar, defam]\n", | |
| "2 [call, infrastructur, protect, summit]\n", | |
| "3 [staff, aust, strike, rise]\n", | |
| "4 [strike, affect, australian, travel]\n", | |
| "5 [ambiti, olsson, win, tripl, jump]\n", | |
| "6 [antic, delight, record, break, barca]\n", | |
| "7 [aussi, qualifi, stosur, wast, memphi, match]\n", | |
| "8 [aust, address, secur, council, iraq]\n", | |
| "9 [australia, lock, timet]\n", | |
| "Name: headline_text, dtype: object" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "processed_docs = documents['headline_text'].map(preprocess)\n", | |
| "processed_docs[:10]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dictionary = gensim.corpora.Dictionary(processed_docs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(76, 1), (112, 1), (483, 1), (4021, 1)]" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n", | |
| "bow_corpus[4310]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Word 76 (\"bushfir\") appears 1 time.\n", | |
| "Word 112 (\"help\") appears 1 time.\n", | |
| "Word 483 (\"rain\") appears 1 time.\n", | |
| "Word 4021 (\"dampen\") appears 1 time.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "bow_doc_4310 = bow_corpus[4310]\n", | |
| "for i in range(len(bow_doc_4310)):\n", | |
| " print(\"Word {} (\\\"{}\\\") appears {} time.\".format(bow_doc_4310[i][0], \n", | |
| " dictionary[bow_doc_4310[i][0]], \n", | |
| "bow_doc_4310[i][1]))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[(0, 0.5903603121911333),\n", | |
| " (1, 0.3852450692300274),\n", | |
| " (2, 0.4974556050119205),\n", | |
| " (3, 0.505567858418396)]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from gensim import corpora, models\n", | |
| "tfidf = models.TfidfModel(bow_corpus)\n", | |
| "corpus_tfidf = tfidf[bow_corpus]\n", | |
| "from pprint import pprint\n", | |
| "for doc in corpus_tfidf:\n", | |
| " pprint(doc)\n", | |
| " break" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Topic: 0 \n", | |
| "Words: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\" + 0.013*\"australia\" + 0.013*\"driver\" + 0.011*\"prison\" + 0.011*\"releas\" + 0.011*\"china\"\n", | |
| "Topic: 1 \n", | |
| "Words: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\" + 0.013*\"live\" + 0.013*\"interview\" + 0.012*\"council\" + 0.011*\"call\" + 0.010*\"rural\"\n", | |
| "Topic: 2 \n", | |
| "Words: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\" + 0.012*\"john\" + 0.012*\"announc\" + 0.010*\"unit\" + 0.009*\"beat\" + 0.009*\"celebr\"\n", | |
| "Topic: 3 \n", | |
| "Words: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\" + 0.011*\"vote\" + 0.010*\"game\" + 0.010*\"port\" + 0.009*\"refuge\" + 0.009*\"save\"\n", | |
| "Topic: 4 \n", | |
| "Words: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\" + 0.014*\"abus\" + 0.014*\"busi\" + 0.013*\"guilti\" + 0.012*\"farm\" + 0.011*\"find\"\n", | |
| "Topic: 5 \n", | |
| "Words: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\" + 0.018*\"die\" + 0.016*\"shoot\" + 0.015*\"rise\" + 0.015*\"woman\" + 0.014*\"coast\"\n", | |
| "Topic: 6 \n", | |
| "Words: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\" + 0.017*\"accus\" + 0.016*\"child\" + 0.015*\"alleg\" + 0.014*\"peopl\" + 0.014*\"trial\"\n", | |
| "Topic: 7 \n", | |
| "Words: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\" + 0.014*\"break\" + 0.014*\"life\" + 0.013*\"take\" + 0.013*\"student\" + 0.013*\"protest\"\n", | |
| "Topic: 8 \n", | |
| "Words: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\" + 0.017*\"elect\" + 0.014*\"help\" + 0.014*\"west\" + 0.011*\"victoria\" + 0.011*\"flood\"\n", | |
| "Topic: 9 \n", | |
| "Words: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\" + 0.012*\"say\" + 0.011*\"polit\" + 0.010*\"elect\" + 0.010*\"hobart\" + 0.010*\"violenc\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for idx, topic in lda_model.print_topics(-1):\n", | |
| " print('Topic: {} \\nWords: {}'.format(idx, topic))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Topic: 0 Word: 0.008*\"care\" + 0.007*\"island\" + 0.006*\"fish\" + 0.006*\"nurs\" + 0.006*\"product\" + 0.005*\"grain\" + 0.005*\"harvest\" + 0.005*\"age\" + 0.005*\"townsvil\" + 0.005*\"allegedli\"\n", | |
| "Topic: 1 Word: 0.017*\"polic\" + 0.017*\"charg\" + 0.014*\"murder\" + 0.012*\"court\" + 0.011*\"woman\" + 0.011*\"crash\" + 0.010*\"shoot\" + 0.010*\"kill\" + 0.010*\"death\" + 0.009*\"jail\"\n", | |
| "Topic: 2 Word: 0.015*\"interview\" + 0.009*\"australia\" + 0.008*\"leagu\" + 0.007*\"world\" + 0.007*\"final\" + 0.007*\"monday\" + 0.006*\"open\" + 0.006*\"thursday\" + 0.006*\"smith\" + 0.005*\"zealand\"\n", | |
| "Topic: 3 Word: 0.011*\"guilti\" + 0.009*\"abus\" + 0.009*\"child\" + 0.008*\"plead\" + 0.008*\"climat\" + 0.008*\"octob\" + 0.007*\"peter\" + 0.006*\"toni\" + 0.006*\"mount\" + 0.006*\"capit\"\n", | |
| "Topic: 4 Word: 0.015*\"market\" + 0.011*\"share\" + 0.010*\"price\" + 0.010*\"rise\" + 0.009*\"turnbul\" + 0.008*\"grandstand\" + 0.007*\"fall\" + 0.007*\"australian\" + 0.007*\"dollar\" + 0.006*\"michael\"\n", | |
| "Topic: 5 Word: 0.014*\"rural\" + 0.013*\"news\" + 0.010*\"coast\" + 0.009*\"weather\" + 0.007*\"gold\" + 0.007*\"search\" + 0.006*\"nation\" + 0.006*\"rugbi\" + 0.006*\"mental\" + 0.006*\"drum\"\n", | |
| "Topic: 6 Word: 0.009*\"abbott\" + 0.008*\"korea\" + 0.007*\"march\" + 0.006*\"polit\" + 0.006*\"syria\" + 0.006*\"islam\" + 0.006*\"say\" + 0.006*\"north\" + 0.006*\"russia\" + 0.006*\"protest\"\n", | |
| "Topic: 7 Word: 0.025*\"countri\" + 0.023*\"hour\" + 0.022*\"trump\" + 0.012*\"podcast\" + 0.011*\"donald\" + 0.008*\"commiss\" + 0.008*\"royal\" + 0.007*\"christma\" + 0.006*\"hunter\" + 0.005*\"rail\"\n", | |
| "Topic: 8 Word: 0.013*\"elect\" + 0.013*\"govern\" + 0.008*\"labor\" + 0.007*\"senat\" + 0.006*\"say\" + 0.006*\"liber\" + 0.006*\"marriag\" + 0.006*\"feder\" + 0.006*\"juli\" + 0.005*\"septemb\"\n", | |
| "Topic: 9 Word: 0.011*\"queensland\" + 0.009*\"west\" + 0.008*\"sport\" + 0.008*\"john\" + 0.007*\"david\" + 0.007*\"friday\" + 0.007*\"farm\" + 0.006*\"wednesday\" + 0.006*\"north\" + 0.006*\"decemb\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)\n", | |
| "for idx, topic in lda_model_tfidf.print_topics(-1):\n", | |
| " print('Topic: {} Word: {}'.format(idx, topic))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['rain', 'help', 'dampen', 'bushfir']" | |
| ] | |
| }, | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "processed_docs[4310]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "Score: 0.420369952917099\t \n", | |
| "Topic: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\" + 0.017*\"elect\" + 0.014*\"help\" + 0.014*\"west\" + 0.011*\"victoria\" + 0.011*\"flood\"\n", | |
| "\n", | |
| "Score: 0.22018268704414368\t \n", | |
| "Topic: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\" + 0.012*\"john\" + 0.012*\"announc\" + 0.010*\"unit\" + 0.009*\"beat\" + 0.009*\"celebr\"\n", | |
| "\n", | |
| "Score: 0.21931657195091248\t \n", | |
| "Topic: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\" + 0.011*\"vote\" + 0.010*\"game\" + 0.010*\"port\" + 0.009*\"refuge\" + 0.009*\"save\"\n", | |
| "\n", | |
| "Score: 0.020019859075546265\t \n", | |
| "Topic: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\" + 0.014*\"abus\" + 0.014*\"busi\" + 0.013*\"guilti\" + 0.012*\"farm\" + 0.011*\"find\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\" + 0.013*\"australia\" + 0.013*\"driver\" + 0.011*\"prison\" + 0.011*\"releas\" + 0.011*\"china\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\" + 0.013*\"live\" + 0.013*\"interview\" + 0.012*\"council\" + 0.011*\"call\" + 0.010*\"rural\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\" + 0.018*\"die\" + 0.016*\"shoot\" + 0.015*\"rise\" + 0.015*\"woman\" + 0.014*\"coast\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\" + 0.017*\"accus\" + 0.016*\"child\" + 0.015*\"alleg\" + 0.014*\"peopl\" + 0.014*\"trial\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\" + 0.014*\"break\" + 0.014*\"life\" + 0.013*\"take\" + 0.013*\"student\" + 0.013*\"protest\"\n", | |
| "\n", | |
| "Score: 0.020018484443426132\t \n", | |
| "Topic: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\" + 0.012*\"say\" + 0.011*\"polit\" + 0.010*\"elect\" + 0.010*\"hobart\" + 0.010*\"violenc\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n", | |
| " print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model.print_topic(index, 10)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "Score: 0.8199620842933655\t \n", | |
| "Topic: 0.014*\"rural\" + 0.013*\"news\" + 0.010*\"coast\" + 0.009*\"weather\" + 0.007*\"gold\" + 0.007*\"search\" + 0.006*\"nation\" + 0.006*\"rugbi\" + 0.006*\"mental\" + 0.006*\"drum\"\n", | |
| "\n", | |
| "Score: 0.02000523917376995\t \n", | |
| "Topic: 0.011*\"queensland\" + 0.009*\"west\" + 0.008*\"sport\" + 0.008*\"john\" + 0.007*\"david\" + 0.007*\"friday\" + 0.007*\"farm\" + 0.006*\"wednesday\" + 0.006*\"north\" + 0.006*\"decemb\"\n", | |
| "\n", | |
| "Score: 0.020004820078611374\t \n", | |
| "Topic: 0.011*\"guilti\" + 0.009*\"abus\" + 0.009*\"child\" + 0.008*\"plead\" + 0.008*\"climat\" + 0.008*\"octob\" + 0.007*\"peter\" + 0.006*\"toni\" + 0.006*\"mount\" + 0.006*\"capit\"\n", | |
| "\n", | |
| "Score: 0.020004622638225555\t \n", | |
| "Topic: 0.008*\"care\" + 0.007*\"island\" + 0.006*\"fish\" + 0.006*\"nurs\" + 0.006*\"product\" + 0.005*\"grain\" + 0.005*\"harvest\" + 0.005*\"age\" + 0.005*\"townsvil\" + 0.005*\"allegedli\"\n", | |
| "\n", | |
| "Score: 0.02000461332499981\t \n", | |
| "Topic: 0.015*\"market\" + 0.011*\"share\" + 0.010*\"price\" + 0.010*\"rise\" + 0.009*\"turnbul\" + 0.008*\"grandstand\" + 0.007*\"fall\" + 0.007*\"australian\" + 0.007*\"dollar\" + 0.006*\"michael\"\n", | |
| "\n", | |
| "Score: 0.02000422589480877\t \n", | |
| "Topic: 0.025*\"countri\" + 0.023*\"hour\" + 0.022*\"trump\" + 0.012*\"podcast\" + 0.011*\"donald\" + 0.008*\"commiss\" + 0.008*\"royal\" + 0.007*\"christma\" + 0.006*\"hunter\" + 0.005*\"rail\"\n", | |
| "\n", | |
| "Score: 0.020003804937005043\t \n", | |
| "Topic: 0.013*\"elect\" + 0.013*\"govern\" + 0.008*\"labor\" + 0.007*\"senat\" + 0.006*\"say\" + 0.006*\"liber\" + 0.006*\"marriag\" + 0.006*\"feder\" + 0.006*\"juli\" + 0.005*\"septemb\"\n", | |
| "\n", | |
| "Score: 0.020003732293844223\t \n", | |
| "Topic: 0.017*\"polic\" + 0.017*\"charg\" + 0.014*\"murder\" + 0.012*\"court\" + 0.011*\"woman\" + 0.011*\"crash\" + 0.010*\"shoot\" + 0.010*\"kill\" + 0.010*\"death\" + 0.009*\"jail\"\n", | |
| "\n", | |
| "Score: 0.020003456622362137\t \n", | |
| "Topic: 0.015*\"interview\" + 0.009*\"australia\" + 0.008*\"leagu\" + 0.007*\"world\" + 0.007*\"final\" + 0.007*\"monday\" + 0.006*\"open\" + 0.006*\"thursday\" + 0.006*\"smith\" + 0.005*\"zealand\"\n", | |
| "\n", | |
| "Score: 0.020003436133265495\t \n", | |
| "Topic: 0.009*\"abbott\" + 0.008*\"korea\" + 0.007*\"march\" + 0.006*\"polit\" + 0.006*\"syria\" + 0.006*\"islam\" + 0.006*\"say\" + 0.006*\"north\" + 0.006*\"russia\" + 0.006*\"protest\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n", | |
| " print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model_tfidf.print_topic(index, 10)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Score: 0.34995949268341064\t Topic: 0.021*\"test\" + 0.018*\"world\" + 0.018*\"hospit\" + 0.017*\"hour\" + 0.015*\"leav\"\n", | |
| "Score: 0.18341293931007385\t Topic: 0.016*\"donald\" + 0.015*\"adelaid\" + 0.014*\"turnbul\" + 0.012*\"feder\" + 0.011*\"centr\"\n", | |
| "Score: 0.18331679701805115\t Topic: 0.021*\"market\" + 0.019*\"women\" + 0.017*\"tasmania\" + 0.015*\"open\" + 0.015*\"share\"\n", | |
| "Score: 0.18324868381023407\t Topic: 0.034*\"court\" + 0.026*\"murder\" + 0.024*\"charg\" + 0.022*\"face\" + 0.018*\"miss\"\n", | |
| "Score: 0.01667937822639942\t Topic: 0.025*\"govern\" + 0.018*\"say\" + 0.016*\"chang\" + 0.015*\"nation\" + 0.014*\"countri\"\n", | |
| "Score: 0.016678614541888237\t Topic: 0.046*\"polic\" + 0.026*\"death\" + 0.026*\"attack\" + 0.023*\"kill\" + 0.021*\"crash\"\n", | |
| "Score: 0.016676034778356552\t Topic: 0.021*\"final\" + 0.019*\"win\" + 0.016*\"leagu\" + 0.015*\"lose\" + 0.014*\"citi\"\n", | |
| "Score: 0.016676034778356552\t Topic: 0.027*\"report\" + 0.020*\"famili\" + 0.017*\"water\" + 0.016*\"time\" + 0.015*\"concern\"\n", | |
| "Score: 0.016676034778356552\t Topic: 0.042*\"australian\" + 0.026*\"queensland\" + 0.023*\"australia\" + 0.022*\"south\" + 0.020*\"north\"\n", | |
| "Score: 0.016676034778356552\t Topic: 0.037*\"trump\" + 0.037*\"year\" + 0.023*\"hous\" + 0.021*\"canberra\" + 0.012*\"bank\"\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "unseen_document = 'How a Pentagon deal became an identity crisis for Google'\n", | |
| "bow_vector = dictionary.doc2bow(preprocess(unseen_document))\n", | |
| "for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):\n", | |
| " print(\"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 5)))" | |
| ] | |
| }, | |
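{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional follow-up (not executed in this run): persist the trained artifacts with gensim's save/load API\n",
"# so they can be reused without retraining; the file names below are placeholders\n",
"dictionary.save('headlines.dict')\n",
"lda_model.save('headlines_lda.model')\n",
"lda_model_tfidf.save('headlines_lda_tfidf.model')\n",
"# lda_model = gensim.models.LdaMulticore.load('headlines_lda.model')"
]
},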
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}