Last active
March 15, 2017 04:26
-
-
Save tandon-aman/4e101aff1e65b91007433bdd3daf0c48 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"import facebook\n", | |
"import json\n", | |
"import requests\n", | |
"import uuid" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Setting the access token to use the facebook api " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 203, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"ACCESS_TOKEN='EAACEdEose0cBALpApC3y6GwYxn92f2lMGr2VJen2LXaABjgCfVP1wpXA8nq8gjDTBudOlWGlvX2ai6qctZCDtwJ3ZBPE82fnAZAlzHvL1JFJkbcaVgbPCZCu2apVEHUzjL5qIUSeEZAm7u3ShWNqo3YTKwllJP6b9CG6RP2ptilboankAb7Ao4Y9p65czyLdjYZD'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 204, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"g = facebook.GraphAPI(ACCESS_TOKEN)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Verifying the facebook api " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"g.get_object('me')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 194, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def pp(o):\n", | |
" print json.dumps(o, indent=1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": false | |
}, | |
"source": [ | |
"# Setting the Facebook Page Id e.g. BBC India " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 211, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"page_id= 'bbcindia'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": false | |
}, | |
"source": [ | |
"# retrieve the BBCINDIA Facebook Page Posts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"page = g.get_object(page_id+'/posts?fields=message', page=True, retry=5)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": false | |
}, | |
"source": [ | |
"# Providing the number of pages to search search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 212, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"page_count = 1\n", | |
"pages_to_search = 250" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"source": [ | |
"\n", | |
"# We will be firstly getting the facebook posts and then their related comments\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"source": [ | |
"# Function to get the posts\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 213, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def retrieveFacebookPagePosts(pages_to_search):\n", | |
" posts_data = []\n", | |
" while page_count < pages_to_search:\n", | |
" try:\n", | |
" for message in page['data']:\n", | |
" if 'message' in message:\n", | |
" post_data = {}\n", | |
" commentdata = {}\n", | |
" post_data['text'] = message['message']\n", | |
" post_data['id'] = str(uuid.uuid4())\n", | |
" commentdata['id'] = message['id']\n", | |
" post_data['comments'] = commentdata\n", | |
" post_data['time'] = message['created_time']\n", | |
" posts_data.append(post_data)\n", | |
" \n", | |
" page_count = page_count + 1\n", | |
" #page = requests.get(page['paging']['next']).json()\n", | |
" if 'paging' in page and 'next' in page['paging']:\n", | |
" page = requests.get(page['paging']['next']).json()\n", | |
" else:\n", | |
" print \"Page count reached \"+str(page_count)\n", | |
" print 'going to break'\n", | |
" break\n", | |
" except KeyError:\n", | |
" print \"Exception occured\"\n", | |
" return posts_data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"\n", | |
"# Function to retrieve the comments of posts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 214, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def getPostComments(comment_id):\n", | |
" comments = []\n", | |
" comment_data = g.get_object(comment_id)\n", | |
" comment_page_count = 1\n", | |
" \n", | |
" try:\n", | |
" while comment_page_count < 2 :\n", | |
" if 'comments' in comment_data:\n", | |
" for comment in comment_data['comments']['data']:\n", | |
" comments.append(comment['message'])\n", | |
"\n", | |
" comment_page_count = comment_page_count + 1\n", | |
" if 'comments' in comment_data and 'paging' in comment_data['comments'] and 'next' in comment_data['comments']['paging']:\n", | |
" comment_data = requests.get(comment_data['comments']['paging']['next']).json()\n", | |
" else:\n", | |
" break\n", | |
" except:\n", | |
" print \"exception occured\"\n", | |
" print comment_page_count\n", | |
" \n", | |
" return comments" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Fetching the Posts\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 161, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Page count reached 119\n", | |
"going to break\n" | |
] | |
} | |
], | |
"source": [ | |
"posts_data = retrieveFacebookPagePosts(pages_to_search)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"os.system('say \"your program has finished\"')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Inserting the Comments into the posts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 179, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\n", | |
" \"text\": \"Disney is calling it the first ever \\\"exclusively gay moment\\\" on film.\", \n", | |
" \"id\": \"125c50ca-3da8-4c1e-aac2-acdf61120a86\", \n", | |
" \"comments\": {\n", | |
" \"data\": [], \n", | |
" \"id\": \"151955124848859_1442089169168775\"\n", | |
" }, \n", | |
" \"time\": \"2017-03-02T10:15:00+0000\"\n", | |
"}\n", | |
"{\n", | |
" \"text\": \"Sleeping too much or even too little can be very harmful.\", \n", | |
" \"id\": \"022e62d8-2ce7-44e8-b375-da7f2f44186a\", \n", | |
" \"comments\": {\n", | |
" \"data\": [\n", | |
" \"Nisha Mehul Teli\"\n", | |
" ], \n", | |
" \"id\": \"151955124848859_1442087229168969\"\n", | |
" }, \n", | |
" \"time\": \"2017-03-02T09:10:00+0000\"\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"for post in posts_data:\n", | |
" post['comments']['data'] = getPostComments(post['comments']['id'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 217, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2877\n", | |
"True\n" | |
] | |
} | |
], | |
"source": [ | |
"import pickle\n", | |
"\n", | |
"fw = open('fb_post_data','wb')\n", | |
"pickle.dump(posts_data,fw)\n", | |
"\n", | |
"fr = open('fb_post_data','rb')\n", | |
"pickled_posts = pickle.load(fr)\n", | |
"\n", | |
"print len(pickled_posts)\n", | |
"\n", | |
"print posts_data == pickled_posts" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Keeping only English comments" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 265, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import langdetect" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 289, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def retriveEnglistCommentsOnly(comments):\n", | |
" english_comments = []\n", | |
" for comment in comments:\n", | |
" try:\n", | |
" if langdetect.detect(comment) == 'en':\n", | |
" english_comments.append(comment+'.')\n", | |
" except:\n", | |
" english_comments.append(comment+'.')\n", | |
" return english_comments" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Text Processing Before inserting into the mongodb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 290, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"%timeit\n", | |
"#adding text key which will contains the whole post and comment data\n", | |
"for post in posts_data:\n", | |
" post['text_comment'] = post['text']+u' '+ u' '.join(retriveEnglistCommentsOnly(post['comments']['data']))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 505, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u\"Ms Conway is seen kneeling on the sofa and clutching her phone as US President Donald Trump poses with leaders of historically black colleges and universities. Belonging to The regime which came in and started demeaning the federal offices and system in itself, this isn't a big surprise.... Everybody should keep their dirty feet off the furniture. Ugh.. Did you grow up in a barn?.. People have to sit there.. This is being shared to me aprox 20 time daily I do not like it & I will not give it a like. I thought white house had an official photographer.\\U0001f609. It is disgusting. No respect for 'White House'.. Pinned her up alongside Melania.\"" | |
] | |
}, | |
"execution_count": 505, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"posts_data[19]['text_comment']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 294, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from spacy.en import English\n", | |
"import codecs\n", | |
"import re\n", | |
"import os\n", | |
"import unidecode" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 295, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"parser = English()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 296, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"fbpost_filepath = os.path.join('fbpagepost_all.txt')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"with codecs.open(fbpost_filepath, 'w', encoding='utf_8') as f:\n", | |
" for fbpost in posts_data:\n", | |
" \n", | |
" post = unidecode.unidecode(fbpost['text_comment'])\n", | |
" print \"Orig: \"+post\n", | |
" #removing out the Repost word, @abc, url and hash from hashtag\n", | |
" post = re.sub(r\"http\\S+\", \" \", post,flags=re.IGNORECASE)\n", | |
" post = re.sub(r\"#\", \" \", post,flags=re.IGNORECASE)#|RT|@\\S+|-|'|%|\\'|\\(|\\)|;|:| |'s|’|\\\"|&|’s|\\U000\\S+\n", | |
" post = re.sub(r\"RT\", \" \", post,flags=re.IGNORECASE)\n", | |
" post = re.sub(r\"@\\S+\", \" \", post,flags=re.IGNORECASE)\n", | |
" #for removing the >\n", | |
" post = re.sub(r\"&\\S+\", \" \", post,flags=re.IGNORECASE)\n", | |
" post = re.sub(r\"-\", \"\",post)\n", | |
" post = re.sub(r\" \", \" \", post,flags=re.IGNORECASE)\n", | |
" post = re.sub(r\"%\", \" \",post)\n", | |
" post = re.sub(r\"'s\", \" \", post,flags=re.IGNORECASE)\n", | |
" post = re.sub(r\"\\'\", \"\",post)\n", | |
" post = re.sub(r\"\\\\\", \"\",post)\n", | |
" post = re.sub(r\"/\", \"\",post)\n", | |
" post = re.sub(r\"\\(\", \" \",post)\n", | |
" post = re.sub(r\"\\)\", \" \",post)\n", | |
" post = re.sub(r\";\", \" \",post)\n", | |
" post = re.sub(r\":\", \" \",post)\n", | |
" post = re.sub(r\"[.]{2,}\",\" \", post)\n", | |
" post = re.sub(r\"\\\"\", \" \",post)\n", | |
" post = re.sub(r\"<\", \" \",post)\n", | |
" post = re.sub(r\">\", \" \",post)\n", | |
" post = re.sub(r\"\\[\", \" \",post)\n", | |
" post = re.sub(r\"\\]\", \" \",post)\n", | |
" post = re.sub(r\"\\{\", \" \",post)\n", | |
" post = re.sub(r\"\\}\", \" \",post)\n", | |
" post = re.sub(r\"\\|\", \" \",post)\n", | |
" post = re.sub(r\"=\", \" \",post)\n", | |
" post = re.sub(r\"~\", \" \",post)\n", | |
" post = re.sub(r\"\\`\", \" \",post)\n", | |
" post = re.sub(r\"\\^\", \" \",post)\n", | |
" post = re.sub(r\"\\+\", \" \",post)\n", | |
" post = re.sub(r\"!\", \" \",post)\n", | |
" post = re.sub(r\"\\*\", \" \",post)\n", | |
" \n", | |
" post = re.sub(r\" [a-z0-9] \", \" \",post)\n", | |
" # post = re.sub(r\"^[a-z0-9] \", \" \",post)\n", | |
" # post = re.sub(r\" [a-z0-9]$\", \" \",post)\n", | |
" #post = re.sub(r\"[a-z]\", \"\",post)\n", | |
" post = re.sub(r\"&\", \" \", post,flags=re.IGNORECASE)\n", | |
" post = re.sub(r\"\\\\U\\S+\", \" \", post,flags=re.IGNORECASE)\n", | |
" post = \" \".join(post.split())\n", | |
" print \"Axed: \"+post\n", | |
" f.write(post+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 302, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def punct_space_stopword(token):\n", | |
" \"\"\"\n", | |
" helper function to eliminate tokens\n", | |
" that are pure punctuation or whitespace\n", | |
" \"\"\"\n", | |
" \n", | |
" return token.is_punct or token.is_space or token.is_stop\n", | |
"\n", | |
"def post_line(filename):\n", | |
" \"\"\"\n", | |
" generator function to read in post from the file\n", | |
" and un-escape the original line breaks in the text\n", | |
" \"\"\"\n", | |
" \n", | |
" with codecs.open(filename, encoding='utf_8') as f:\n", | |
" for post in f:\n", | |
" yield post.replace('\\\\n', '\\n')\n", | |
" \n", | |
"def lemmatized_sentence_corpus(filename):\n", | |
" \"\"\"\n", | |
" generator function to use spaCy to parse post,\n", | |
" lemmatize the text, and yield sentences\n", | |
" \"\"\"\n", | |
" \n", | |
" for parsed_post in parser.pipe(post_line(filename),\n", | |
" batch_size=10000, n_threads=4):\n", | |
" \n", | |
" for sent in parsed_post.sents:\n", | |
" yield u' '.join([token.lemma_ for token in sent\n", | |
" if not punct_space_stopword(token)])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 303, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from gensim.models import Phrases\n", | |
"from gensim.models.word2vec import LineSentence\n", | |
"import pandas as pd\n", | |
"import itertools as it" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 310, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"unigram_sentences_filepath = os.path.join('fbpost_unigram_sentences_all.txt')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 311, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 23.6 s, sys: 248 ms, total: 23.8 s\n", | |
"Wall time: 24.4 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:\n", | |
" for sentence in lemmatized_sentence_corpus(fbpost_filepath):\n", | |
" f.write(sentence + '\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 312, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"unigram_sentences = LineSentence(unigram_sentences_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 313, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"man tell open iranian people .this man adam purinton 51 flee scene attack pub olathe go restaurant confide kill iranian immigrant\n", | |
"\n", | |
"thing indians bear close affinity iranians founder islamic terrorism\n", | |
"\n", | |
"indians maintain indianism tilak wear forehead easily identify hope treat foreign land\n", | |
"\n", | |
"trump plicy annihilation islamic terrorism swips wide scale acclamation world\n", | |
"\n", | |
"question security indian diospora\n", | |
"\n", | |
"view distinguish self india culture manner ward evil\n", | |
"\n", | |
"muslim fanatics bhakts turn india rigid country like saudi arabia\n", | |
"\n", | |
"india secularism root hinduism fundamentally open religion school thought\n", | |
"\n", | |
"core ideology hinduism describe upanishad vasudhaiva kutumbakam world family btw iranians zoroastrian force conversion islam\n", | |
"\n", | |
"victim desi muslims\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"for unigram_sentence in it.islice(unigram_sentences, 200, 210):\n", | |
" print u' '.join(unigram_sentence)\n", | |
" print u''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 314, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"bigram_model_filepath = os.path.join('fbpost_bigram_model_all')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 315, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"bigram_model = Phrases(unigram_sentences)\n", | |
"bigram_model.save(bigram_model_filepath)\n", | |
"\n", | |
"#loading the model\n", | |
"bigram_model = Phrases.load(bigram_model_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 316, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"bigram_sentences_filepath = os.path.join('fbpost_bigram_sentences_all.txt')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 317, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.12 s, sys: 77.7 ms, total: 1.2 s\n", | |
"Wall time: 1.27 s\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/amantandon/anaconda/lib/python2.7/site-packages/gensim/models/phrases.py:274: UserWarning: For a faster implementation, use the gensim.models.phrases.Phraser class\n", | |
" warnings.warn(\"For a faster implementation, use the gensim.models.phrases.Phraser class\")\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:\n", | |
" for unigram_sentence in unigram_sentences: \n", | |
" bigram_sentence = u' '.join(bigram_model[unigram_sentence])\n", | |
" f.write(bigram_sentence + '\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 318, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"undo achieve talk theresa donald_trump say petition get 900,000 signature react muslim country strict trump\n", | |
"\n", | |
"people potus visit uk mainly work intent\n", | |
"\n", | |
"extensive method brexit calculations\n", | |
"\n", | |
"boe understand people intention\n", | |
"\n", | |
"go wrong liverpool fc\n", | |
"\n", | |
"official ningbo zoo shoot tiger dead attack man enter enclosure shoot person enter enclosed area sound hea f\n", | |
"\n", | |
"not feel pity idiot r.i.p.\n", | |
"\n", | |
"tiger\n", | |
"\n", | |
"wish not jump area\n", | |
"\n", | |
"alive poor animal pay stupidity mankind idiot b4 shoot tiger compare population tiger population human china\n", | |
"\n", | |
"murder poor tiger fault shame zoo authorities\n", | |
"\n", | |
"tiger bite dust coz lunatic decide hello agree enclosure animal personal space\n", | |
"\n", | |
"intruder shoot occupant bc tranquilizer gun hear .then k run zoo shoot tiger\n", | |
"\n", | |
"fault man enter territory\n", | |
"\n", | |
"hv shoot man enter zoo shame thm\n", | |
"\n", | |
"shoot man instead idiots shoot man\n", | |
"\n", | |
"animal die stupidty man\n", | |
"\n", | |
"can_not believe people not hv mercy animala\n", | |
"\n", | |
"stupidity height\n", | |
"\n", | |
"let tiger home\n", | |
"\n", | |
"bbc china dear\n", | |
"\n", | |
"china\n", | |
"\n", | |
"intense\n", | |
"\n", | |
"widely condemn\n", | |
"\n", | |
"ban ce_ainly save america jihadist terrorism country want save land islamic jihadist terrorism suppo ban enforce ban country\n", | |
"\n", | |
"president trump true nationalist patriot\n", | |
"\n", | |
"world want work interest wait sick hindu pandits suppo idea\n", | |
"\n", | |
"sick hindu comment come\n", | |
"\n", | |
"course go set\n", | |
"\n", | |
"course go feel sick hour\n", | |
"\n", | |
"fantastic match ws fantastic fifth federer\n", | |
"\n", | |
"final naal fully unexpected history nice fifth australian opens\n", | |
"\n", | |
"avignyan mukherjee read caption give icle\n", | |
"\n", | |
"course go feel sick hour\n", | |
"\n", | |
"fantastic match\n", | |
"\n", | |
"simple strong way r best\n", | |
"\n", | |
"happiness laugh matter great initiative\n", | |
"\n", | |
"elevated thought\n", | |
"\n", | |
"dabbawalas india possibly world best food delivery service use clever numerical code ensure lunch deliver ontime right place mumbai bustling metropolis\n", | |
"\n", | |
"manage work systematically\n", | |
"\n", | |
"hip hip hurray mumbaidabbawallas\n", | |
"\n", | |
"thumbs post like\n", | |
"\n", | |
"love read\n", | |
"\n", | |
"probably best management skill\n", | |
"\n", | |
"steph moore michela bernasconi\n", | |
"\n", | |
"secret japan hern wilderness\n", | |
"\n", | |
"trick forget\n", | |
"\n", | |
"donald_trump relationship television date 30_year\n", | |
"\n", | |
"happy country expat\n", | |
"\n", | |
"karan koshy check\n", | |
"\n", | |
"lifeaffirming action leap\n", | |
"\n", | |
"surprising power tiny brain\n", | |
"\n", | |
"happy return day\n", | |
"\n", | |
"god_bless abundantly kerushna bharucha hahahaha\n", | |
"\n", | |
"mystery solve know bumble bee\n", | |
"\n", | |
"not depend size depend power\n", | |
"\n", | |
"radical plan defeat pove_y. think efficient service delivery require money poor pove aggaravated series drought natural calamity\n", | |
"\n", | |
"develop_country instead focus agricultural reform improve employment productivity good icle\n", | |
"\n", | |
"idea alleviate pove welcome\n", | |
"\n", | |
"human nature doubt idea work practice\n", | |
"\n", | |
"welfare state pa idea look society create guessing pollution skyrocket setup environment easy live free instead willing live nature\n", | |
"\n", | |
"plant free nongmo fruit forest city edible mushroom variety plant fall log\n", | |
"\n", | |
"legalize horse city border campingteepee crown land fishing cook eat free.\n", | |
"\n", | |
"india afford need end scheme suppose tackle pov y. yeh bbc not spread propaganda ur country freebie promote lethargy state risk economy\n", | |
"\n", | |
"provide basic money tackle pove good put money circulation economic instability\n", | |
"\n", | |
"state focus provide basic amenities\n", | |
"\n", | |
"firstly definition pove_y\n", | |
"\n", | |
"basic income sufficient?.\n", | |
"\n", | |
"people stop work state bankrupt\n", | |
"\n", | |
"universal basic income plan distributive justice\n", | |
"\n", | |
"bring communism straight forward\n", | |
"\n", | |
"quiet return pen paper\n", | |
"\n", | |
"death valley bizarre landscape\n", | |
"\n", | |
"know afterlife real\n", | |
"\n", | |
"issue\n", | |
"\n", | |
"truth remain truth opinion count\n", | |
"\n", | |
"yes life life real reality\n", | |
"\n", | |
"afterlife paul anindya\n", | |
"\n", | |
"sure\n", | |
"\n", | |
"have\n", | |
"\n", | |
"have prove god not exist?.\n", | |
"\n", | |
"yes life death,,we face that,.\n", | |
"\n", | |
"learn wise\n", | |
"\n", | |
"torah proverbs tree life\n", | |
"\n", | |
"read headline thrice\n", | |
"\n", | |
"can_not believe lol\n", | |
"\n", | |
"fast commercial train planet\n", | |
"\n", | |
"haha china\n", | |
"\n", | |
"german design development sell siemens china\n", | |
"\n", | |
"rubbish wait japanese maglev china near\n", | |
"\n", | |
"man allegedly trap kick muslim airline worker say trump rid world need treatment bad day ahead muslim niharika panwala time ask usa\n", | |
"\n", | |
"answer\n", | |
"\n", | |
"punish guy lock year set example herr dt\n", | |
"\n", | |
"begin truefalse?. happen london hijab clad woman ask deboard bus\n", | |
"\n", | |
"socalled cosmopolitan city soon media destroy world\n", | |
"\n", | |
"guy need nobel peace prize proud trump suppo ers kick islamic freeks want takeover american force conve islamic state guy screw trump mars manipulated s. trump great\n", | |
"\n", | |
"lol go world\n", | |
"\n", | |
"sick uncivilised inhuman\n", | |
"\n", | |
"kaun hai yeh trump rogue???.\n", | |
"\n", | |
"rescuer body friday morning\n", | |
"\n", | |
"deep condolemces brave soldier family\n", | |
"\n", | |
"kashmir death trap india army\n", | |
"\n", | |
"depa_ment homeland security test facial recognition technology biometric scanner detect suspicious traveller arrive us.\n", | |
"\n", | |
"yes need similar measure take security\n", | |
"\n", | |
"restaurant serve unwanted food\n", | |
"\n", | |
"adore like\n", | |
"\n", | |
"lot waste food fit consume esp poor\n", | |
"\n", | |
"ryan sequeira\n", | |
"\n", | |
"advaitha iyer josna d ouza john kure\n", | |
"\n", | |
"make mexico foot republicans $ 1215bn trump key election campaign pledge\n", | |
"\n", | |
"trump bring day thing mode payment sayfucktoglobalization\n", | |
"\n", | |
"forward wholesale credit mexico market price\n", | |
"\n", | |
"wall price 16 time credit\n", | |
"\n", | |
"credit sole reason production quality mexico\n", | |
"\n", | |
"potus mexico borderwall\n", | |
"\n", | |
"mexico sell japan canada instead\n", | |
"\n", | |
"thai people not like\n", | |
"\n", | |
"hit englishlanguage book unknown englishspeaking world\n", | |
"\n", | |
"bird move like ballet dancer\n", | |
"\n", | |
"arfak parotia sound gujarati\n", | |
"\n", | |
"yes nature master choreography\n", | |
"\n", | |
"woman prophesy armageddon\n", | |
"\n", | |
"iconic australian telescope begin major new search et everyday tech help locate signal\n", | |
"\n", | |
"animal scared\n", | |
"\n", | |
"romantic comedy rarely big winner oscars la_la land defy trend\n", | |
"\n", | |
"good movie extraordinary\n", | |
"\n", | |
"la_la land hollywood bollywood_movie song dance romance\n", | |
"\n", | |
"performance not brilliant ryan gosling play ryan gosling emma stone play emma stone la_la land romcom largerthanlife movie beautiful realistic\n", | |
"\n", | |
"bbc india\n", | |
"\n", | |
"not romcom\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"bigram_sentences = LineSentence(bigram_sentences_filepath)\n", | |
"for bigram_sentence in it.islice(bigram_sentences, 2020, 2150):\n", | |
" print u' '.join(bigram_sentence)\n", | |
" print u''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 319, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"trigram_model_filepath = os.path.join('fbpost_trigram_model_all')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 320, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"trigram_model = Phrases(bigram_sentences)\n", | |
"trigram_model.save(trigram_model_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 321, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"trigram_model = Phrases.load(trigram_model_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 322, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"trigram_sentence_filepath = os.path.join('fbpost_trigram_sentences.txt')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 323, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.07 s, sys: 76.5 ms, total: 1.15 s\n", | |
"Wall time: 1.2 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"with codecs.open(trigram_sentence_filepath, 'w', encoding='utf-8') as f:\n", | |
" for sentence in bigram_sentences:\n", | |
" trigram_sentence = u' '.join(trigram_model[sentence])\n", | |
" f.write(trigram_sentence+'\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 324, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"trigram_sentences = LineSentence(trigram_sentence_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 329, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"undo achieve talk theresa donald_trump say petition get 900,000 signature react muslim country strict trump\n", | |
"\n", | |
"people potus visit uk mainly work intent\n", | |
"\n", | |
"extensive method brexit calculations\n", | |
"\n", | |
"boe understand people intention\n", | |
"\n", | |
"go wrong liverpool fc\n", | |
"\n", | |
"official ningbo zoo shoot tiger dead attack man enter enclosure shoot person enter enclosed area sound hea f\n", | |
"\n", | |
"not feel pity idiot r.i.p.\n", | |
"\n", | |
"tiger\n", | |
"\n", | |
"wish not jump area\n", | |
"\n", | |
"alive poor animal pay stupidity mankind idiot b4 shoot tiger compare population tiger population human china\n", | |
"\n", | |
"murder poor tiger fault shame zoo authorities\n", | |
"\n", | |
"tiger bite dust coz lunatic decide hello agree enclosure animal personal space\n", | |
"\n", | |
"intruder shoot occupant bc tranquilizer gun hear .then k run zoo shoot tiger\n", | |
"\n", | |
"fault man enter territory\n", | |
"\n", | |
"hv shoot man enter zoo shame thm\n", | |
"\n", | |
"shoot man instead idiots shoot man\n", | |
"\n", | |
"animal die stupidty man\n", | |
"\n", | |
"can_not_believe people not hv mercy animala\n", | |
"\n", | |
"stupidity height\n", | |
"\n", | |
"let tiger home\n", | |
"\n", | |
"bbc china dear\n", | |
"\n", | |
"china\n", | |
"\n", | |
"intense\n", | |
"\n", | |
"widely condemn\n", | |
"\n", | |
"ban ce_ainly save america jihadist terrorism country want save land islamic jihadist terrorism suppo ban enforce ban country\n", | |
"\n", | |
"president trump true nationalist patriot\n", | |
"\n", | |
"world want work interest wait sick hindu pandits suppo idea\n", | |
"\n", | |
"sick hindu comment come\n", | |
"\n", | |
"course go set\n", | |
"\n", | |
"course go feel sick hour\n", | |
"\n", | |
"fantastic match ws fantastic fifth federer\n", | |
"\n", | |
"final naal fully unexpected history nice fifth australian opens\n", | |
"\n", | |
"avignyan mukherjee read caption give icle\n", | |
"\n", | |
"course go feel sick hour\n", | |
"\n", | |
"fantastic match\n", | |
"\n", | |
"simple strong way r best\n", | |
"\n", | |
"happiness laugh matter great initiative\n", | |
"\n", | |
"elevated thought\n", | |
"\n", | |
"dabbawalas india possibly world best food delivery service use clever numerical code ensure lunch deliver ontime right place mumbai bustling metropolis\n", | |
"\n", | |
"manage work systematically\n", | |
"\n", | |
"hip hip hurray mumbaidabbawallas\n", | |
"\n", | |
"thumbs post like\n", | |
"\n", | |
"love read\n", | |
"\n", | |
"probably best management skill\n", | |
"\n", | |
"steph moore michela bernasconi\n", | |
"\n", | |
"secret japan hern wilderness\n", | |
"\n", | |
"trick forget\n", | |
"\n", | |
"donald_trump relationship television date 30_year\n", | |
"\n", | |
"happy country expat\n", | |
"\n", | |
"karan koshy check\n", | |
"\n", | |
"lifeaffirming action leap\n", | |
"\n", | |
"surprising power tiny brain\n", | |
"\n", | |
"happy return day\n", | |
"\n", | |
"god_bless abundantly kerushna bharucha hahahaha\n", | |
"\n", | |
"mystery solve know bumble bee\n", | |
"\n", | |
"not depend size depend power\n", | |
"\n", | |
"radical plan defeat pove_y. think efficient service delivery require money poor pove aggaravated series drought natural calamity\n", | |
"\n", | |
"develop_country instead focus agricultural reform improve employment productivity good icle\n", | |
"\n", | |
"idea alleviate pove welcome\n", | |
"\n", | |
"human nature doubt idea work practice\n", | |
"\n", | |
"welfare state pa idea look society create guessing pollution skyrocket setup environment easy live free instead willing live nature\n", | |
"\n", | |
"plant free nongmo fruit forest city edible mushroom variety plant fall log\n", | |
"\n", | |
"legalize horse city border campingteepee crown land fishing cook eat free.\n", | |
"\n", | |
"india afford need end scheme suppose tackle pov y. yeh bbc not spread propaganda ur country freebie promote lethargy state risk economy\n", | |
"\n", | |
"provide basic money tackle pove good put money circulation economic instability\n", | |
"\n", | |
"state focus provide basic amenities\n", | |
"\n", | |
"firstly definition pove_y\n", | |
"\n", | |
"basic income sufficient?.\n", | |
"\n", | |
"people stop work state bankrupt\n", | |
"\n", | |
"universal basic income plan distributive justice\n", | |
"\n", | |
"bring communism straight forward\n", | |
"\n", | |
"quiet return pen paper\n", | |
"\n", | |
"death valley bizarre landscape\n", | |
"\n", | |
"know afterlife real\n", | |
"\n", | |
"issue\n", | |
"\n", | |
"truth remain truth opinion count\n", | |
"\n", | |
"yes life life real reality\n", | |
"\n", | |
"afterlife paul anindya\n", | |
"\n", | |
"sure\n", | |
"\n", | |
"have\n", | |
"\n", | |
"have prove god not exist?.\n", | |
"\n", | |
"yes life death,,we face that,.\n", | |
"\n", | |
"learn wise\n", | |
"\n", | |
"torah proverbs tree life\n", | |
"\n", | |
"read headline thrice\n", | |
"\n", | |
"can_not_believe lol\n", | |
"\n", | |
"fast commercial train planet\n", | |
"\n", | |
"haha china\n", | |
"\n", | |
"german design development sell siemens china\n", | |
"\n", | |
"rubbish wait japanese maglev china near\n", | |
"\n", | |
"man allegedly trap kick muslim airline worker say trump rid world need treatment bad day ahead muslim niharika panwala time ask usa\n", | |
"\n", | |
"answer\n", | |
"\n", | |
"punish guy lock year set example herr dt\n", | |
"\n", | |
"begin truefalse?. happen london hijab clad woman ask deboard bus\n", | |
"\n", | |
"socalled cosmopolitan city soon media destroy world\n", | |
"\n", | |
"guy need nobel peace prize proud trump suppo ers kick islamic freeks want takeover american force conve islamic state guy screw trump mars manipulated s. trump great\n", | |
"\n", | |
"lol go world\n", | |
"\n", | |
"sick uncivilised inhuman\n", | |
"\n", | |
"kaun hai yeh trump rogue???.\n", | |
"\n", | |
"rescuer body friday morning\n", | |
"\n", | |
"deep condolemces brave soldier family\n", | |
"\n", | |
"kashmir death trap india army\n", | |
"\n", | |
"depa_ment homeland security test facial recognition technology biometric scanner detect suspicious traveller arrive us.\n", | |
"\n", | |
"yes need similar measure take security\n", | |
"\n", | |
"restaurant serve unwanted food\n", | |
"\n", | |
"adore like\n", | |
"\n", | |
"lot waste food fit consume esp poor\n", | |
"\n", | |
"ryan sequeira\n", | |
"\n", | |
"advaitha iyer josna d ouza john kure\n", | |
"\n", | |
"make mexico foot republicans $ 1215bn trump key election campaign pledge\n", | |
"\n", | |
"trump bring day thing mode payment sayfucktoglobalization\n", | |
"\n", | |
"forward wholesale credit mexico market price\n", | |
"\n", | |
"wall price 16 time credit\n", | |
"\n", | |
"credit sole reason production quality mexico\n", | |
"\n", | |
"potus mexico borderwall\n", | |
"\n", | |
"mexico sell japan canada instead\n", | |
"\n", | |
"thai people not like\n", | |
"\n", | |
"hit englishlanguage book unknown englishspeaking world\n", | |
"\n", | |
"bird move like ballet dancer\n", | |
"\n", | |
"arfak parotia sound gujarati\n", | |
"\n", | |
"yes nature master choreography\n", | |
"\n", | |
"woman prophesy armageddon\n", | |
"\n", | |
"iconic australian telescope begin major new search et everyday tech help locate signal\n", | |
"\n", | |
"animal scared\n", | |
"\n", | |
"romantic comedy rarely big winner oscars la_la_land defy trend\n", | |
"\n", | |
"good movie extraordinary\n", | |
"\n", | |
"la_la_land hollywood bollywood_movie song dance romance\n", | |
"\n", | |
"performance not brilliant ryan gosling play ryan gosling emma stone play emma stone la_la_land romcom largerthanlife movie beautiful realistic\n", | |
"\n", | |
"bbc india\n", | |
"\n", | |
"not romcom\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"for unigram_sentence in it.islice(trigram_sentences, 2020, 2150):\n", | |
" print u' '.join(unigram_sentence)\n", | |
" print u''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"for parsed_post in parser.pipe(post_line(bigram_sentences_filepath),\n", | |
" batch_size=10000, n_threads=4):\n", | |
" \n", | |
" for num, entity in enumerate(parsed_post.ents):\n", | |
" print 'Entity {}:'.format(num + 1), entity, '-', entity.label_\n", | |
" print ''" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Topic Modelling: discovering the topics in the posts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 413, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from gensim.corpora import Dictionary, MmCorpus\n", | |
"from gensim.models.ldamulticore import LdaMulticore\n", | |
"import gensim\n", | |
"\n", | |
"import pyLDAvis\n", | |
"import pyLDAvis.gensim\n", | |
"import warnings\n", | |
"import cPickle as pickle" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 414, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#creating the dictionary\n", | |
"trigram_dictionary_filepath = os.path.join('trigram_dict_all.dict')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 454, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 581 ms, sys: 25 ms, total: 606 ms\n", | |
"Wall time: 624 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"\n", | |
"trigram_posts = LineSentence(trigram_sentence_filepath)\n", | |
"\n", | |
"# learn the dictionary by iterating over all of the post\n", | |
"trigram_dictionary = Dictionary(trigram_posts)\n", | |
" \n", | |
"# filter tokens that are very rare or too common from\n", | |
"# the dictionary (filter_extremes) and reassign integer ids (compactify)\n", | |
"trigram_dictionary.filter_extremes(no_below=20, no_above=0.4)\n", | |
"trigram_dictionary.compactify()\n", | |
"\n", | |
"trigram_dictionary.save(trigram_dictionary_filepath)\n", | |
" \n", | |
"# load the finished dictionary from disk\n", | |
"trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 457, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"trigram_bow_filepath = os.path.join('trigram_bow_corpus_all.mm')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 458, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def trigram_bow_generator(filepath):\n", | |
" \"\"\"\n", | |
" generator function to read post from a file\n", | |
" and yield a bag-of-words representation\n", | |
" \"\"\"\n", | |
" \n", | |
" for post in LineSentence(filepath):\n", | |
" yield trigram_dictionary.doc2bow(post)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 459, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 920 ms, sys: 83.2 ms, total: 1 s\n", | |
"Wall time: 1.11 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# generate bag-of-words representations for\n", | |
"# all post and save them as a matrix\n", | |
"MmCorpus.serialize(trigram_bow_filepath,\n", | |
" trigram_bow_generator(trigram_sentence_filepath))\n", | |
" \n", | |
"# load the finished bag-of-words corpus from disk\n", | |
"trigram_bow_corpus = MmCorpus(trigram_bow_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 460, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"lda_model_filepath = os.path.join('lda_model_all')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 461, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"WARNING:gensim.models.ldamulticore:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4.16 s, sys: 2.68 s, total: 6.85 s\n", | |
"Wall time: 7.48 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"\n", | |
"with warnings.catch_warnings():\n", | |
" warnings.simplefilter('ignore')\n", | |
" \n", | |
" # workers => sets the parallelism, and should be\n", | |
" # set to your number of physical cores minus one\n", | |
" lda = LdaMulticore(trigram_bow_corpus,\n", | |
" num_topics=20,\n", | |
" id2word=trigram_dictionary,\n", | |
" workers=3)\n", | |
" \n", | |
"lda.save(lda_model_filepath)\n", | |
" \n", | |
"# load the finished LDA model from disk\n", | |
"lda = LdaMulticore.load(lda_model_filepath)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 462, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(0,\n", | |
" u'0.031*\"india\" + 0.020*\"business\" + 0.017*\"bbc\" + 0.016*\"indian\" + 0.013*\"people\" + 0.011*\"lot\" + 0.011*\"get\" + 0.010*\"phone\" + 0.008*\"brand\" + 0.008*\"not\"'),\n", | |
" (1,\n", | |
" u'0.020*\"happen\" + 0.017*\"great\" + 0.013*\"bbc\" + 0.013*\"bbc_travel\" + 0.012*\"india\" + 0.009*\"mobile_phone\" + 0.009*\"world\" + 0.009*\"people\" + 0.009*\"repo\" + 0.009*\"fast\"'),\n", | |
" (2,\n", | |
" u'0.031*\"india\" + 0.029*\"people\" + 0.022*\"work\" + 0.020*\"car\" + 0.017*\"indian\" + 0.013*\"stop\" + 0.013*\"not\" + 0.012*\"bbc\" + 0.010*\"need\" + 0.010*\"country\"'),\n", | |
" (3,\n", | |
" u'0.032*\"india\" + 0.030*\"people\" + 0.016*\"live\" + 0.014*\"like\" + 0.013*\"woman\" + 0.013*\"channel\" + 0.011*\"hi\" + 0.010*\"not\" + 0.010*\"wonder\" + 0.009*\"good\"'),\n", | |
" (4,\n", | |
" u'0.059*\"india\" + 0.019*\"woman\" + 0.014*\"best\" + 0.013*\"man\" + 0.012*\"world\" + 0.010*\"air\" + 0.009*\"girl\" + 0.008*\"kashmir\" + 0.008*\"oh\" + 0.008*\"point\"'),\n", | |
" (5,\n", | |
" u'0.029*\"time\" + 0.022*\"look\" + 0.021*\"india\" + 0.020*\"gear\" + 0.018*\"love\" + 0.013*\"hi\" + 0.013*\"good\" + 0.013*\"device\" + 0.011*\"suppo\" + 0.011*\"guy\"'),\n", | |
" (6,\n", | |
" u'0.020*\"new\" + 0.019*\"dance\" + 0.017*\"come\" + 0.014*\"india\" + 0.013*\"sad\" + 0.012*\"recent\" + 0.011*\"series\" + 0.010*\"watch\" + 0.010*\"day\" + 0.010*\"enjoy\"'),\n", | |
" (7,\n", | |
" u'0.037*\"indian\" + 0.026*\"like\" + 0.017*\"india\" + 0.015*\"people\" + 0.015*\"come\" + 0.011*\"know\" + 0.011*\"month\" + 0.010*\"new\" + 0.009*\"good\" + 0.008*\"way\"'),\n", | |
" (8,\n", | |
" u'0.050*\"india\" + 0.017*\"bbc\" + 0.016*\"people\" + 0.016*\"think\" + 0.015*\"indian\" + 0.013*\"world\" + 0.012*\"fan\" + 0.010*\"time\" + 0.010*\"great\" + 0.009*\"call\"'),\n", | |
" (9,\n", | |
" u'0.047*\"like\" + 0.020*\"india\" + 0.016*\"day\" + 0.014*\"come\" + 0.013*\"way\" + 0.013*\"thank\" + 0.012*\"bring\" + 0.011*\"country\" + 0.010*\"thing\" + 0.010*\"travel\"'),\n", | |
" (10,\n", | |
" u'0.043*\"india\" + 0.013*\"indian\" + 0.012*\"need\" + 0.012*\"people\" + 0.010*\"go\" + 0.010*\"world\" + 0.009*\"woman\" + 0.009*\"year\" + 0.009*\"not\" + 0.009*\"night\"'),\n", | |
" (11,\n", | |
" u'0.030*\"world\" + 0.021*\"pa\" + 0.018*\"human\" + 0.018*\"thanks\" + 0.018*\"yes\" + 0.014*\"indian\" + 0.014*\"bbc\" + 0.014*\"nice\" + 0.013*\"programme\" + 0.012*\"great\"'),\n", | |
" (12,\n", | |
" u'0.022*\"country\" + 0.012*\"reach\" + 0.011*\"india\" + 0.011*\"people\" + 0.011*\"time\" + 0.010*\"poor\" + 0.009*\"get\" + 0.008*\"village\" + 0.008*\"connection\" + 0.008*\"check\"'),\n", | |
" (13,\n", | |
" u'0.050*\"india\" + 0.028*\"city\" + 0.013*\"love\" + 0.012*\"life\" + 0.011*\"bbc\" + 0.010*\"way\" + 0.010*\"watch\" + 0.009*\"build\" + 0.009*\"technology\" + 0.008*\"make\"'),\n", | |
" (14,\n", | |
" u'0.033*\"india\" + 0.024*\"indian\" + 0.022*\"not\" + 0.016*\"people\" + 0.012*\"home\" + 0.012*\"say\" + 0.011*\"like\" + 0.011*\"high\" + 0.010*\"happy\" + 0.009*\"need\"'),\n", | |
" (15,\n", | |
" u'0.022*\"go\" + 0.020*\"people\" + 0.020*\"photographer\" + 0.016*\"life\" + 0.013*\"doctor\" + 0.012*\"india\" + 0.012*\"year\" + 0.011*\"think\" + 0.010*\"check\" + 0.009*\"hope\"'),\n", | |
" (16,\n", | |
" u'0.045*\"india\" + 0.021*\"not\" + 0.019*\"bbc\" + 0.011*\"country\" + 0.010*\"series\" + 0.010*\"world\" + 0.009*\"think\" + 0.009*\"people\" + 0.008*\"want\" + 0.008*\"travel\"'),\n", | |
" (17,\n", | |
" u'0.033*\"come\" + 0.030*\"india\" + 0.020*\"not\" + 0.020*\"know\" + 0.018*\"want\" + 0.013*\"bbc\" + 0.010*\"life\" + 0.010*\"work\" + 0.009*\"try\" + 0.009*\"help\"'),\n", | |
" (18,\n", | |
" u'0.019*\"car\" + 0.014*\"india\" + 0.014*\"company\" + 0.011*\"people\" + 0.011*\"wow\" + 0.011*\"site\" + 0.011*\"live\" + 0.011*\"country\" + 0.010*\"world\" + 0.010*\"miss\"'),\n", | |
" (19,\n", | |
" u'0.090*\"india\" + 0.018*\"grow\" + 0.015*\"bbc\" + 0.015*\"british\" + 0.014*\"season\" + 0.010*\"take\" + 0.009*\"sta\" + 0.009*\"help\" + 0.009*\"world\" + 0.008*\"place\"')]" | |
] | |
}, | |
"execution_count": 462, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"lda.print_topics(-1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 463, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"tfidf_model = gensim.models.TfidfModel(trigram_bow_corpus, id2word=trigram_dictionary)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 484, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 33.1 s, sys: 1.94 s, total: 35.1 s\n", | |
"Wall time: 19.5 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"lsi_model = gensim.models.LsiModel(tfidf_model[trigram_bow_corpus], id2word=trigram_dictionary, num_topics=3000)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 485, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def explore_topic_lsi(topic_number, topn=25):\n", | |
" \"\"\"\n", | |
" accept a user-supplied topic number and\n", | |
" print out a formatted list of the top terms\n", | |
" \"\"\"\n", | |
" \n", | |
" print u'{:20} {}'.format(u'term', u'frequency') + u'\\n'\n", | |
"\n", | |
" for term, frequency in lsi_model.show_topic(topic_number, topn=25):\n", | |
" print u'{:20} {:.3f}'.format(term, round(frequency, 3))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 486, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def explore_topic(topic_number, topn=25):\n", | |
" \"\"\"\n", | |
" accept a user-supplied topic number and\n", | |
" print out a formatted list of the top terms\n", | |
" \"\"\"\n", | |
" \n", | |
" print u'{:20} {}'.format(u'term', u'frequency') + u'\\n'\n", | |
"\n", | |
" for term, frequency in lda.show_topic(topic_number, topn=25):\n", | |
" print u'{:20} {:.3f}'.format(term, round(frequency, 3))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 487, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"term frequency\n", | |
"\n", | |
"india 0.090\n", | |
"grow 0.018\n", | |
"bbc 0.015\n", | |
"british 0.015\n", | |
"season 0.014\n", | |
"take 0.010\n", | |
"sta 0.009\n", | |
"help 0.009\n", | |
"world 0.009\n", | |
"place 0.008\n", | |
"like 0.008\n", | |
"discover 0.007\n", | |
"today 0.007\n", | |
"play 0.007\n", | |
"channel 0.006\n", | |
"delhi 0.006\n", | |
"say 0.006\n", | |
"news 0.006\n", | |
"thing 0.005\n", | |
"work 0.005\n", | |
"dr 0.005\n", | |
"pa 0.005\n", | |
"human 0.005\n", | |
"late 0.005\n", | |
"performance 0.005\n" | |
] | |
} | |
], | |
"source": [ | |
"explore_topic(topic_number=19)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 488, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"term frequency\n", | |
"\n", | |
"ask -0.329\n", | |
"police -0.322\n", | |
"bjp 0.305\n", | |
"win 0.211\n", | |
"picture 0.198\n", | |
"hindu 0.189\n", | |
"save -0.188\n", | |
"lot -0.176\n", | |
"question -0.169\n", | |
"guy -0.169\n", | |
"animal -0.167\n", | |
"hea 0.162\n", | |
"home 0.150\n", | |
"respect -0.126\n", | |
"poor 0.125\n", | |
"delhi -0.123\n", | |
"god_bless 0.120\n", | |
"water -0.110\n", | |
"miss 0.104\n", | |
"believe 0.102\n", | |
"word 0.101\n", | |
"suppo -0.093\n", | |
"kashmir -0.088\n", | |
"ban -0.087\n", | |
"job -0.087\n" | |
] | |
} | |
], | |
"source": [ | |
"explore_topic_lsi(topic_number=100)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 500, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"term frequency\n", | |
"\n", | |
"hold 0.232\n", | |
"destroy 0.199\n", | |
"temple 0.181\n", | |
"political 0.180\n", | |
"tiger 0.173\n", | |
"shoot -0.166\n", | |
"crime 0.161\n", | |
"step 0.148\n", | |
"village 0.139\n", | |
"democracy -0.131\n", | |
"fool -0.128\n", | |
"white -0.128\n", | |
"student -0.124\n", | |
"wear 0.123\n", | |
"enjoy -0.122\n", | |
"level 0.119\n", | |
"away 0.115\n", | |
"sign -0.113\n", | |
"arrest -0.112\n", | |
"product -0.111\n", | |
"set 0.107\n", | |
"number -0.104\n", | |
"action -0.102\n", | |
"president -0.102\n", | |
"example 0.100\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"NoneType" | |
] | |
}, | |
"execution_count": 500, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"type(explore_topic_lsi(topic_number=300))#700" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 502, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"list" | |
] | |
}, | |
"execution_count": 502, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"type(lsi_model.show_topic(300))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment