Created
July 24, 2017 03:50
-
-
Save santteegt/dbaf12b5e01e0e49f8536014ff760493 to your computer and use it in GitHub Desktop.
Data Incubator fellowship - My Project data analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import nltk\n", | |
"from bs4 import BeautifulSoup\n", | |
"import re\n", | |
"import os\n", | |
"import codecs\n", | |
"from sklearn import feature_extraction\n", | |
"import mpld3" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read every line of the dump files as one JSON document.\n", | |
"# Lines that are not valid JSON are counted, not loaded.\n", | |
"files = ['tweets_enlace130517_.txt', 'tweets_enlace200517_.txt']\n", | |
"raw_tweets = []\n", | |
"bad_coded_tweets = 0  # number of lines json.loads() rejected\n", | |
"for fname in files:\n", | |
"    # State the encoding explicitly so decoding does not depend on the locale.\n", | |
"    # The `with` block closes the file; no explicit close() is needed.\n", | |
"    with open('twitter_data/%s' % fname, 'r', encoding='utf-8') as f:\n", | |
"        for line in f:  # stream the file instead of loading it with readlines()\n", | |
"            try:\n", | |
"                raw_tweets.append(json.loads(line))\n", | |
"            except ValueError:  # json.JSONDecodeError is a ValueError subclass\n", | |
"                bad_coded_tweets += 1\n", | |
"print(bad_coded_tweets)\n", | |
"len(raw_tweets)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Keep only the fields used below: the tweet id, the user's name prefixed\n", | |
"# with '@', and the tweet text.\n", | |
"tweet_text = []\n", | |
"for tw in raw_tweets:\n", | |
"    record = {'id': tw['id'], 'name': '@' + tw['user']['name'], 'text': tw['text']}\n", | |
"    tweet_text.append(record)\n", | |
"len(tweet_text)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# The records are flat dicts, so the plain DataFrame constructor is enough.\n", | |
"# pd.io.json.json_normalize has been deprecated since pandas 1.0.\n", | |
"# Naming the columns keeps the frame well-formed even when no tweet was loaded.\n", | |
"df = pd.DataFrame(tweet_text, columns=['id', 'name', 'text'])\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# One running number per row of df, starting at 0.\n", | |
"ranks = range(len(df))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from nltk.stem.snowball import SnowballStemmer\n", | |
"\n", | |
"# Spanish Snowball stemmer and Spanish stop-word list used by the cells below.\n", | |
"stemmer = SnowballStemmer('spanish')\n", | |
"stopwords = nltk.corpus.stopwords.words('spanish')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# A token is kept only if it contains at least one ASCII letter. This drops\n", | |
"# numbers and raw punctuation, and also tokens made solely of accented letters.\n", | |
"_HAS_LETTER = re.compile('[a-zA-Z]')\n", | |
"\n", | |
"\n", | |
"def tokenize_only(text, language='spanish'):\n", | |
"    \"\"\"Split ``text`` into lower-cased word tokens that contain a letter.\n", | |
"\n", | |
"    The text is tokenized by sentence first and then by word, so that\n", | |
"    punctuation ends up as its own token and can be filtered out.\n", | |
"\n", | |
"    Args:\n", | |
"        text: the string to tokenize.\n", | |
"        language: name of the NLTK Punkt model used for sentence splitting.\n", | |
"            Defaults to Spanish, matching the stemmer and stop-word list.\n", | |
"\n", | |
"    Returns:\n", | |
"        A list of lower-cased tokens, in order of appearance.\n", | |
"    \"\"\"\n", | |
"    tokens = [word.lower()\n", | |
"              for sent in nltk.sent_tokenize(text, language=language)\n", | |
"              for word in nltk.word_tokenize(sent, language=language)]\n", | |
"    return [token for token in tokens if _HAS_LETTER.search(token)]\n", | |
"\n", | |
"\n", | |
"def tokenize_and_stem(text, language='spanish'):\n", | |
"    \"\"\"Tokenize ``text`` with ``tokenize_only`` and stem every token.\n", | |
"\n", | |
"    Delegating to ``tokenize_only`` guarantees that both functions return\n", | |
"    lists of the same length for the same text.\n", | |
"\n", | |
"    Args:\n", | |
"        text: the string to tokenize.\n", | |
"        language: forwarded to ``tokenize_only``.\n", | |
"\n", | |
"    Returns:\n", | |
"        A list with the stem of each kept token, in order of appearance.\n", | |
"    \"\"\"\n", | |
"    return [stemmer.stem(token) for token in tokenize_only(text, language=language)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"totalvocab_stemmed = []\n", | |
"totalvocab_tokenized = []\n", | |
"for text in df['text'].values:\n", | |
"    # Tokenize each tweet once and stem those same tokens. This gives the\n", | |
"    # same result as calling tokenize_and_stem() separately, without\n", | |
"    # tokenizing twice, and keeps the two lists aligned entry for entry.\n", | |
"    words = tokenize_only(text)\n", | |
"    totalvocab_tokenized.extend(words)\n", | |
"    totalvocab_stemmed.extend(stemmer.stem(word) for word in words)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Lookup table: the index holds the stems, the 'words' column the tokens they came from.\n", | |
"df_vocab = pd.DataFrame(totalvocab_tokenized, index=totalvocab_stemmed, columns=['words'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# (rows, columns) of the stem -> word lookup table.\n", | |
"df_vocab.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"\n", | |
"# The vectorizer removes stop words AFTER running the custom tokenizer, so it\n", | |
"# compares them with stemmed tokens. The stop list is therefore passed through\n", | |
"# the same tokenizer; otherwise most stop words would never match.\n", | |
"stemmed_stopwords = sorted({stem for word in stopwords for stem in tokenize_and_stem(word)})\n", | |
"\n", | |
"# Terms in more than 80% or fewer than 2% of the tweets are ignored;\n", | |
"# unigrams, bigrams and trigrams are all used as features.\n", | |
"tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,\n", | |
"                                   min_df=0.02, stop_words=stemmed_stopwords,\n", | |
"                                   use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))\n", | |
"\n", | |
"%time tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'].values)\n", | |
"\n", | |
"print(tfidf_matrix.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# get_feature_names() was removed in scikit-learn 1.2. Use its replacement\n", | |
"# when available and fall back only on installations that predate it.\n", | |
"if hasattr(tfidf_vectorizer, 'get_feature_names_out'):\n", | |
"    terms = list(tfidf_vectorizer.get_feature_names_out())\n", | |
"else:\n", | |
"    terms = tfidf_vectorizer.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics.pairwise import cosine_similarity\n", | |
"\n", | |
"# Pairwise cosine distance between tweets: one minus the cosine similarity\n", | |
"# of their TF-IDF rows.\n", | |
"dist = np.subtract(1, cosine_similarity(tfidf_matrix))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.cluster import KMeans\n", | |
"\n", | |
"num_clusters = 5\n", | |
"# k-means starts from random centroids. Without a fixed seed the cluster\n", | |
"# numbering and membership change on every run, so results below could not\n", | |
"# be reproduced. The seed matches the one used for MDS further down.\n", | |
"km = KMeans(n_clusters=num_clusters, random_state=1)\n", | |
"%time km.fit(tfidf_matrix)\n", | |
"# Cluster id assigned to each tweet, in the row order of tfidf_matrix.\n", | |
"clusters = km.labels_.tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Per-tweet table indexed by cluster id. 'text' is carried in the dict but\n", | |
"# is not among the selected columns, so it does not appear in the frame.\n", | |
"films = dict(id=df['id'].values, name=df['name'].values, rank=ranks,\n", | |
"             text=df['text'].values, cluster=clusters)\n", | |
"frame = pd.DataFrame(films, columns=['rank', 'id', 'name', 'cluster'], index=[clusters])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Number of tweets assigned to each cluster.\n", | |
"frame['cluster'].value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Average 'rank' value of the tweets in each cluster.\n", | |
"cluster_labels = frame['cluster']\n", | |
"rank_values = frame['rank']\n", | |
"grouped = rank_values.groupby(cluster_labels)\n", | |
"grouped.mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from __future__ import print_function\n", | |
"\n", | |
"# First surface word recorded for every stem, so readable words are printed\n", | |
"# instead of stems. This replaces the DataFrame.ix lookup, which no longer\n", | |
"# exists in current pandas.\n", | |
"stem_to_word = df_vocab[~df_vocab.index.duplicated(keep='first')]['words'].to_dict()\n", | |
"\n", | |
"print('Top terms per cluster:')\n", | |
"print()\n", | |
"# Feature indices of every centroid, sorted by descending weight.\n", | |
"order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", | |
"for i in range(num_clusters):\n", | |
"    print('Cluster %d words:' % i, end='')\n", | |
"    for ind in order_centroids[i, :6]:\n", | |
"        # As before, an n-gram is labelled by the word of its first stem.\n", | |
"        # Fall back to the stem itself if it is missing from the table.\n", | |
"        first_stem = terms[ind].split(' ')[0]\n", | |
"        # No .encode() here: on Python 3 it would print the bytes repr b'...'.\n", | |
"        print(' %s' % stem_to_word.get(first_stem, first_stem), end=',')\n", | |
"    print()\n", | |
"    print()\n", | |
"    print('Cluster %d names:' % i, end='')\n", | |
"    # Select rows through the 'cluster' column so the lookup does not depend\n", | |
"    # on how the frame is indexed.\n", | |
"    for name in frame['name'].values[frame['cluster'].values == i].tolist():\n", | |
"        print(' %s,' % name, end='')\n", | |
"    print()\n", | |
"    print()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# 1-based copy of the 0-based 'rank' column.\n", | |
"frame['Rank'] = 1 + frame['rank']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import matplotlib as mpl\n", | |
"\n", | |
"from sklearn.manifold import MDS\n", | |
"\n", | |
"# Two components because the points are plotted on a two-dimensional plane.\n", | |
"# 'precomputed' because `dist` already is the pairwise distance matrix.\n", | |
"# A fixed `random_state` keeps the layout reproducible.\n", | |
"mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1)\n", | |
"\n", | |
"pos = mds.fit_transform(dist)  # shape (n_samples, n_components)\n", | |
"\n", | |
"# One x and one y coordinate per tweet.\n", | |
"xs, ys = pos[:, 0], pos[:, 1]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from nltk.tag import pos_tag\n", | |
"\n", | |
"def strip_proppers_POS(text):\n", | |
"    \"\"\"Return the words of ``text`` that are not tagged as proper nouns.\n", | |
"\n", | |
"    The text is split on whitespace and tagged with NLTK's part-of-speech\n", | |
"    tagger; words tagged 'NNP' or 'NNPS' are dropped.\n", | |
"    \"\"\"\n", | |
"    proper_tags = ('NNP', 'NNPS')\n", | |
"    kept = []\n", | |
"    for word, tag in pos_tag(text.split()):\n", | |
"        if tag not in proper_tags:\n", | |
"            kept.append(word)\n", | |
"    return kept" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"class TopToolbar(mpld3.plugins.PluginBase):\n", | |
" \"\"\"mpld3 plugin that repositions the figure toolbar.\n", | |
"\n", | |
" The JavaScript below draws the toolbar once, sets its x and y attributes\n", | |
" to 150 and 400, and then replaces the toolbar's draw function with a\n", | |
" no-op so that it is not drawn again.\n", | |
" \"\"\"\n", | |
"\n", | |
" JAVASCRIPT = \"\"\"\n", | |
" mpld3.register_plugin(\"toptoolbar\", TopToolbar);\n", | |
" TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);\n", | |
" TopToolbar.prototype.constructor = TopToolbar;\n", | |
" function TopToolbar(fig, props){\n", | |
" mpld3.Plugin.call(this, fig, props);\n", | |
" };\n", | |
"\n", | |
" TopToolbar.prototype.draw = function(){\n", | |
" // the toolbar svg doesn't exist\n", | |
" // yet, so first draw it\n", | |
" this.fig.toolbar.draw();\n", | |
"\n", | |
" // then change the y position to be\n", | |
" // at the top of the figure\n", | |
" this.fig.toolbar.toolbar.attr(\"x\", 150);\n", | |
" this.fig.toolbar.toolbar.attr(\"y\", 400);\n", | |
"\n", | |
" // then remove the draw function,\n", | |
" // so that it is not called again\n", | |
" this.fig.toolbar.draw = function() {}\n", | |
" }\n", | |
" \"\"\"\n", | |
" # The \"type\" value is the name passed to mpld3.register_plugin in JAVASCRIPT.\n", | |
" def __init__(self):\n", | |
" self.dict_ = {\"type\": \"toptoolbar\"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from html import escape\n", | |
"\n", | |
"# Tooltip text for every point.\n", | |
"# NOTE(review): `titles` was never defined in this notebook; the user names are\n", | |
"# used here -- confirm that this is the intended label.\n", | |
"# The names are escaped because PointHTMLTooltip renders its labels as HTML.\n", | |
"titles = [escape(name) for name in frame['name'].values]\n", | |
"\n", | |
"# Legend label and colour per cluster id. These were also undefined, so the\n", | |
"# cell raised NameError on a fresh kernel. The palette is reused cyclically\n", | |
"# if there are more clusters than colours.\n", | |
"palette = ['#1b9e77', '#d95f02', '#7570b3', '#e7298a', '#66a61e']\n", | |
"cluster_ids = sorted(set(clusters))\n", | |
"cluster_names = {c: 'Cluster %d' % c for c in cluster_ids}\n", | |
"cluster_colors = {c: palette[k % len(palette)] for k, c in enumerate(cluster_ids)}\n", | |
"\n", | |
"# Use a separate name so the tweet DataFrame `df` from earlier cells is not\n", | |
"# overwritten and those cells can still be re-run.\n", | |
"plot_df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))\n", | |
"\n", | |
"# group by cluster\n", | |
"groups = plot_df.groupby('label')\n", | |
"\n", | |
"# custom css to format the font and to hide the axes\n", | |
"css = \"\"\"\n", | |
"text.mpld3-text, div.mpld3-tooltip {\n", | |
"  font-family:Arial, Helvetica, sans-serif;\n", | |
"}\n", | |
"\n", | |
"g.mpld3-xaxis, g.mpld3-yaxis {\n", | |
"display: none; }\n", | |
"\"\"\"\n", | |
"\n", | |
"fig, ax = plt.subplots(figsize=(14, 6))  # set plot size\n", | |
"ax.margins(0.03)  # 3% padding around the data when autoscaling\n", | |
"ax.set_aspect('auto')\n", | |
"\n", | |
"# One layer of points per cluster, each with its own tooltip.\n", | |
"for name, group in groups:\n", | |
"    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18,\n", | |
"                     label=cluster_names[name], mec='none', color=cluster_colors[name])\n", | |
"    labels = list(group.title)\n", | |
"    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,\n", | |
"                                             voffset=10, hoffset=10, css=css)\n", | |
"    mpld3.plugins.connect(fig, tooltip)\n", | |
"\n", | |
"# Figure-wide settings are applied once, not once per cluster.\n", | |
"mpld3.plugins.connect(fig, TopToolbar())\n", | |
"ax.xaxis.set_ticks([])\n", | |
"ax.yaxis.set_ticks([])\n", | |
"ax.xaxis.set_visible(False)\n", | |
"ax.yaxis.set_visible(False)\n", | |
"\n", | |
"ax.legend(numpoints=1)  # show legend with only one dot\n", | |
"\n", | |
"mpld3.display()  # show the plot\n", | |
"\n", | |
"# uncomment the below to export to html\n", | |
"#html = mpld3.fig_to_html(fig)\n", | |
"#print(html)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Intentionally empty scratch cell." | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda env:NLP]", | |
"language": "python", | |
"name": "conda-env-NLP-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment