Skip to content

Instantly share code, notes, and snippets.

@jaidevd
Created October 2, 2012 03:47
Show Gist options
  • Save jaidevd/3816053 to your computer and use it in GitHub Desktop.
Basic extraction, analysis and visualization of PyconIndia 2012 tweets
#!/usr/bin/env python
import os
import json
import numpy as np
from pandas import DataFrame, concat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from matplotlib.pyplot import imshow, plot, show, figure, title, yticks
# Download this file from https://gist.github.com/3809261
# NOTE(review): the URL above looks extraction-mangled; presumably it was
# https://gist.github.com/3809261 -- confirm before relying on it.
###############################################################################
# Decoding the json file: read one JSON-encoded tweet per line into a list
# of dicts. The file is newline-delimited JSON, not a single JSON document.
###############################################################################
filename = 'pyconindia2012.json'  # os.path.join() with one argument was a no-op
# Context manager guarantees the handle is closed (the original leaked `f`).
with open(filename, 'r') as f:
    tweets = [json.loads(line) for line in f]
###############################################################################
# Pull the per-tweet fields of interest into parallel lists, assemble them
# into a pandas DataFrame, and persist it as a spreadsheet.
###############################################################################
tweeters = [tweet['from_user_name'] for tweet in tweets]
texts = [tweet['text'] for tweet in tweets]
timestamps = [tweet['created_at'] for tweet in tweets]
metadata = [tweet['metadata']['result_type'] for tweet in tweets]
df = DataFrame({
    'tweeters': tweeters,
    'texts': texts,
    'timestamps': timestamps,
    'metadata': metadata,
})
df.to_excel('pycontweets_pandas.xls')
###############################################################################
# Processing the text in tweets to remove redundancies
###############################################################################
# A word is dropped if it *contains* any of these substrings -- note this is a
# substring test, not a whole-word test, so e.g. 'rt' also removes 'start'.
# (Kept from the original; tighten to whole-word matching if that matters.)
unchars = ['@', '#', 'http', 'rt', 'RT', 'and', 'at', 'by', 'for', 'in', 'is',
           'of', 'on', 'the', 'two', 'with', 'to', 'was', 'day', 'will', 'it',
           'who', 'had']
proc_text = []
for text in texts:
    # all() replaces the original np.prod() over a boolean list: clearer,
    # and it short-circuits on the first hit.
    kept = [word for word in text.split(' ')
            if all(unchar not in word for unchar in unchars)]
    # ' '.join replaces the original quadratic `s += word + ' '` loop; the
    # original's trailing space is dropped, which is irrelevant to the
    # whitespace-tokenizing TfidfVectorizer downstream.
    proc_text.append(' '.join(kept))
df['texts'] = proc_text
###############################################################################
# Tokenizing and analyzing the text in tweets: fit a tf-idf model, let the
# user pick a weight threshold from a plot, and image-plot the tf-idf matrix
# with the selected keywords as row labels.
###############################################################################
vectorizer = TfidfVectorizer()
text_vectorized = vectorizer.fit_transform(proc_text)
# Column sums = total tf-idf weight of each vocabulary term over all tweets.
tv_sum = np.sum(text_vectorized.toarray(), axis=0)
plot(tv_sum)
show()
# BUG FIX: input() returns a string in Python 3; convert before comparing
# against the float array (the original Python 2 input() eval'd the text).
thresh = float(input('Enter Threshold:\n'))
inds = tv_sum > thresh
keyword_inds = [i for i, above in enumerate(inds) if above]
selected = set(keyword_inds)  # O(1) membership instead of list scans
keywords = [key for key, idx in vectorizer.vocabulary_.items()
            if idx in selected]
# Drop keywords shorter than 3 characters. BUG FIX: the original removed
# elements from `keywords` while iterating it, silently skipping items, and
# it paired an ascending `keyword_inds` with dict-ordered `keywords` so the
# yticks labels could land on the wrong rows. Build aligned lists instead.
filtered = [(kw, vectorizer.vocabulary_[kw]) for kw in keywords
            if len(kw) >= 3]
keywords = [kw for kw, _ in filtered]
keyword_inds = [idx for _, idx in filtered]
imshow(text_vectorized.toarray().T, aspect='auto')
yticks(keyword_inds, tuple(keywords), rotation=0)
title('Image Plot of Words in tweets')
show()
# Making a PCA plot: project the dense tf-idf matrix onto its first two
# principal components and scatter the tweets in that plane.
dense_tfidf = text_vectorized.toarray()
pca = PCA(2, whiten=True)
pc_red = pca.fit_transform(dense_tfidf)
figure()
xs, ys = pc_red[:, 0], pc_red[:, 1]
plot(xs, ys, 'ro')
title('PCA Plot')
show()
# Performing K-means clustering on the 2-D PCA projection, then plotting the
# points (blue) with the cluster centres overlaid (red crosses).
# BUG FIX: the original prompted for a cluster count and then ignored it,
# hard-coding KMeans(2). Honor the user's choice; int() conversion is needed
# because input() returns a string in Python 3.
k = int(input('Input cluster numbers:\n'))
km = KMeans(k)
km.fit(pc_red)
figure()
plot(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 'r+', markersize=20)
plot(pc_red[:, 0], pc_red[:, 1], 'bo')
show()
@jaidevd
Copy link
Author

jaidevd commented Feb 18, 2013

Whoa... saw this comment just now, five months later. Don't even remember what I did in that script!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment