jaidevd · October 2, 2012 03:47 · kracekumar · Oct 2, 2012 · jaidevd · Feb 18, 2013
diff --git a/pycon.py b/pycon.py
 #!/usr/bin/env python

 import os
 import json
 import numpy as np
 from pandas import DataFrame, concat
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import PCA
 from sklearn.cluster import KMeans
 from matplotlib.pyplot import imshow, plot, show, figure, title, yticks


 # Download this file from https://gist.github.com/3809261
 filename = os.path.join('pyconindia2012.json')

 ###############################################################################
 # Decoding the json file, getting the data in a pandas dataframe and saving it as
 # an xls file.
 ###############################################################################

 # The input holds one JSON-encoded tweet per line.  Read it inside a ``with``
 # block so the handle is closed as soon as parsing is done instead of staying
 # open for the rest of the script, and skip blank lines, which json.loads()
 # would reject.
 with open(filename, 'r') as f:
     tweets = [json.loads(line) for line in f if line.strip()]

 # Pull the four fields of interest out of every decoded tweet, one list per
 # field, all in the same order as `tweets`.
 tweeters = [tweet['from_user_name'] for tweet in tweets]
 texts = [tweet['text'] for tweet in tweets]
 timestamps = [tweet['created_at'] for tweet in tweets]
 metadata = [tweet['metadata']['result_type'] for tweet in tweets]

 tweet_dict = {
     'tweeters': tweeters,
     'texts': texts,
     'timestamps': timestamps,
     'metadata': metadata,
 }

 # One row per tweet, one column per field; also written out as a spreadsheet.
 df = DataFrame(tweet_dict)
 df.to_excel('pycontweets_pandas.xls')



 ###############################################################################
 # Processing the text in tweets to remove redundancies
 ###############################################################################

 # These fragments are unwanted in the words of a tweet: any word containing
 # one of them is dropped.
 # NOTE(review): the test is a substring match, so e.g. 'on' also drops
 # 'python' and 'in' drops 'india' -- confirm whether whole-word matching was
 # intended for the stop words.
 unchars = [
     '@', '#', 'http', 'rt', 'RT', 'and', 'at', 'by', 'for', 'in', 'is',
     'of', 'on', 'the', 'two', 'with', 'to', 'was', 'day', 'will', 'it', 'who',
     'had',
 ]

 proc_text = []

 for text in texts:
     # Split on single spaces only, exactly as the raw text is delimited.
     kept = [
         word for word in text.split(' ')
         if not any(unchar in word for unchar in unchars)
     ]
     # Every surviving word is followed by one space (including the last).
     proc_text.append(''.join(word + ' ' for word in kept))

 # Replace the raw tweet text in the frame with the filtered version.
 df['texts'] = proc_text



 ###############################################################################
 # Tokenizing and analyzing the text in tweets
 ###############################################################################

 # Fit a TF-IDF model on the filtered tweets and add up each vocabulary
 # column over all tweets to get one total weight per term.
 vectorizer = TfidfVectorizer()
 text_vectorized = vectorizer.fit_transform(proc_text)
 tv_sum = text_vectorized.toarray().sum(axis=0)

 # Show the per-term totals so a threshold can be chosen by eye.
 plot(tv_sum)
 show()

 # Ask for the cut-off on a term's total TF-IDF weight.  float() keeps the
 # comparison below working on Python 3, where input() returns a string.
 thresh = float(input('Enter Threshold:\n'))

 inds = tv_sum > thresh

 # vocabulary_ maps term -> column index; invert it so a term can be looked up
 # from its column.
 index_to_term = dict(
     (index, term) for term, index in vectorizer.vocabulary_.items()
 )

 # Keep the columns above the threshold whose term has at least three
 # characters.  Both lists are built in ascending column order so that
 # keywords[i] is the label for keyword_inds[i] in the yticks() call below.
 # Building them in one pass also avoids removing items from a list while
 # iterating over it, which silently skips entries.
 keyword_inds = [
     i for i, above in enumerate(inds)
     if above and len(index_to_term[i]) >= 3
 ]
 keywords = [index_to_term[i] for i in keyword_inds]


 # Terms on the y axis, tweets on the x axis; only the keywords get a tick.
 imshow(text_vectorized.toarray().T, aspect='auto')
 yticks(keyword_inds, tuple(keywords), rotation=0)
 title('Image Plot of Words in tweets')
 show()


 # Making a PCA plot: reduce the dense TF-IDF matrix to two whitened
 # principal components and scatter the tweets in that plane.

 pca = PCA(n_components=2, whiten=True)
 pc_red = pca.fit_transform(text_vectorized.toarray())

 figure()
 plot(pc_red[:, 0], pc_red[:, 1], 'ro')  # first vs. second component
 title('PCA Plot')
 show()

 # Performing K-means clustering on the two PCA components.  The cluster
 # count typed by the user is converted to int and passed to KMeans; it used
 # to be read and then ignored in favour of a hard-coded 2.
 k = int(input('Input cluster numbers:\n'))
 km = KMeans(n_clusters=k)
 km.fit(pc_red)

 figure()
 # Cluster centres as large red crosses, then the tweets as blue dots.
 plot(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 'r+', markersize=20)
 plot(pc_red[:, 0], pc_red[:, 1], 'bo')
 show()
	#!/usr/bin/env python

	import os
	import json
	import numpy as np
	from pandas import DataFrame, concat
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.decomposition import PCA
	from sklearn.cluster import KMeans
	from matplotlib.pyplot import imshow, plot, show, figure, title, yticks


	# Download this file from https://gist.github.com/3809261
	filename = os.path.join('pyconindia2012.json')

	###############################################################################
	# Decoding the json file, getting the data in a pandas dataframe and saving it as
	# an xls file.
	###############################################################################

	# The input holds one JSON-encoded tweet per line.  Read it inside a ``with``
	# block so the handle is closed as soon as parsing is done instead of staying
	# open for the rest of the script, and skip blank lines, which json.loads()
	# would reject.
	with open(filename, 'r') as f:
	    tweets = [json.loads(line) for line in f if line.strip()]

	# Pull the four fields of interest out of every decoded tweet, one list per
	# field, all in the same order as `tweets`.  Comprehensions replace the
	# loop whose body had lost its indentation.
	tweeters = [tweet['from_user_name'] for tweet in tweets]
	texts = [tweet['text'] for tweet in tweets]
	timestamps = [tweet['created_at'] for tweet in tweets]
	metadata = [tweet['metadata']['result_type'] for tweet in tweets]

	tweet_dict = {
	    'tweeters': tweeters,
	    'texts': texts,
	    'timestamps': timestamps,
	    'metadata': metadata,
	}

	# One row per tweet, one column per field; also written out as a spreadsheet.
	df = DataFrame(tweet_dict)
	df.to_excel('pycontweets_pandas.xls')



	###############################################################################
	# Processing the text in tweets to remove redundancies
	###############################################################################

	# These fragments are unwanted in the words of a tweet: any word containing
	# one of them is dropped.
	# NOTE(review): the test is a substring match, so e.g. 'on' also drops
	# 'python' and 'in' drops 'india' -- confirm whether whole-word matching was
	# intended for the stop words.
	unchars = [
	    '@', '#', 'http', 'rt', 'RT', 'and', 'at', 'by', 'for', 'in', 'is',
	    'of', 'on', 'the', 'two', 'with', 'to', 'was', 'day', 'will', 'it', 'who',
	    'had',
	]

	proc_text = []

	for text in texts:
	    # Split on single spaces only, exactly as the raw text is delimited.
	    kept = [
	        word for word in text.split(' ')
	        if not any(unchar in word for unchar in unchars)
	    ]
	    # Every surviving word is followed by one space (including the last).
	    proc_text.append(''.join(word + ' ' for word in kept))

	# Replace the raw tweet text in the frame with the filtered version.
	df['texts'] = proc_text



	###############################################################################
	# Tokenizing and analyzing the text in tweets
	###############################################################################

	# Fit a TF-IDF model on the filtered tweets and add up each vocabulary
	# column over all tweets to get one total weight per term.
	vectorizer = TfidfVectorizer()
	text_vectorized = vectorizer.fit_transform(proc_text)
	tv_sum = text_vectorized.toarray().sum(axis=0)

	# Show the per-term totals so a threshold can be chosen by eye.
	plot(tv_sum)
	show()

	# Ask for the cut-off on a term's total TF-IDF weight.  float() keeps the
	# comparison below working on Python 3, where input() returns a string.
	thresh = float(input('Enter Threshold:\n'))

	inds = tv_sum > thresh

	# vocabulary_ maps term -> column index; invert it so a term can be looked up
	# from its column.
	index_to_term = dict(
	    (index, term) for term, index in vectorizer.vocabulary_.items()
	)

	# Keep the columns above the threshold whose term has at least three
	# characters.  Both lists are built in ascending column order so that
	# keywords[i] is the label for keyword_inds[i] in the yticks() call below.
	# Building them in one pass also avoids removing items from a list while
	# iterating over it, which silently skips entries.
	keyword_inds = [
	    i for i, above in enumerate(inds)
	    if above and len(index_to_term[i]) >= 3
	]
	keywords = [index_to_term[i] for i in keyword_inds]


	# Terms on the y axis, tweets on the x axis; only the keywords get a tick.
	imshow(text_vectorized.toarray().T, aspect='auto')
	yticks(keyword_inds, tuple(keywords), rotation=0)
	title('Image Plot of Words in tweets')
	show()


	# Making a PCA plot: reduce the dense TF-IDF matrix to two whitened
	# principal components and scatter the tweets in that plane.

	pca = PCA(n_components=2, whiten=True)
	pc_red = pca.fit_transform(text_vectorized.toarray())

	figure()
	plot(pc_red[:, 0], pc_red[:, 1], 'ro')  # first vs. second component
	title('PCA Plot')
	show()

	# Performing K-means clustering on the two PCA components.  The cluster
	# count typed by the user is converted to int and passed to KMeans; it used
	# to be read and then ignored in favour of a hard-coded 2.
	k = int(input('Input cluster numbers:\n'))
	km = KMeans(n_clusters=k)
	km.fit(pc_red)

	figure()
	# Cluster centres as large red crosses, then the tweets as blue dots.
	plot(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 'r+', markersize=20)
	plot(pc_red[:, 0], pc_red[:, 1], 'bo')
	show()