# Created June 17, 2011 02:54
# Applying LSI to Twitter search
import json
import urllib
import urllib2
from gensim import corpora, models, similarities
import logging
import sys
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure
# Root logger at INFO: DEBUG-level events are suppressed.
logging.getLogger().setLevel(logging.INFO)
def main():
    """Command-line entry point: NUM_TWEETS QUERY NUM_LSI_TOPICS.

    Reads the three positional arguments from sys.argv, converts the topic
    count to int, and hands everything to lsi_twitter. If fewer than three
    arguments are supplied, a usage line is written to stderr and the
    process exits with status 2 instead of failing with an IndexError.

    Raises:
        ValueError: if NUM_LSI_TOPICS is not an integer literal.
    """
    if len(sys.argv) < 4:
        sys.stderr.write(
            'usage: %s NUM_TWEETS QUERY NUM_LSI_TOPICS\n' % sys.argv[0])
        sys.exit(2)

    # num_tweets stays a string: it is only used as a URL query parameter.
    num_tweets = sys.argv[1]
    query = sys.argv[2]
    num_lsi_topics = int(sys.argv[3])
    lsi_twitter(num_tweets, query, num_lsi_topics)
def lsi_twitter(num_tweets, query, num_lsi_topics):
    """Search for tweets and run the LSI report on whatever comes back.

    num_tweets and query are forwarded to twitter_search; the tweets it
    returns are passed, together with num_lsi_topics, to run_lsi.
    """
    run_lsi(twitter_search(num_tweets, query), num_lsi_topics)
def twitter_search(num_tweets, query):
    """Query the search endpoint and return the text of each result.

    Args:
        num_tweets: value sent as the 'rpp' query parameter (presumably
            "results per page" -- confirm against the API in use).
        query: search string sent as the 'q' parameter.

    Returns:
        A list with the 'text' field of every entry under the 'results'
        key of the JSON response.

    Raises:
        Whatever urllib2.urlopen / json.loads raise on network or parse
        errors, and KeyError if the response lacks 'results' or 'text'.
    """
    # NOTE(review): the endpoint is blank in this file, so urlopen cannot
    # succeed as written -- fill in the search URL before use.
    trends_url = ''
    params = urllib.urlencode({
        'rpp': num_tweets,
        'q': query,
        'lang': 'en',
    })
    url = '%s?%s' % (trends_url, params)

    response = urllib2.urlopen(url, timeout=10)
    try:
        content = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()

    data = json.loads(content)
    return [r['text'] for r in data['results']]
def run_lsi(tweets, num_lsi_topics):
    """Fit TF-IDF + LSI on the tweets and print them grouped by topic.

    Each tweet is lowercased and split on whitespace; stopwords and words
    that occur only once across all tweets are dropped. The remaining
    tokens feed a gensim Dictionary, a TfidfModel and an LsiModel with
    num_lsi_topics topics. Every tweet is then assigned to the topic whose
    weight has the largest absolute value and printed under that topic
    together with its LSI vector.

    Args:
        tweets: list of tweet strings.
        num_lsi_topics: number of LSI topics to fit.
    """
    from collections import Counter

    # stoplist and words that occur once
    stoplist = set("for a of the and to in you're should have will do it's this i your is what".split())

    # Tokenize once, then count whole words across the corpus. (Joining the
    # tweets into one string and iterating it would count characters.)
    tokenized = [tweet.lower().split() for tweet in tweets]
    token_counts = Counter(word for tokens in tokenized for word in tokens)

    # filter out stopwords and words seen only once
    split_tweets = [[word for word in tokens
                     if word not in stoplist and token_counts[word] > 1]
                    for tokens in tokenized]

    # map words in corpus to integer IDs, record word frequency
    dictionary = corpora.Dictionary(split_tweets)
    # NOTE(review): the original line here was garbled; it appears to have
    # saved the dictionary for future reference. Re-enable if wanted:
    # dictionary.save('twitter.dict')

    # convert texts to term vectors via dictionary
    corpus = [dictionary.doc2bow(st) for st in split_tweets]
    #corpora.MmCorpus.serialize('', corpus) # store to disk, for later use

    # initialize model
    tfidf = models.TfidfModel(corpus)
    # use model to transform term vectors to tfidf vectors
    corpus_tfidf = tfidf[corpus]

    # LSI model with the requested number of topics
    # NOTE(review): newer gensim releases spell this keyword `num_topics`
    # -- confirm against the installed version.
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, numTopics=num_lsi_topics)
    # Materialize once; the vectors are iterated for grouping and plotting.
    lsi_vecs = list(lsi[corpus_tfidf])

    # One independent bucket per topic ([[]] * n would alias a single list).
    vecs_by_topic = [[] for _ in range(num_lsi_topics)]

    # assign each tweet to the topic with the largest absolute weight
    for tweet, vec in zip(tweets, lsi_vecs):
        if not vec:
            # Nothing survived filtering for this tweet; no topic to pick.
            continue
        max_topic = max(vec, key=lambda pair: abs(pair[1]))[0]
        vecs_by_topic[max_topic].append((tweet, vec))

    # display each tweet and associated lsi topic weights
    for topic, results in enumerate(vecs_by_topic):
        print('\nTopic %i:\n' % (topic))
        for result in results:
            print(result[0])
            print(result[1])
            print('')

    if num_lsi_topics == 2:
        fig = Figure(figsize=(10.0, 10.0))
        canvas = FigureCanvas(fig)
        ax = fig.add_subplot(111)
        # Look weights up by topic id so a vector missing an entry plots
        # as 0.0 instead of raising IndexError.
        weights = [dict(vec) for vec in lsi_vecs]
        xs = [w.get(0, 0.0) for w in weights]
        ys = [w.get(1, 0.0) for w in weights]
        ax.scatter(xs, ys)
        ax.set_title('Twitter LSI')
        ax.set_xlabel('topic 0')
        ax.set_ylabel('topic 1')
        # NOTE(review): nothing visible here renders or saves the figure;
        # a canvas.print_figure(...) call may have been lost -- confirm.
#not using this now
def print_tfidf(tweets, corpus_tfidf, dictionary):
    """Print every tweet followed by its terms and TF-IDF weights.

    For tweet i, prints a 'Result i: ...' header, then one 'token: weight'
    line per (term_id, weight) pair of the i-th TF-IDF vector in ascending
    weight order, then a blank line.

    Args:
        tweets: list of tweet strings.
        corpus_tfidf: iterable of TF-IDF vectors (sequences of
            (term_id, weight) pairs), one per tweet, in tweet order.
        dictionary: object whose token2id mapping goes from token to id.

    Raises:
        KeyError: if a term id is absent from dictionary.token2id.
        IndexError: if corpus_tfidf has fewer vectors than tweets.
    """
    # Build the id -> token lookup once instead of scanning per term; the
    # first token seen for an id wins.
    id_to_token = {}
    for token, term_id in dictionary.token2id.items():
        id_to_token.setdefault(term_id, token)

    # materialize the transformed corpus so it can be indexed
    tfidf_list = list(corpus_tfidf)

    for i, tweet in enumerate(tweets):
        print('Result %i: %s' % (i, tweet))
        # sorted() leaves the caller's vector untouched.
        ranked = sorted(tfidf_list[i], key=lambda term: term[1])
        for term_id, weight in ranked:
            print('%s: %f' % (id_to_token[term_id], weight))
        print('')
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()