# Created June 21, 2011 04:44
# Twitter LSI
import json
import logging
import sys
import urllib
import urllib2
from collections import Counter

from gensim import corpora, models, similarities
# Sample corpus: the first 20 Twitter search results for "python".
# One unicode string per tweet, in the order the results were returned.
tweets = [
    u'Long Integer Objects \u2014 Python v2.7.2 documentation',
    u'Yesssssss..... RT @TheRealPython: Dear #Teambigdick #teamtop #teamverse #teambottom #teamwetpussy #MANDINGO Python is BACK',
    u'RT @chavezbeatthose: Pull out the python && hit it while my wifes gone',
    u'pyhp\u3082\u304d\u306b\u306a\u308b\u3002 "Python Hypertext Processr" #Europython',
    u'RT @TheRealPython: Dear #Teambigdick #teamtop #teamverse #teambottom #teamwetpussy #MANDINGO Python is BACK',
    u'RT @TheRealPython: Dear #Teambigdick #teamtop #teamverse #teambottom #teamwetpussy #MANDINGO Python is BACK',
    u'RT @mtlpy: Montr\xe9al-Python 22: Gabriel Tremblay on Practical device firmware reverse engineering using Python',
    u'Dear #Teambigdick #teamtop #teamverse #teambottom #teamwetpussy #MANDINGO Python is BACK',
    u"@mccrmx I used bzr too and liked it, but I have to admit you'll do more googling on obscure python errors. bzr is good, but so is git",
    u'RT @hugofortier: Reverse Engineering and Security this month @ Python Montreal Meeting',
    u'Reverse Engineering and Security this month @ Python Montreal Meeting',
    u"@mitsuhiko hrm, good to know you didn't touch on anything new in python 3. all sorts of intriguing goodies in there.",
    u"@HushedxAngel I KNOW!!! our science teacher has the basic license. OH! we have a diamond python now as well!! It's forever expanding",
    u'Python from Scratch: Variables, Data Types and Control Structure ...',
    u'Vibe Monty Python.',
    u'The #PSF have given a community service award to @tarek_ziade. Hard to imagine a more deserving recipient; Python packaging FTW!',
    u'RT @developerworks: Writing clean, testable, high quality #code in Python - Elegant and powerful #Python > #Linux #UNIX #programming',
    u'RT @pypi: PluginIndexes.DateDateIndex .1: Date Index for ZCatalog that ignores time[zones]',
    u"I probably should spend a bit more time actually learning python before I try doing shit with it but I'm content with what I've learned",
    u'Hope for Fatherhood: Surgical Technique Mines for Hidden Sperm',
]
# Stop words to discard, plus the set of words that occur only once in the
# whole collection (those are discarded as well).
stoplist = set("for a of the and to in you're should have will do it's this i your is what".split())
# Tokenise each tweet exactly once: lower-case it and split on whitespace.
tokenized_tweets = [tweet.lower().split() for tweet in tweets]
# Flat list of every token in the collection. Frequencies must be counted
# over whole words; joining the tweets into one string and counting on that
# would count individual characters instead.
allTokens = [word for tokens in tokenized_tweets for word in tokens]
word_counts = Counter(allTokens)
tokensOnce = set(word for word, count in word_counts.items() if count == 1)
# Keep only the tokens that are neither stop words nor one-off words.
# A tweet made up solely of filtered words becomes an empty list.
split_tweets = [[word for word in tokens
                 if word not in stoplist and word not in tokensOnce]
                for tokens in tokenized_tweets]
# Map every word in the tokenised tweets to an integer ID and record word
# frequencies.
dictionary = corpora.Dictionary(split_tweets)
# Convert each tokenised tweet to a term vector via the dictionary.
corpus = [dictionary.doc2bow(st) for st in split_tweets]
# Initialise the TF-IDF model from the term vectors ...
tfidf = models.TfidfModel(corpus)
# ... and use it to transform the term vectors into TF-IDF vectors.
corpus_tfidf = tfidf[corpus]
# Build a 2-topic LSI model on top of the TF-IDF vectors and project every
# tweet into that LSI space.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
lsi_vecs = lsi[corpus_tfidf]
# Print one LSI vector per tweet. A single-argument print(...) produces the
# same output under Python 2 and Python 3.
for vec in lsi_vecs:
    print(vec)
