Created
June 21, 2011 04:44
-
-
Save cdfox/1037261 to your computer and use it in GitHub Desktop.
Twitter LSI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import logging
import sys
import urllib
import urllib2
from collections import Counter

from gensim import corpora, models, similarities
# First 20 Twitter search results for "python"
tweets = [
    u'Long Integer Objects \u2014 Python v2.7.2 documentation http://bit.ly/jEzUVi',
    u'Yesssssss..... RT @TheRealPython: Dear #Teambigdick #teamtop #teamverse #teambottom #teamwetpussy #MANDINGO Python is BACK',
    u'RT @chavezbeatthose: Pull out the python && hit it while my wifes gone',
    u'pyhp\u3082\u304d\u306b\u306a\u308b\u3002 "Python Hypertext Processr" http://www.pyhp.org/ #Europython',
    u'RT @TheRealPython: Dear #Teambigdick #teamtop #teamverse #teambottom #teamwetpussy #MANDINGO Python is BACK',
    u'RT @TheRealPython: Dear #Teambigdick #teamtop #teamverse #teambottom #teamwetpussy #MANDINGO Python is BACK',
    u'RT @mtlpy: Montr\xe9al-Python 22: Gabriel Tremblay on Practical device firmware reverse engineering using Python',
    u'Dear #Teambigdick #teamtop #teamverse #teambottom #teamwetpussy #MANDINGO Python is BACK',
    u"@mccrmx I used bzr too and liked it, but I have to admit you'll do more googling on obscure python errors. bzr is good, but so is git",
    u'RT @hugofortier: Reverse Engineering and Security this month @ Python Montreal Meeting http://t.co/fQdMyFn',
    u'Reverse Engineering and Security this month @ Python Montreal Meeting http://t.co/fQdMyFn',
    u"@mitsuhiko hrm, good to know you didn't touch on anything new in python 3. all sorts of intriguing goodies in there.",
    u"@HushedxAngel I KNOW!!! our science teacher has the basic license. OH! we have a diamond python now as well!! It's forever expanding",
    u'Python from Scratch: Variables, Data Types and Control Structure ... http://t.co/0i5XRjI',
    u'Vibe Monty Python.',
    u'The #PSF have given a community service award to @tarek_ziade. Hard to imagine a more deserving recipient; Python packaging FTW!',
    u'RT @developerworks: Writing clean, testable, high quality #code in Python - Elegant and powerful #Python > http://su.pr/9Qdm5g #Linux #UNIX #programming',
    u'RT @pypi: PluginIndexes.DateDateIndex .1: Date Index for ZCatalog that ignores time[zones] http://bit.ly/myZAZM',
    u"I probably should spend a bit more time actually learning python before I try doing shit with it but I'm content with what I've learned",
    u'Hope for Fatherhood: Surgical Technique Mines for Hidden Sperm http://t.co/txTLVje',
]

# Stopwords: common words that are dropped from every tweet.
stoplist = set("for a of the and to in you're should have will do it's this i your is what".split())

# Lower-case each tweet, split it on whitespace and drop stopwords. This is
# done once, up front, so that the frequency count below sees exactly the same
# tokens that end up in split_tweets (same case, same splitting).
filtered_tweets = [[word for word in tweet.lower().split()
                    if word not in stoplist]
                   for tweet in tweets]

# Flat list of every remaining token across all tweets.
# NOTE: this must be a list of tokens, not the tweets joined into one string;
# iterating a string yields single characters, so a "words that occur once"
# filter built from it would never match a real word.
allTokens = [word for tokens in filtered_tweets for word in tokens]

# Tokens that occur exactly once in the whole collection.
token_counts = Counter(allTokens)
tokensOnce = set(word for word, count in token_counts.items() if count == 1)

# Final token lists, one per tweet, with the once-only tokens removed.
# A tweet made up entirely of stopwords and once-only tokens becomes [].
split_tweets = [[word for word in tokens if word not in tokensOnce]
                for tokens in filtered_tweets]
# Map each distinct token in the corpus to an integer ID and record word
# frequencies.
dictionary = corpora.Dictionary(split_tweets)

# Convert each tokenised tweet to a term vector via the dictionary.
corpus = [dictionary.doc2bow(st) for st in split_tweets]

# Initialise the TF-IDF model from the term vectors.
tfidf = models.TfidfModel(corpus)

# Use the model to transform the term vectors into TF-IDF vectors.
corpus_tfidf = tfidf[corpus]

# Fit a 2-topic LSI model on the TF-IDF vectors; id2word lets the model map
# term IDs back to the original tokens.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

# Project every tweet into the LSI space and print one vector per tweet.
lsi_vecs = lsi[corpus_tfidf]
for vec in lsi_vecs:
    # Call form rather than the print statement: with a single argument it
    # prints the same thing on Python 2 and is also valid Python 3 syntax.
    print(vec)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment