Applying LSI to Twitter search
import json
import urllib
import urllib2
from gensim import corpora, models, similarities
import logging
import sys
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure

logging.root.setLevel(logging.INFO) # will suppress DEBUG level events

def main():
    num_tweets = int(sys.argv[1])
    query = sys.argv[2]
    num_lsi_topics = int(sys.argv[3])
    threshold = float(sys.argv[4])
    lsi_twitter(num_tweets, query, num_lsi_topics, threshold)

def lsi_twitter(num_tweets, query, num_lsi_topics, threshold):
    tweets = twitter_search(num_tweets, query)
    run_lsi(tweets, num_lsi_topics, threshold)

def twitter_search(num_tweets, query):
    # Note: this is the old, unauthenticated Twitter Search API, which
    # Twitter retired in 2013; requests to it will no longer succeed.
    trends_url = 'http://search.twitter.com/search.json'
    params = urllib.urlencode({
        'rpp': num_tweets,
        'q': query,
        'lang': 'en',
    })
    url = '%s?%s' % (trends_url, params)
    response = urllib2.urlopen(url, timeout=10)
    content = response.read()
    data = json.loads(content)
    tweets = [r['text'] for r in data['results']]
    return tweets
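
# For reference (inferred from the parsing above, not from live output), the
# legacy Search API returned JSON shaped roughly like
#   {"results": [{"text": "...tweet text..."}, ...]}
# which is why twitter_search() reads data['results'] and r['text'].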

def run_lsi(tweets, num_lsi_topics, threshold):
    # stopwords to drop, plus (below) words that occur only once in the corpus
    stoplist = set("for a of the and to in you're should have will do it's this i your is what".split())
    # collect every token so we can find words that appear exactly once;
    # joining the raw strings would count characters, so split into words first
    all_tokens = [word for tweet in tweets for word in tweet.lower().split()]
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    # split each text on whitespace, filtering out stopwords and singletons
    split_tweets = [[word for word in tweet.lower().split()
                     if word not in stoplist and word not in tokens_once]
                    for tweet in tweets]
    # map words in the corpus to integer IDs, record word frequency
    dictionary = corpora.Dictionary(split_tweets)
    #dictionary.save('twitter.dict') # store the dictionary, for future reference
    # convert texts to sparse term vectors via the dictionary
    corpus = [dictionary.doc2bow(st) for st in split_tweets]
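    # Illustrative sketch (token IDs depend on the fitted dictionary):
    #   dictionary.doc2bow(['lsi', 'rocks', 'lsi'])  ->  [(0, 2), (1, 1)]
    # i.e. each tweet becomes a sparse list of (token_id, count) pairs,
    # and tokens absent from the dictionary are silently dropped.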
    #corpora.MmCorpus.serialize('twitter.mm', corpus) # store to disk, for later use
    # initialize the tf-idf model from the bag-of-words corpus
    tfidf = models.TfidfModel(corpus)
    # use the model to transform term-count vectors into tf-idf vectors
    corpus_tfidf = tfidf[corpus]
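    # A transformed document is again sparse, now weighted: roughly
    #   [(token_id, tfidf_weight), ...], with rarer words weighted higher.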
    # LSI model projecting the tf-idf vectors onto num_lsi_topics topic axes
    # (newer gensim releases spell the keyword num_topics rather than numTopics)
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, numTopics=num_lsi_topics)
    lsi_vecs = lsi[corpus_tfidf]
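    # Each LSI vector is a list of (topic_id, projection) pairs, e.g. with two
    # topics something like [(0, 0.41), (1, -0.12)] (values here are made up).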
    vecs_by_topic = [[] for i in range(num_lsi_topics)]
    # group vecs by max projection along a topic axis,
    # discarding the ones near zero
    for i, vec in enumerate(lsi_vecs):
        if not vec: # a tweet whose words were all filtered out yields an empty vector
            continue
        (max_topic, max_weight) = max(vec, key=lambda (topic, weight): abs(weight))
        if abs(max_weight) > threshold:
            vecs_by_topic[max_topic].append((tweets[i], vec, max_weight))
    for topic, results in enumerate(vecs_by_topic):
        print '\nTopic %i:\n' % (topic)
        results.sort(key=lambda (tweet, vec, max_weight): max_weight)
        for result in results:
            print result[0]
            print result[1]
            print ''
    # scatter plot of the documents in topic space (2-D case only)
    if num_lsi_topics == 2:
        fig = Figure(figsize=(6.0, 6.0))
        canvas = FigureCanvas(fig)
        ax = fig.add_subplot(111)
        # dict(vec).get(...) tolerates vectors that omit a topic coordinate
        xs = [dict(vec).get(0, 0.0) for vec in lsi_vecs]
        ys = [dict(vec).get(1, 0.0) for vec in lsi_vecs]
        ax.scatter(xs, ys)
        ax.set_title('Twitter LSI')
        ax.grid(True)
        ax.set_xlabel('topic 0')
        ax.set_ylabel('topic 1')
        canvas.print_figure('twitter_lsi') # writes twitter_lsi.png with the Agg backend

def print_tfidf(tweets, corpus_tfidf, dictionary):
    # convert the TfidfModel's streamed output into a list of sparse vectors
    tfidf_list = [doc for doc in corpus_tfidf]
    # get the (token, id) pairs from the dictionary to look up terms by id
    dict_items = dictionary.token2id.items()
    # find the first token in the dictionary whose id is term_id
    def id2token(term_id):
        return filter(lambda (k, v): v == term_id, dict_items)[0][0]
    for i in range(len(tweets)):
        print 'Result %i: %s' % (i, tweets[i])
        tfidf_vec = tfidf_list[i]
        # sort descending so the highest-weighted terms come first
        tfidf_vec.sort(key=lambda term: term[1], reverse=True)
        #top_terms = tfidf_vec[:5]
        for term in tfidf_vec: #top_terms:
            print '%s: %f' % (id2token(term[0]), term[1])
        print ''

if __name__ == "__main__":
    main()
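
For reference, a plausible invocation, assuming the script is saved as lsi_twitter.py (a hypothetical filename): the arguments are the number of tweets to fetch, the search query, the number of LSI topics, and the projection threshold. Since the search endpoint has been retired, this is illustrative only:

    python lsi_twitter.py 50 nasa 2 0.1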
I tried it, but why did it fail?