uberVU hackathon - Twitter Tagcloud for Oscar Best Movie Nominees
# more info here: http://webmining.olariu.org/the-story-of-the-oscar-predictions

import urllib, urllib2, re
import json
from time import time

# using this POS tagger:
# http://jasonwiener.com/2006/01/20/simple-nlp-part-of-speech-tagger-in-python/
import NLPlib

def fetch_url(url, get=None, post=None):
    '''
    fetch a url, optionally appending GET parameters,
    and parse the JSON response
    '''
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
    print url
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception, e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response
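
# minimal usage sketch (the query values here are placeholders, not
# part of the original script):
#   fetch_url('http://api.contextvoice.com/1.2/mentions/search/',
#             get={'q': 'Hugo', 'format': 'json', 'apikey': 'KEY'})
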
def get_tweets(values):
    '''
    do a series of api calls to ubervu's api to get all
    tweets matching the filtering options
    '''
    url = 'http://api.contextvoice.com/1.2/mentions/search/'
    data = []
    while True:
        response = fetch_url(url, values)
        if not response or response['total'] == 0:
            break
        data.extend(response['results'])
        # paginate backwards in time: the next call only fetches tweets
        # published strictly before the oldest one seen so far
        val = min([t['published'] for t in response['results']])
        values.update({
            'until': val - 1,
        })
    return data
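
# usage sketch: called below with the full `values` dict; returns a
# list of mention dicts, each carrying at least the 'published' (unix
# timestamp) and 'content' (tweet text) keys used further down
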
def tokenize(text):
    '''
    given a text, returns a list of words
    handles twitter specific tokens
    '''
    text = text.lower()
    # Remove email addresses
    text = re.sub(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}', '', text)
    # Remove twitter user names
    text = re.sub(r'(\A|\s)@(\w+)', r'\1', text)
    # Remove urls
    text = re.sub(r'\w+:\/\/\S+', r'', text)
    # Remove repeated (3+) letters: cooool --> cool, niiiiice --> niice
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2', text)
    # Do it again in case we have coooooooollllllll --> cooll
    text = re.sub(r'([a-zA-Z])\1\1+(\w*)', r'\1\1\2', text)
    words = re.findall(r'\w+', text)
    return words
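
# example of the rules above combining:
#   tokenize("Cooool movie!! @someuser http://t.co/xyz")
#   returns ['cool', 'movie']
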
# get tweets
today = int(time())
tweets = {}
values = {
    'since': today - 7 * 86400,
    'until': today,
    'generator': 'twitter',
    'format': 'json',
    'language': 'english',
    'apikey': 'you\'ll have to get your own',
    'count': 100
}
# the keywords tracked by ubervu
keywords = {
    'moneyball': "Moneyball movie OR oscar OR picture OR film",
    'hugo': "Hugo movie OR oscar OR picture OR film OR animation",
    'treelife': '"Tree of Life" movie OR oscar OR picture OR film',
    'midnight': '"Midnight in Paris" movie OR oscar OR picture OR film',
    'warhorse': '"War Horse" movie OR oscar OR picture OR film',
    'artist': '"The Artist" movie OR oscar OR picture OR film',
    'descendants': '"The Descendants" movie OR oscar OR picture OR film',
    'help': '"The help" movie OR oscar OR picture OR film',
    'loud': 'extremely loud incredibly close movie OR oscar OR picture OR film',
}

for movie, query in keywords.iteritems():
    values.update({
        'q': query,
        'until': today
    })
    tweets.update({movie: get_tweets(values)})
# get frequencies for words
freq = {}
for movie, tweet_list in tweets.iteritems():
    f = {}
    for tweet in tweet_list:
        for word in tokenize(tweet['content']):
            f[word] = f.get(word, 0) + 1
    freq[movie] = f
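
# an equivalent, more compact alternative (a sketch using
# collections.Counter from the standard library, Python 2.7+;
# not used below):
#   from collections import Counter
#   for movie, tweet_list in tweets.iteritems():
#       freq[movie] = Counter(w for t in tweet_list
#                             for w in tokenize(t['content']))
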
# build probabilities P(word|movie)
# dividing by the number of tweets acts as normalisation
prob = {}
min_frequency = 8
for movie, frequencies in freq.iteritems():
    for word, frequency in frequencies.iteritems():
        if frequency >= min_frequency:
            if word not in prob:
                prob[word] = {}
            prob[word][movie] = frequency * 100.0 / len(tweets[movie])
# invert probabilities
# from P(word|movie) build P(movie|word) using Bayes' theorem
# keep only words where this probability is over 55%
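# (the step below, spelled out: Bayes gives
#  P(movie|word) = P(word|movie) * P(movie) / P(word); assuming a
#  uniform prior P(movie) over the nine nominees, this reduces to
#  P(word|movie) / sum_m P(word|m), which is exactly the ratio d
#  computed below, expressed as a percentage)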
top_words = []
min_probability = 55
for word, f in prob.iteritems():
    s = 0
    maxmovie = ''
    maxp = 0
    for movie, p in f.iteritems():
        s += p
        if maxp < p:
            maxp = p
            maxmovie = movie
    d = maxp * 100 / s
    if d > min_probability:
        top_words.append((word, maxmovie, d))
top_words.sort(key=lambda x: -x[2])
# at this point I noticed a lot of noise, so I decided to keep only
# adjectives ('JJ' is the Penn Treebank tag for adjectives)
tagger = NLPlib.NLPlib()
adjectives = []
for e in top_words:
    if tagger.tag([e[0]]) == ['JJ']:
        adjectives.append(e)
# generate data to use in wordle.net
data = {}
wordle_threshold = 50
for e in adjectives:
    if e[1] not in data:
        data[e[1]] = ''
    data[e[1]] += '%s:%s\n' % (e[0], int(e[2] - wordle_threshold))
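
# a minimal sketch of how the result might be dumped (the original
# script stops after building `data`; each movie's string already
# holds one "word:weight" pair per line, the format wordle.net's
# advanced input expects):
for movie, wordle_input in data.iteritems():
    print '--- %s ---' % movie
    print wordle_input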