Created December 11, 2011 20:15
Histogram and keyword detection for tweets during Real Madrid - Barcelona match
# more info at http://webmining.olariu.org/el-clasico-on-twitter
# this code is designed to be run in ipython
import urllib, urllib2, time, threading, Queue, re
from datetime import datetime
import simplejson as json
import matplotlib.pyplot as plt
import numpy as np

KEYWORDS = ['realmadrid', 'real madrid', 'fcbarcelona', 'barcelona',
            'forcabarca', 'el clasico', 'elclasico']
def fetch_url(url, get=None, post=None):
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception as e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response
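
# Sketch of how fetch_url gets used (hypothetical endpoint, shown only to
# illustrate the calling convention; the real calls are below):
# data = fetch_url('http://example.com/api', get={'q': 'clasico'})
# # requests http://example.com/api?q=clasico and returns the parsed JSON,
# # or None on any network or JSON error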
def fetch_tweets():
    url = 'http://search.twitter.com/search.json'
    values = {
        'count': 100,
        'q': ' OR '.join(KEYWORDS) + ' -rt',
        'rpp': 100,
        'page': 1,
        'result_type': 'recent',
        'with_twitter_user_id': 'true',
        'lang': 'en',
    }
    response = fetch_url(url, values)
    if response and 'results' in response:
        return response['results']
    else:
        return []
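
# Each result is a dict of tweet fields; the only ones used below are 'id',
# 'text' and 'created_at', the last formatted like
# "Sat, 10 Dec 2011 21:03:17 +0000" (see the strptime call further down)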
def monitor_twitter():
    # 'keep_monitoring', 'queue' and 'interval' are globals defined below
    start_at = time.time()
    id_cache = set([])
    while keep_monitoring:
        batch = fetch_tweets()
        # keep only tweets not seen in a previous poll
        batch = [t for t in batch if t['id'] not in id_cache]
        print len(batch)
        id_cache.update([t['id'] for t in batch])
        for t in batch:
            queue.put(t)
        time.sleep(interval)
# Start monitoring Twitter
# Use a thread and a queue to save tweets - easier to work with in ipython
queue = Queue.Queue()
keep_monitoring = True
interval = 30
threading.Thread(target=monitor_twitter).start()
# Go watch the match and check every once in a while to see if I should
# adjust the interval

# Stop monitoring
keep_monitoring = False

# Get the tweets into a list
tweets = []
while queue.qsize():
    tweets.append(queue.get())
# Index tweets by minute relative to the start of the match
minutes = {}
start_of_match = datetime(2011, 12, 10, 21, 00, 00)
for t in tweets:
    t_time = datetime.strptime(t['created_at'], "%a, %d %b %Y %H:%M:%S +0000")
    minute = int((t_time - start_of_match).total_seconds() / 60)
    if minute not in minutes:
        minutes[minute] = []
    minutes[minute].append(t['text'].lower())
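
# e.g. a tweet created at 21:07:30 is 450 seconds in, so int(450 / 60.0)
# puts it in minute 7; a tweet from 20:58:00 lands in minute -2, so
# pre-match chatter simply gets negative keys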
# Build the histogram for tweet volume
hist = []
# ... and the x-axis labels (minutes relative to start of match)
xaxis = range(min(minutes.keys()), max(minutes.keys()) + 1)
for i in xaxis:
    hist.append(len(minutes.get(i, [])))
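
# hist and xaxis stay aligned: hist[k] is the tweet count for minute
# xaxis[k], with minutes that saw no tweets counted as zero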
# Build a dictionary {word: frequency} over all tweets,
# ignoring words of at most 3 characters
words = {}
for texts in minutes.itervalues():
    for text in texts:
        for w in re.findall(r'\w+', text):
            if len(w) > 3:
                words[w] = words.get(w, 0) + 1
# Convert it into a dictionary {word: word_index} and discard rare words
i = 0
temp = {}
for w, f in words.iteritems():
    if f > 25:
        temp[w] = i
        i += 1
words = temp
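
# The two steps above could also be written with collections.Counter
# (a sketch under the same tokenization rules, not what the gist ran):
# from collections import Counter
# freq = Counter(w for texts in minutes.itervalues()
#                  for text in texts
#                  for w in re.findall(r'\w+', text) if len(w) > 3)
# words = dict((w, i) for i, w in
#              enumerate(w for w, f in freq.iteritems() if f > 25))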
# Build a matrix where every element [i,j] is the number of times
# the word having word_index=j appears during the time interval
# having index i
# Every time interval is 5 minutes long
counts = [[0.0] * len(words) for i in xrange(len(xaxis[5::5]))]
for start in xaxis[5::5]:
    index = xaxis.index(start) / 5 - 1
    for i in range(start - 5, start):
        for text in minutes.get(i, []):
            for w in re.findall(r'\w+', text):
                if w in words:
                    counts[index][words[w]] += 1
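
# The index arithmetic: xaxis[5::5] picks the minutes at positions 5, 10,
# 15, ... of xaxis, so xaxis.index(start) is always a multiple of 5 and
# xaxis.index(start) / 5 - 1 maps successive intervals to rows 0, 1, 2, ...
# Each row therefore aggregates the five minutes just before 'start'.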
# Convert the dictionary of {word: word_index} into a list of words,
# where words[word_index] = word
temp = [0] * len(words)
for w, i in words.iteritems():
    temp[i] = w
words = temp

# Convert the matrix of counts to numpy
counts = np.matrix(counts)
# .. in order to easily compute means and standard deviations
# for every set of word frequencies
means = [counts[:,i].mean() for i in xrange(counts.shape[1])]
stds = [counts[:,i].std() for i in xrange(counts.shape[1])]
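
# The same per-column statistics can come from numpy's axis argument
# (equivalent, just more compact; np.asarray keeps the results 1-D):
# means = np.asarray(counts).mean(axis=0)
# stds = np.asarray(counts).std(axis=0)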
# Find spikes in word frequencies
# Computes a score = number of standard deviations from mean (a z-score)
awesome = {}
max_score = 0
for i in xrange(counts.shape[0]):
    for j in xrange(counts.shape[1]):  # shape[1]: iterate over words, not intervals
        score = (counts[i,j] - means[j]) / stds[j]
        if score > 3.5:
            # x is the minute at which 5-minute interval i ends
            x = xaxis[(i + 1) * 5]
            max_score = max(max_score, score)
            if x not in awesome:
                awesome[x] = []
            awesome[x].append((words[j], score))
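
# e.g. a word averaging 2 mentions per interval with a std of 1.5 that
# suddenly gets 9 mentions scores (9 - 2) / 1.5 = 4.67 > 3.5 and is flagged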
# Create the chart
plt.figure()

# Plot the histogram
plt.fill_between(xaxis, hist, [0] * len(xaxis), facecolor='b', alpha=0.4)
plt.xlabel('Minutes from match start')
plt.ylabel('Tweets per minute')
plt.xticks([i for i in xaxis if i % 10 == 0])

# Shade the first and second half: 45 + 2 minutes of stoppage time,
# a 15-minute break, then another 45 + 5 minutes of stoppage time
plt.axvspan(0, 45 + 2, facecolor='0.5', alpha=0.2)
plt.axvspan(45 + 2 + 15, 90 + 15 + 5, facecolor='0.5', alpha=0.2)

# Plot the goal lines (match minutes shifted by stoppage time and
# the halftime break where needed)
plt.axvline(x=1, color='r')
plt.axvline(x=30, color='r')
plt.axvline(x=53 + 2 + 15, color='r')
plt.axvline(x=66 + 2 + 15, color='r')
# Plot the words, based on the time when they spiked
for x, x_words in awesome.iteritems():
    # start just above the histogram over the 5 minutes before x,
    # capped so the labels don't start too high
    y = max(hist[xaxis.index(x) - 5 : xaxis.index(x)])
    y = min(y, 50)
    for word, score in x_words:
        # fade lower-scoring words
        alpha = (score - 2) / (max_score - 2)
        plt.text(x - 8, y, word, alpha=alpha, rotation=45)
        y += 9
plt.show()