Histogram and keyword detection for tweets during Real Madrid - Barcelona match
# more info at http://webmining.olariu.org/el-clasico-on-twitter
# this code is designed to be run in ipython
import urllib, urllib2, time, threading, Queue, re
from datetime import datetime
import simplejson as json
import matplotlib.pyplot as plt
import numpy as np
KEYWORDS = ['realmadrid', 'real madrid', 'fcbarcelona', 'barcelona',
            'forcabarca', 'el clasico', 'elclasico']
def fetch_url(url, get=None, post=None):
    '''Fetch a URL (with optional GET params) and parse the response as JSON.'''
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception, e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response
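# A quick sanity check of the helper (illustrative call only):
#   fetch_url('http://search.twitter.com/search.json', {'q': 'elclasico'})
# returns the parsed JSON as a dict, or None if the request or parsing fails.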
def fetch_tweets():
    url = 'http://search.twitter.com/search.json'
    values = {
        'count': 100,
        'q': ' OR '.join(KEYWORDS) + ' -rt',  # any keyword, excluding retweets
        'rpp': 100,  # results per page
        'page': 1,
        'result_type': 'recent',
        'with_twitter_user_id': 'true',
        'lang': 'en',
    }
    response = fetch_url(url, values)
    if response and 'results' in response:
        return response['results']
    else:
        return []
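# Each result is a dict carrying (at least) the fields used further down:
# 'id', 'text' and 'created_at' in the form 'Sat, 10 Dec 2011 21:03:15 +0000'.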
def monitor_twitter():
    start_at = time.time()
    id_cache = set()
    while keep_monitoring:
        batch = fetch_tweets()
        # Keep only tweets not seen in a previous fetch
        batch = [t for t in batch if t['id'] not in id_cache]
        print len(batch)
        id_cache.update([t['id'] for t in batch])
        for t in batch:
            queue.put(t)
        time.sleep(interval)
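# keep_monitoring and interval are module globals read on every pass, so both
# can be tweaked live from the ipython prompt; setting keep_monitoring to
# False stops the thread after at most one more fetch-and-sleep cycle.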
# Start monitoring Twitter
# Use a thread and a queue to save tweets - easier to work with in ipython
queue = Queue.Queue()
keep_monitoring = True
interval = 30
threading.Thread(target=monitor_twitter).start()
# Go watch the match and check every once in a while to see if I should
# adjust the interval
# Stop monitoring
keep_monitoring = False
# Get the tweets into a list
tweets = []
while queue.qsize():
    tweets.append(queue.get())
# Index tweets by minute relative to the start of the match
minutes = {}
start_of_match = datetime(2011, 12, 10, 21, 00, 00)
for t in tweets:
    t_time = datetime.strptime(t['created_at'], "%a, %d %b %Y %H:%M:%S +0000")
    minute = int((t_time - start_of_match).total_seconds() / 60)
    if minute not in minutes:
        minutes[minute] = []
    minutes[minute].append(t['text'].lower())
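# Example: with created_at 'Sat, 10 Dec 2011 21:07:30 +0000' the tweet lands
# in minute 7; a tweet from 20:55:00 lands in minute -5 (pre-match chatter
# gets negative keys). All times are UTC, matching the '+0000' stamps.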
# Build the histogram for tweet volume
hist = []
# ... and the x-axis labels (minutes relative to start of match)
xaxis = range(min(minutes.keys()), max(minutes.keys()) + 1)
for i in xaxis:
    hist.append(len(minutes.get(i, [])))
# Build a dictionary {word: frequency} over all tweets
words = {}
for texts in minutes.itervalues():
    for text in texts:
        for w in re.findall(r'\w+', text):
            if len(w) > 3:  # skip very short words
                words[w] = words.get(w, 0) + 1
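# (An equivalent tally using only the standard library, if you prefer:
#   from collections import Counter
#   words = Counter(w for ts in minutes.itervalues() for text in ts
#                     for w in re.findall(r'\w+', text) if len(w) > 3)
# a plain dict is kept above to match the rest of the script.)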
# Convert it into a dictionary {word: word_index} and discard rare words
i = 0
temp = {}
for w, f in words.iteritems():
    if f > 25:
        temp[w] = i
        i += 1
words = temp
# Build a matrix where every element [i,j] is the number of times
# the word having word_index=j appears during the time interval
# having index i
# Every time interval is 5 minutes long
counts = [[0.0] * len(words) for i in xrange(len(xaxis[5::5]))]
for start in xaxis[5::5]:
    index = xaxis.index(start) / 5 - 1
    for i in range(start - 5, start):
        for text in minutes.get(i, []):
            for w in re.findall(r'\w+', text):
                if w in words:
                    counts[index][words[w]] += 1
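# counts now has one row per 5-minute interval and one column per frequent
# word: e.g. counts[0][words['messi']] ('messi' is an illustrative word,
# assuming it survived the f > 25 cut) counts its mentions during the first
# five minutes of collected data.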
# Convert the dictionary of {word: word_index} into a list of words,
# where words[word_index] = word
temp = [0] * len(words)
for w, i in words.iteritems():
    temp[i] = w
words = temp
# Convert the matrix of counts to numpy
counts = np.matrix(counts)
# .. in order to easily compute means and standard deviations
# for every set of word frequencies
means = [counts[:,i].mean() for i in xrange(counts.shape[1])]
stds = [counts[:,i].std() for i in xrange(counts.shape[1])]
# Find spikes in word frequencies
# Computes a score = number of standard deviations from mean
awesome = {}
max_score = 0
for i in xrange(counts.shape[0]):
    for j in xrange(counts.shape[1]):  # one score per word column
        if stds[j] == 0:  # constant frequency, no spike possible
            continue
        score = (counts[i, j] - means[j]) / stds[j]
        if score > 3.5:
            x = xaxis[(i + 1) * 5]
            max_score = max(max_score, score)
            if x not in awesome:
                awesome[x] = []
            awesome[x].append((words[j], score))
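# Worked example: a word averaging 2 mentions per interval with std 1.5
# scores (10 - 2) / 1.5 = 5.33 in an interval with 10 mentions -- enough
# to clear the 3.5 cutoff and be recorded as a spike.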
# Create the chart
plt.figure()
# Plot the histogram
plt.fill_between(xaxis, hist, [0] * len(xaxis), facecolor='b', alpha=0.4)
plt.xlabel('Minutes from match start')
plt.ylabel('Tweets per minute')
plt.xticks([i for i in xaxis if i % 10 == 0])
# Shade the first and second halves (45 + 2 = first half plus stoppage time;
# the second half starts after a 15-minute break)
plt.axvspan(0, 45 + 2, facecolor='0.5', alpha=0.2)
plt.axvspan(45 + 2 + 15, 90 + 15 + 5, facecolor='0.5', alpha=0.2)
# Mark the goals with vertical red lines
plt.axvline(x=1, color='r')
plt.axvline(x=30, color='r')
plt.axvline(x=53 + 2 + 15, color='r')
plt.axvline(x=66 + 2 + 15, color='r')
# Plot the words, based on the time when they spiked
for x, x_words in awesome.iteritems():
    # Place the labels just above the tweet volume of the preceding interval
    y = max(hist[xaxis.index(x) - 5 : xaxis.index(x)])
    y = min(y, 50)
    for word, score in x_words:
        # Stronger spikes get more opaque text
        alpha = (score - 2) / (max_score - 2)
        plt.text(x - 8, y, word, alpha=alpha, rotation=45)
        y += 9  # stack multiple words at the same minute
plt.show()