Created December 11, 2011 20:15
Histogram and keyword detection for tweets during Real Madrid - Barcelona match
# more info at http://webmining.olariu.org/el-clasico-on-twitter
# this code is designed to be run in ipython
import urllib, urllib2, time, threading, Queue, re
from datetime import datetime
import simplejson as json
import matplotlib.pyplot as plt
import numpy as np

KEYWORDS = ['realmadrid', 'real madrid', 'fcbarcelona', 'barcelona',
            'forcabarca', 'el clasico', 'elclasico']
def fetch_url(url, get=None, post=None):
    user_agent = 'Andrei Olariu\'s Web Mining for Dummies'
    headers = {'User-Agent': user_agent}
    if get:
        data = urllib.urlencode(get)
        url = "%s?%s" % (url, data)
    req = urllib2.Request(url, post, headers)
    try:
        response = urllib2.urlopen(req).read()
        response = json.loads(response)
    except Exception as e:
        print 'error in reading %s: %s' % (url, e)
        return None
    return response
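
# Sketch of how fetch_url gets used (hypothetical endpoint, shown only to
# illustrate the calling convention; the real calls are below):
# data = fetch_url('http://example.com/api', get={'q': 'clasico'})
# # requests http://example.com/api?q=clasico and returns the parsed JSON,
# # or None on any network or JSON error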
def fetch_tweets():
    url = 'http://search.twitter.com/search.json'
    values = {
        'count': 100,
        'q': ' OR '.join(KEYWORDS) + ' -rt',
        'rpp': 100,
        'page': 1,
        'result_type': 'recent',
        'with_twitter_user_id': 'true',
        'lang': 'en',
    }
    response = fetch_url(url, values)
    if response and 'results' in response:
        return response['results']
    else:
        return []
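
# Each result is a dict of tweet fields; the only ones used below are 'id',
# 'text' and 'created_at', the last formatted like
# "Sat, 10 Dec 2011 21:03:17 +0000" (see the strptime call further down)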
def monitor_twitter():
    # 'keep_monitoring', 'queue' and 'interval' are globals defined below
    start_at = time.time()
    id_cache = set([])
    while keep_monitoring:
        batch = fetch_tweets()
        # keep only tweets not seen in a previous poll
        batch = [t for t in batch if t['id'] not in id_cache]
        print len(batch)
        id_cache.update([t['id'] for t in batch])
        for t in batch:
            queue.put(t)
        time.sleep(interval)
# Start monitoring Twitter
# Use a thread and a queue to save tweets - easier to work with in ipython
queue = Queue.Queue()
keep_monitoring = True
interval = 30
threading.Thread(target=monitor_twitter).start()
# Go watch the match and check every once in a while to see if I should
# adjust the interval

# Stop monitoring
keep_monitoring = False

# Get the tweets into a list
tweets = []
while queue.qsize():
    tweets.append(queue.get())
# Index tweets by minute relative to the start of the match
minutes = {}
start_of_match = datetime(2011, 12, 10, 21, 00, 00)
for t in tweets:
    t_time = datetime.strptime(t['created_at'], "%a, %d %b %Y %H:%M:%S +0000")
    minute = int((t_time - start_of_match).total_seconds() / 60)
    if minute not in minutes:
        minutes[minute] = []
    minutes[minute].append(t['text'].lower())
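
# e.g. a tweet created at 21:07:30 is 450 seconds in, so int(450 / 60.0)
# puts it in minute 7; a tweet from 20:58:00 lands in minute -2, so
# pre-match chatter simply gets negative keys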
# Build the histogram for tweet volume
hist = []
# ... and the x-axis labels (minutes relative to start of match)
xaxis = range(min(minutes.keys()), max(minutes.keys()) + 1)
for i in xaxis:
    hist.append(len(minutes.get(i, [])))
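
# hist and xaxis stay aligned: hist[k] is the tweet count for minute
# xaxis[k], with minutes that saw no tweets counted as zero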
# Build a dictionary {word: frequency} over all tweets,
# ignoring words of at most 3 characters
words = {}
for texts in minutes.itervalues():
    for text in texts:
        for w in re.findall(r'\w+', text):
            if len(w) > 3:
                words[w] = words.get(w, 0) + 1
# Convert it into a dictionary {word: word_index} and discard rare words
i = 0
temp = {}
for w, f in words.iteritems():
    if f > 25:
        temp[w] = i
        i += 1
words = temp
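
# The two steps above could also be written with collections.Counter
# (a sketch under the same tokenization rules, not what the gist ran):
# from collections import Counter
# freq = Counter(w for texts in minutes.itervalues()
#                  for text in texts
#                  for w in re.findall(r'\w+', text) if len(w) > 3)
# words = dict((w, i) for i, w in
#              enumerate(w for w, f in freq.iteritems() if f > 25))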
# Build a matrix where every element [i,j] is the number of times
# the word having word_index=j appears during the time interval
# having index i
# Every time interval is 5 minutes long
counts = [[0.0] * len(words) for i in xrange(len(xaxis[5::5]))]
for start in xaxis[5::5]:
    index = xaxis.index(start) / 5 - 1
    for i in range(start - 5, start):
        for text in minutes.get(i, []):
            for w in re.findall(r'\w+', text):
                if w in words:
                    counts[index][words[w]] += 1
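
# The index arithmetic: xaxis[5::5] picks the minutes at positions 5, 10,
# 15, ... of xaxis, so xaxis.index(start) is always a multiple of 5 and
# xaxis.index(start) / 5 - 1 maps successive intervals to rows 0, 1, 2, ...
# Each row therefore aggregates the five minutes just before 'start'.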
# Convert the dictionary of {word: word_index} into a list of words,
# where words[word_index] = word
temp = [0] * len(words)
for w, i in words.iteritems():
    temp[i] = w
words = temp

# Convert the matrix of counts to numpy
counts = np.matrix(counts)
# .. in order to easily compute means and standard deviations
# for every set of word frequencies
means = [counts[:,i].mean() for i in xrange(counts.shape[1])]
stds = [counts[:,i].std() for i in xrange(counts.shape[1])]
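
# The same per-column statistics can come from numpy's axis argument
# (equivalent, just more compact; np.asarray keeps the results 1-D):
# means = np.asarray(counts).mean(axis=0)
# stds = np.asarray(counts).std(axis=0)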
# Find spikes in word frequencies
# Computes a score = number of standard deviations from mean (a z-score)
awesome = {}
max_score = 0
for i in xrange(counts.shape[0]):
    for j in xrange(counts.shape[1]):  # shape[1]: iterate over words, not intervals
        score = (counts[i,j] - means[j]) / stds[j]
        if score > 3.5:
            # x is the minute at which 5-minute interval i ends
            x = xaxis[(i + 1) * 5]
            max_score = max(max_score, score)
            if x not in awesome:
                awesome[x] = []
            awesome[x].append((words[j], score))
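
# e.g. a word averaging 2 mentions per interval with a std of 1.5 that
# suddenly gets 9 mentions scores (9 - 2) / 1.5 = 4.67 > 3.5 and is flagged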
# Create the chart
plt.figure()

# Plot the histogram
plt.fill_between(xaxis, hist, [0] * len(xaxis), facecolor='b', alpha=0.4)
plt.xlabel('Minutes from match start')
plt.ylabel('Tweets per minute')
plt.xticks([i for i in xaxis if i % 10 == 0])

# Shade the first and second half: 45 + 2 minutes of stoppage time,
# a 15-minute break, then another 45 + 5 minutes of stoppage time
plt.axvspan(0, 45 + 2, facecolor='0.5', alpha=0.2)
plt.axvspan(45 + 2 + 15, 90 + 15 + 5, facecolor='0.5', alpha=0.2)

# Plot the goal lines (match minutes shifted by stoppage time and
# the halftime break where needed)
plt.axvline(x=1, color='r')
plt.axvline(x=30, color='r')
plt.axvline(x=53 + 2 + 15, color='r')
plt.axvline(x=66 + 2 + 15, color='r')
# Plot the words, based on the time when they spiked
for x, x_words in awesome.iteritems():
    # start just above the histogram over the 5 minutes before x,
    # capped so the labels don't start too high
    y = max(hist[xaxis.index(x) - 5 : xaxis.index(x)])
    y = min(y, 50)
    for word, score in x_words:
        # fade lower-scoring words
        alpha = (score - 2) / (max_score - 2)
        plt.text(x - 8, y, word, alpha=alpha, rotation=45)
        y += 9
plt.show()