Wikipedia/stock quote visualization with Python and matplotlib (gist fnielsen/2918911, June 12, 2012)
import urllib, urllib2
import simplejson as json
import dateutil.parser
import datetime
import matplotlib.dates
import matplotlib.finance
from matplotlib import pyplot as plt
import nltk.corpus
import numpy as np
import re
import copy
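# Note: this is a Python 2 script. It relies on urllib2, unicode() and
# tuple-unpacking lambdas, as well as matplotlib.finance, which was later
# removed from matplotlib; see the hedged port sketch at the end of the file.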
companies = {
    'Novo Nordisk': {'stock': 'NVO', 'wikipedia': 'Novo_Nordisk'},
    'Pfizer': {'stock': 'PFE', 'wikipedia': 'Pfizer'},
    }

filebase = '/home/fn'
# Sentiment word list
# AFINN-111 is as of June 2011 the most recent version of AFINN
filename_afinn = filebase + '/data/AFINN/AFINN-111.txt'
afinn = dict(map(lambda (w, s): (unicode(w, 'utf-8'), int(s)),
                 [ws.strip().split('\t') for ws in open(filename_afinn)]))
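# Each line of AFINN-111 is "word<TAB>score", with integer valence scores
# from -5 (most negative) to +5 (most positive).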
# Stopwords as a set for constant-time membership tests
stopwords = set(nltk.corpus.stopwords.words('english'))

# Word splitter pattern
pattern_split = re.compile(r"[^\w-]+", re.UNICODE)
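# The hyphen in the character class keeps hyphenated words intact, e.g.
# pattern_split.split('a well-known, good day!') gives
# ['a', 'well-known', 'good', 'day', ''] (the empty trailing token is
# harmless, as it scores 0).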
def sentiment(text, norm='sqrt'):
    """
    Sentiment analysis with the AFINN word list.

    Returns a dict with the keys 'sentiment', 'arousal', 'ambivalence',
    'positive' and 'negative', e.g., result = sentiment(text).
    """
    words_with_stopwords = pattern_split.split(text.lower())
    # Exclude stopwords:
    words = filter(lambda w: w not in stopwords, words_with_stopwords)
    sentiments = map(lambda word: afinn.get(word, 0), words)
    keys = ['sentiment', 'arousal', 'ambivalence', 'positive', 'negative']
    if sentiments:
        sentiments = np.asarray(sentiments).astype(float)
        sentiment = np.sum(sentiments)
        arousal = np.sum(np.abs(sentiments))
        ambivalence = arousal - np.abs(sentiment)
        positive = np.sum(np.where(sentiments > 0, sentiments, 0))
        negative = -np.sum(np.where(sentiments < 0, sentiments, 0))
        result = np.asarray([sentiment, arousal, ambivalence, positive, negative])
        if norm == 'mean':
            result /= len(sentiments)
        elif norm == 'sum':
            pass
        elif norm == 'sqrt':
            result /= np.sqrt(len(sentiments))
        else:
            raise ValueError("Wrong 'norm' argument: %r" % norm)
    else:
        result = (0, 0, 0, 0, 0)
    return dict(zip(keys, result))
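# Hedged usage sketch (not in the original gist). With AFINN-111 scoring
# good=+3 and bad=-3, and the NLTK stopwords removing 'not', 'this' and 'is',
# the scored words are [3, -3]:
#
#     >>> sentiment('Not good. This is bad!', norm='sum')
#     {'sentiment': 0.0, 'arousal': 6.0, 'ambivalence': 6.0,
#      'positive': 3.0, 'negative': 3.0}
#
# Note how the negation in 'not good' is lost: a word list scores each word
# in isolation.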
today = datetime.date.today()

# Matplotlib x-axis date formatting
days_locations = matplotlib.dates.DayLocator()
months_locations = matplotlib.dates.MonthLocator()
months_formatter = matplotlib.dates.DateFormatter("%Y %b")
# Prepare URL and download for Wikipedia
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Finn Aarup Nielsen, +45 45 25 39 21')]
urlbase = "http://en.wikipedia.org/w/api.php?"
for company, fields in companies.items():
    wikipedia_revisions = []
    urlparam = {'action': 'query',
                'format': 'json',
                'prop': 'revisions',
                'rvlimit': 50,
                'rvprop': 'ids|timestamp|content',
                'titles': fields['wikipedia']}
    # Fetch up to 7 batches of 50 revisions, following the MediaWiki
    # 'query-continue' element for pagination
    for i in range(7):
        url = urlbase + urllib.urlencode(urlparam)
        wikipedia_result = json.load(opener.open(url))
        wikipedia_revisions.extend(
            wikipedia_result['query']['pages'].values()[0]['revisions'])
        print("%s: %d" % (company, len(wikipedia_revisions)))
        if 'query-continue' in wikipedia_result:
            urlparam.update(wikipedia_result['query-continue']['revisions'])
        else:
            break
    wikipedia_last_timestamp = wikipedia_revisions[-1]['timestamp']
    wikipedia_last_datetime = dateutil.parser.parse(wikipedia_last_timestamp)
    wikipedia_last_date = wikipedia_last_datetime.date()

    # Score the wikitext (the '*' field) of each revision
    for n, revision in enumerate(wikipedia_revisions):
        wikipedia_revisions[n].update(sentiment(revision['*']))
    companies[company].update(
        {'wikipedia_revisions': copy.deepcopy(wikipedia_revisions)})
    companies[company].update(
        {'quotes': matplotlib.finance.quotes_historical_yahoo(
            fields['stock'], wikipedia_last_date, today)})
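    # In the matplotlib versions of this era, quotes_historical_yahoo returns
    # a list of (date_num, open, close, high, low, volume) tuples, which is
    # the order matplotlib.finance.candlestick expects below.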
    xaxis_range = (matplotlib.dates.date2num(wikipedia_last_date),
                   matplotlib.dates.date2num(today))

    # Two panels: candlestick stock plot on top, Wikipedia sentiment below
    fig = plt.figure()
    for i in range(1, 3):
        ax = fig.add_subplot(2, 1, i)
        ax.xaxis.set_major_locator(months_locations)
        ax.xaxis.set_minor_locator(days_locations)
        ax.xaxis.set_major_formatter(months_formatter)
        if i == 1:
            quotes = companies[company]['quotes']
            h = matplotlib.finance.candlestick(ax, quotes)
            h = plt.ylabel('Stock price')
            h = plt.title(company)
        else:
            x = map(lambda fs: matplotlib.dates.date2num(
                dateutil.parser.parse(fs['timestamp'])), wikipedia_revisions)
            y = map(lambda fs: fs['sentiment'], wikipedia_revisions)
            h = plt.plot(x, y)
            h = plt.xlabel('Date')
            h = plt.ylabel('Wikipedia sentiment')
        h = ax.set_xlim(xaxis_range)
    fig.autofmt_xdate()

plt.show()
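# ----------------------------------------------------------------------------
# Hedged port sketch (not part of the original gist): matplotlib.finance is
# gone from modern matplotlib, and quotes_historical_yahoo stopped working
# after Yahoo changed its finance API. Under Python 3, the stock download and
# candlestick plot could be done with the third-party yfinance and mplfinance
# packages, roughly:
#
#     import yfinance as yf
#     import mplfinance as mpf
#     ohlc = yf.download('NVO', start='2012-01-01', end='2012-06-12')
#     mpf.plot(ohlc, type='candle', title='Novo Nordisk')
# ----------------------------------------------------------------------------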