Wikipedia/stock quote visualization with Python and matplotlib (gist fnielsen/2918911, June 12, 2012)
import urllib, urllib2
import simplejson as json
import dateutil.parser
import datetime
import matplotlib.dates
import matplotlib.finance
from matplotlib import pyplot as plt
import nltk.corpus
import numpy as np
import re
import copy
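# Note: this is a Python 2 script. It relies on urllib2, unicode() and
# tuple-unpacking lambdas, as well as matplotlib.finance, which was later
# removed from matplotlib; see the hedged port sketch at the end of the file.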
companies = {
    'Novo Nordisk': {'stock': 'NVO', 'wikipedia': 'Novo_Nordisk'},
    'Pfizer': {'stock': 'PFE', 'wikipedia': 'Pfizer'},
    }

filebase = '/home/fn'
# Sentiment word list
# AFINN-111 is as of June 2011 the most recent version of AFINN
filename_afinn = filebase + '/data/AFINN/AFINN-111.txt'
afinn = dict(map(lambda (w, s): (unicode(w, 'utf-8'), int(s)),
                 [ws.strip().split('\t') for ws in open(filename_afinn)]))
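# Each line of AFINN-111 is "word<TAB>score", with integer valence scores
# from -5 (most negative) to +5 (most positive).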
# Stopwords as a set for constant-time membership tests
stopwords = set(nltk.corpus.stopwords.words('english'))

# Word splitter pattern
pattern_split = re.compile(r"[^\w-]+", re.UNICODE)
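# The hyphen in the character class keeps hyphenated words intact, e.g.
# pattern_split.split('a well-known, good day!') gives
# ['a', 'well-known', 'good', 'day', ''] (the empty trailing token is
# harmless, as it scores 0).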
def sentiment(text, norm='sqrt'):
    """
    Sentiment analysis with the AFINN word list.

    Returns a dict with the keys 'sentiment', 'arousal', 'ambivalence',
    'positive' and 'negative', e.g., result = sentiment(text).
    """
    words_with_stopwords = pattern_split.split(text.lower())
    # Exclude stopwords:
    words = filter(lambda w: w not in stopwords, words_with_stopwords)
    sentiments = map(lambda word: afinn.get(word, 0), words)
    keys = ['sentiment', 'arousal', 'ambivalence', 'positive', 'negative']
    if sentiments:
        sentiments = np.asarray(sentiments).astype(float)
        sentiment = np.sum(sentiments)
        arousal = np.sum(np.abs(sentiments))
        ambivalence = arousal - np.abs(sentiment)
        positive = np.sum(np.where(sentiments > 0, sentiments, 0))
        negative = -np.sum(np.where(sentiments < 0, sentiments, 0))
        result = np.asarray([sentiment, arousal, ambivalence, positive, negative])
        if norm == 'mean':
            result /= len(sentiments)
        elif norm == 'sum':
            pass
        elif norm == 'sqrt':
            result /= np.sqrt(len(sentiments))
        else:
            raise ValueError("Wrong 'norm' argument: %r" % norm)
    else:
        result = (0, 0, 0, 0, 0)
    return dict(zip(keys, result))
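# Hedged usage sketch (not in the original gist). With AFINN-111 scoring
# good=+3 and bad=-3, and the NLTK stopwords removing 'not', 'this' and 'is',
# the scored words are [3, -3]:
#
#     >>> sentiment('Not good. This is bad!', norm='sum')
#     {'sentiment': 0.0, 'arousal': 6.0, 'ambivalence': 6.0,
#      'positive': 3.0, 'negative': 3.0}
#
# Note how the negation in 'not good' is lost: a word list scores each word
# in isolation.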
today = datetime.date.today()

# Matplotlib x-axis date formatting
days_locations = matplotlib.dates.DayLocator()
months_locations = matplotlib.dates.MonthLocator()
months_formatter = matplotlib.dates.DateFormatter("%Y %b")
# Prepare URL and download for Wikipedia
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Finn Aarup Nielsen, +45 45 25 39 21')]
urlbase = "http://en.wikipedia.org/w/api.php?"
for company, fields in companies.items():
    wikipedia_revisions = []
    urlparam = {'action': 'query',
                'format': 'json',
                'prop': 'revisions',
                'rvlimit': 50,
                'rvprop': 'ids|timestamp|content',
                'titles': fields['wikipedia']}
    # Fetch up to 7 batches of 50 revisions, following the MediaWiki
    # 'query-continue' element for pagination
    for i in range(7):
        url = urlbase + urllib.urlencode(urlparam)
        wikipedia_result = json.load(opener.open(url))
        wikipedia_revisions.extend(
            wikipedia_result['query']['pages'].values()[0]['revisions'])
        print("%s: %d" % (company, len(wikipedia_revisions)))
        if 'query-continue' in wikipedia_result:
            urlparam.update(wikipedia_result['query-continue']['revisions'])
        else:
            break
    wikipedia_last_timestamp = wikipedia_revisions[-1]['timestamp']
    wikipedia_last_datetime = dateutil.parser.parse(wikipedia_last_timestamp)
    wikipedia_last_date = wikipedia_last_datetime.date()

    # Score the wikitext (the '*' field) of each revision
    for n, revision in enumerate(wikipedia_revisions):
        wikipedia_revisions[n].update(sentiment(revision['*']))
    companies[company].update(
        {'wikipedia_revisions': copy.deepcopy(wikipedia_revisions)})
    companies[company].update(
        {'quotes': matplotlib.finance.quotes_historical_yahoo(
            fields['stock'], wikipedia_last_date, today)})
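    # In the matplotlib versions of this era, quotes_historical_yahoo returns
    # a list of (date_num, open, close, high, low, volume) tuples, which is
    # the order matplotlib.finance.candlestick expects below.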
    xaxis_range = (matplotlib.dates.date2num(wikipedia_last_date),
                   matplotlib.dates.date2num(today))

    # Two panels: candlestick stock plot on top, Wikipedia sentiment below
    fig = plt.figure()
    for i in range(1, 3):
        ax = fig.add_subplot(2, 1, i)
        ax.xaxis.set_major_locator(months_locations)
        ax.xaxis.set_minor_locator(days_locations)
        ax.xaxis.set_major_formatter(months_formatter)
        if i == 1:
            quotes = companies[company]['quotes']
            h = matplotlib.finance.candlestick(ax, quotes)
            h = plt.ylabel('Stock price')
            h = plt.title(company)
        else:
            x = map(lambda fs: matplotlib.dates.date2num(
                dateutil.parser.parse(fs['timestamp'])), wikipedia_revisions)
            y = map(lambda fs: fs['sentiment'], wikipedia_revisions)
            h = plt.plot(x, y)
            h = plt.xlabel('Date')
            h = plt.ylabel('Wikipedia sentiment')
        h = ax.set_xlim(xaxis_range)
    fig.autofmt_xdate()

plt.show()
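# ----------------------------------------------------------------------------
# Hedged port sketch (not part of the original gist): matplotlib.finance is
# gone from modern matplotlib, and quotes_historical_yahoo stopped working
# after Yahoo changed its finance API. Under Python 3, the stock download and
# candlestick plot could be done with the third-party yfinance and mplfinance
# packages, roughly:
#
#     import yfinance as yf
#     import mplfinance as mpf
#     ohlc = yf.download('NVO', start='2012-01-01', end='2012-06-12')
#     mpf.plot(ohlc, type='candle', title='Novo Nordisk')
# ----------------------------------------------------------------------------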