Created
July 12, 2011 06:24
-
-
Save ketralnis/1077498 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
import matplotlib | |
matplotlib.use('Agg') | |
import matplotlib.pyplot as plt # For plotting graphs. | |
from contextlib import contextmanager | |
from collections import namedtuple | |
import urllib | |
import json | |
import os, os.path | |
import random | |
from votemining import to36 | |
class data(object):
    """Iterator over a comma-separated data file: one list of fields per line.

    f     -- path of the file to read
    types -- optional sequence of converter callables (e.g. (int, float))
             applied positionally to each line's fields; extra fields beyond
             len(types) are silently dropped by zip().  Pass a falsy value
             to get the raw string fields.

    The underlying file is closed when iteration is exhausted.
    """
    def __init__(self, f, types):
        self.f = open(f)
        self.i = iter(self.f)
        self.types = types
    def __iter__(self):
        return self
    def next(self):
        """Return the parsed fields of the next line.

        Raises StopIteration at EOF, closing the file first.
        """
        try:
            # the next() builtin works on both Python 2 and 3 iterators,
            # unlike the Python-2-only .next() method call
            line = next(self.i)
        except StopIteration:
            self.f.close()
            raise
        fields = line.strip().split(',')
        if self.types:
            fields = [conv(field) for (conv, field) in zip(self.types, fields)]
        return fields
    # Python 3 spells the iterator-protocol method __next__
    __next__ = next
    def all(self):
        """Consume the iterator and return every remaining row as a list."""
        return list(self)
def memoize(fn):
    """Decorator: cache fn's results, keyed on its positional arguments.

    Only positional arguments are supported (and they must be hashable);
    the wrapper does not accept keyword arguments.  The cache is unbounded
    and lives for the lifetime of the process.
    """
    memo = {}
    def _fn(*a):
        # *a is already a tuple, so it can key the dict directly
        try:
            return memo[a]
        except KeyError:
            ret = memo[a] = fn(*a)
            return ret
    return _fn
@memoize | |
def reddit_title(linkid): | |
id36 = to36(linkid) | |
url = 'http://www.reddit.com/by_id/t3_%s.json' % to36(linkid) | |
js = json.loads(urllib.urlopen(url).read()) | |
try: | |
title = js['data']['children'][0]['data']['title'] | |
except IndexError: | |
return '(unknown)' | |
print linkid, '->', title | |
return title | |
@contextmanager
def csvplot(fname, pngname, types = []):
    """Yield (data(fname, types), plt); on clean exit save the current
    figure to data/pngs/<pngname>.png.

    The figure is cleared on exit even when the body raises, so a failed
    plot cannot leak state into the next one.  (`types` is never mutated,
    so the mutable default is harmless.)
    """
    d = data(fname, types)
    try:
        yield (d, plt)
        plt.savefig('data/pngs/' + pngname + '.png')
    finally:
        plt.clf()
@contextmanager
def plot(pngname):
    """Yield plt; on clean exit save the current figure to
    data/pngs/<pngname>.png.

    The figure is cleared on exit even when the body raises, so a failed
    plot cannot leak state into the next one.
    """
    try:
        yield plt
        plt.savefig('data/pngs/' + pngname + '.png')
    finally:
        plt.clf()
# Pie chart of the overall down/none/up vote split.
with csvplot('data/dirhist.csv', 'dirhist', (int, float)) as (d, plt):
    plt.title('Vote directions')
    counts = dict(d)  # rows are (direction, count)
    plt.pie([counts[-1], counts[0], counts[1]],
            labels=['downs', 'nones', 'ups'],
            autopct='%1.1f%%')
# Bar chart of vote volume by hour of day.
with csvplot('data/timeofdayhist.csv', 'timeofdayhist', (int, int)) as (d, plt):
    plt.title('Number of votes per hour')
    plt.ylabel('# votes')
    plt.xlabel('hour in GMT')
    rows = d.all()
    hours = [hour for hour, _ in rows]
    counts = [count for _, count in rows]
    plt.bar(hours, counts)
    # centre each tick label under its bar
    plt.xticks([h + 0.5 for h in hours], hours)
    plt.xlim(hours[0], hours[-1] + 1)
# Histogram of per-link scores, bucketed by score range.
with csvplot('data/scorehist.csv', 'scorehist', (int, int)) as (d, plt):
    plt.title('Histogram of score per link')
    plt.ylabel('# links')
    plt.xlabel('score ranges')
    rows = d.all()
    xs, ys, ticks, labels = [], [], [], []
    # each bucket's label spans from its own minimum to the next bucket's
    # minimum; zip(rows, rows[1:]) drops the final (open-ended) bucket
    for n, ((lo, count), (hi, _)) in enumerate(zip(rows, rows[1:])):
        xs.append(n)
        ys.append(count)
        ticks.append(n + 0.5)
        labels.append('<=0' if lo == -1 else '(%s..%s]' % (lo, hi))
    plt.bar(xs, ys)
    plt.xticks(ticks, labels, rotation=30, size='small')
# Histogram of how many votes each link received, bucketed by range.
with csvplot('data/numvoteshist.csv', 'numvoteshist', (int, int)) as (d, plt):
    plt.title('Histogram of number of votes received per link')
    plt.ylabel('# links')
    plt.xlabel('# of votes')
    rows = d.all()
    xs, ys, ticks, labels = [], [], [], []
    # label each bucket with its (min..next-min] range; the final
    # open-ended bucket is dropped by zip(rows, rows[1:])
    for n, ((lo, count), (hi, _)) in enumerate(zip(rows, rows[1:])):
        xs.append(n)
        ys.append(count)
        ticks.append(n + 0.5)
        if lo == -1:
            # shouldn't happen for vote counts
            labels.append('<=0')
        else:
            labels.append('(%s..%s]' % (lo, hi))
    plt.bar(xs, ys)
    plt.xticks(ticks, labels, rotation=30, size='small')
# now let's pick some links at random to analyse.
# data/scorebytime/<hour>/<linkid>.* -> map linkid to its data file path
d = 'data/scorebytime'
links = {}
for hourdir in os.listdir(d):
    subdir = os.path.join(d, hourdir)
    for fname in os.listdir(subdir):
        links[int(fname.split('.')[0])] = os.path.join(subdir, fname)
# ten charts, each overlaying the score-over-time curves of 100 random links
for x in range(10):
    picks = random.sample(links.keys(), 100)
    with plot('randommulti_%d' % x) as plt:
        plt.title('Progression of scores of some random links over time')
        plt.xlabel('timestamp')
        plt.ylabel('score')
        # build the flat [xs1, ys1, xs2, ys2, ...] argument list plot() expects
        series = []
        for pick in picks:
            points = data(links[pick], (float, int)).all()
            series.append([t for t, _ in points])
            series.append([s for _, s in points])
        # TODO: titles
        plt.plot(*series)
# the few with the largest numbers of votes (file size is a proxy for
# number of recorded score snapshots)
largest = sorted(links.keys(),
                 key=lambda linkid: os.stat(links[linkid]).st_size,
                 reverse=True)[:15]
for x in largest:
    with plot('choicesingle_%d' % x) as plt:
        plt.title('Progression of a link over time: ' + reddit_title(x))
        plt.xlabel('timestamp')
        plt.ylabel('score')
        points = data(links[x], (float, int)).all()
        plt.plot([t for t, _ in points], [s for _, s in points])
# one chart overlaying the score curves of the most-voted-on links
with plot('largestmulti_score') as plt:
    plt.title('Progression of some larged-scored links over time')
    plt.xlabel('timestamp')
    plt.ylabel('score')
    # TODO: titles in legend?
    # flat [xs1, ys1, xs2, ys2, ...] argument list for a single plot() call
    series = []
    for linkid in largest:
        points = data(links[linkid], (float, int)).all()
        series.append([t for t, _ in points])
        series.append([s for _, s in points])
    plt.plot(*series)
# Replay of front-page rank movement over several window lengths.
Graph = namedtuple('Graph', ('xs', 'ys'))
respect_top = 25  # only chart links that reach the top ranks
# we have a three-day dataset; this starts at the second day
after = 1309832976.63
before = lambda h: after+(h*60*60)  # end of the window, h hours past `after`
for hours in 6, 12, 24, 48:
    with plot('chunkedreplay_%d' % hours) as plt:
        # TODO: labels?
        plt.title('Progression of the front page over %d hours' % hours)
        plt.xlabel('timestamp')
        plt.ylabel('rank')
        # linkid -> Graph of (timestamps, 1-indexed ranks) within the window
        clinks = {}
        for timestamp, linkid, hot, rank in data('data/snapshots.csv', [float, int, float, int]):
            # keep only snapshots inside the time window and near the top.
            # NOTE(review): `rank` is still 0-indexed here (see the += 1
            # below), so this keeps ranks 0..respect_top, i.e. the top
            # respect_top+1 links — confirm whether `>=` was intended.
            if rank > respect_top or timestamp < after or timestamp > before(hours):
                continue
            if linkid not in clinks:
                clinks[linkid] = Graph([], [])
            clinks[linkid].xs.append(timestamp)
            # this was 0-indexed on generation
            rank += 1
            clinks[linkid].ys.append(rank)
        # flatten to the [xs1, ys1, xs2, ys2, ...] form plot() expects
        graphs = []
        for linkid, g in clinks.iteritems():
            graphs.append(g.xs)
            graphs.append(g.ys)
        plt.plot(*graphs)
        # reverse the y axis so rank 1 sits at the top
        # (this must be done after the graph has been plotted)
        ax = plt.gca()
        ax.set_ylim([respect_top,1])
        plt.yticks(range(1, respect_top))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment