Created
December 13, 2012 12:44
-
-
Save ndpar/4276136 to your computer and use it in GitHub Desktop.
Analyzing popularity on StackOverflow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2 | |
import zlib | |
import json | |
import pymongo | |
import sys | |
# Base address shared by every Stack Overflow API (v1.1) request.
URL = 'http://api.stackoverflow.com/1.1'
# Answers endpoint template; '{0}' is replaced by the question id.
ANSWERS = ''.join((URL, '/questions/{0}/answers'))
def load_url(url):
    """Fetch *url* and return its gzip-compressed JSON body as Python objects.

    Args:
        url: Address to request with ``urllib2.urlopen``.

    Returns:
        Whatever ``json.loads`` produces for the decompressed body.

    Raises:
        urllib2.URLError: if the request fails.
        zlib.error: if the body is not valid gzip data.
        ValueError: if the decompressed body is not valid JSON.
    """
    from contextlib import closing

    # urllib2 responses are not context managers on Python 2, so closing()
    # is what guarantees the socket is released even when read() fails.
    with closing(urllib2.urlopen(url)) as reply:
        compressed = reply.read()
    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    response = zlib.decompress(compressed, 16 + zlib.MAX_WBITS)
    return json.loads(response)
def answers(qid):
    """Return the decoded API response for the answers to question *qid*."""
    return load_url(ANSWERS.format(qid))
def update(coll, id, answers):
    """Set the ``answers`` field of the document in *coll* whose ``_id`` is *id*."""
    selector = {'_id': id}
    change = {'$set': {'answers': answers}}
    coll.update(selector, change)
def main():
    """Attach answers to every stored question that does not have them yet.

    Connects to the MongoDB server at localhost:27017, selects the documents
    of ``test.stackoverflow`` that lack an ``answers`` field, downloads the
    answers of each question and stores them on the document.  The first
    error stops the run and is reported on standard output.
    """
    c = pymongo.MongoClient(host='mongodb://localhost:27017', w=1, j=True)
    try:
        coll = c.test.stackoverflow
        pending = coll.find({'answers': {'$exists': False}})
        for q in pending:
            qid = q['_id']
            update(coll, qid, answers(qid)['answers'])
    except Exception:
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        print('Error trying to write to collection %s' % (sys.exc_info(),))
    finally:
        # Release the connection whether or not the run succeeded.
        c.close()


main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2 | |
import zlib | |
import json | |
import math | |
import pymongo | |
import sys | |
# Base address shared by every Stack Overflow API (v1.1) request.
URL = 'http://api.stackoverflow.com/1.1'
# Search-by-tag endpoint template: '{0}' is the tag, '{1}' the page number.
TAGGED = ''.join((URL, '/search?tagged={0}&page={1}'))
def load_url(url):
    """Fetch *url* and return its gzip-compressed JSON body as Python objects.

    Args:
        url: Address to request with ``urllib2.urlopen``.

    Returns:
        Whatever ``json.loads`` produces for the decompressed body.

    Raises:
        urllib2.URLError: if the request fails.
        zlib.error: if the body is not valid gzip data.
        ValueError: if the decompressed body is not valid JSON.
    """
    from contextlib import closing

    # urllib2 responses are not context managers on Python 2, so closing()
    # is what guarantees the socket is released even when read() fails.
    with closing(urllib2.urlopen(url)) as reply:
        compressed = reply.read()
    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    response = zlib.decompress(compressed, 16 + zlib.MAX_WBITS)
    return json.loads(response)
def load_page(tag, page):
    """Return one decoded page of search results for questions tagged *tag*.

    Args:
        tag: Tag to search for; it is percent-encoded before being placed
            in the query string.
        page: Page number to request.

    Returns:
        The decoded JSON response produced by ``load_url``.
    """
    import urllib

    # Without encoding, a tag such as 'c#' would cut the query off at the
    # fragment marker and 'c++' would be read by the server as 'c  '.
    # Plain alphanumeric tags are left unchanged by quote().
    url = TAGGED.format(urllib.quote(str(tag), safe=''), page)
    return load_url(url)
def concat(lists, initializer=None):
    """Concatenate the sequences in *lists* into one new list.

    Args:
        lists: Iterable of lists to join, in order.
        initializer: Optional items that come first in the result.  It is
            copied, never modified or returned.

    Returns:
        A fresh list holding the items of *initializer* followed by the
        items of every element of *lists*.
    """
    # Start from a private copy: a shared default (or the caller's own
    # list) must never be handed back, or later mutations would leak
    # between calls.
    joined = [] if initializer is None else list(initializer)
    for chunk in lists:
        # extend() keeps the whole operation linear in the total length.
        joined.extend(chunk)
    return joined
def load_pages(tag):
    """Download every result page for *tag* and return them in page order.

    The first page is fetched to read its ``total`` and ``pagesize`` fields,
    which determine how many further pages are requested.
    """
    first = load_page(tag, 1)
    total = first['total']
    per_page = first['pagesize']
    # Float division so that a partially filled last page is rounded up.
    page_count = int(math.ceil(1.0 * total / per_page))
    pages = [first]
    for number in range(2, page_count + 1):
        pages.append(load_page(tag, number))
    return pages
def questions(tag):
    """Return the ``questions`` entries of every result page for *tag* as one list."""
    per_page = [page['questions'] for page in load_pages(tag)]
    return concat(per_page)
def insert(coll, records):
    """Save each record into *coll*, using its ``question_id`` as the ``_id``.

    Every record is modified in place (it gains an ``_id`` key) and then
    passed to ``coll.save`` before the next record is touched.
    """
    for record in records:
        key = record['question_id']
        record['_id'] = key
        coll.save(record)
def main(tags):
    """Download the questions for each tag and store them in MongoDB.

    Connects to the MongoDB server at localhost:27017 and saves every
    question found for each entry of *tags* into ``test.stackoverflow``.
    The first error stops the run and is reported on standard output.

    Args:
        tags: Iterable of tag names to download.
    """
    c = pymongo.MongoClient(host='mongodb://localhost:27017', w=1, j=True)
    try:
        coll = c.test.stackoverflow
        for tag in tags:
            insert(coll, questions(tag))
    except Exception:
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        print('Error trying to write to collection %s' % (sys.exc_info(),))
    finally:
        # Release the connection whether or not the run succeeded.
        c.close()


main(['activemq', 'rabbitmq', 'zeromq', 'hornetq'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pymongo | |
import sys | |
import matplotlib.pyplot as plt | |
from datetime import date | |
from collections import defaultdict | |
def aggregate(query):
    """Run an aggregation pipeline against ``test.stackoverflow``.

    Args:
        query: Aggregation pipeline (list of stage documents).

    Returns:
        The ``result`` entry of the aggregation response, or an empty list
        when the query fails (the error is reported on standard output).
    """
    c = pymongo.MongoClient(host='mongodb://localhost:27017', w=1, j=True)
    try:
        return c.test.stackoverflow.aggregate(query)['result']
    except Exception:
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        print('Error trying to query collection %s' % (sys.exc_info(),))
        # An empty result keeps callers that iterate over the rows working
        # instead of failing later with a TypeError on None.
        return []
    finally:
        # A new client is opened per call, so it must be closed per call.
        c.close()
def bucket(timestamp):
    """Map a Unix timestamp to a 1-based month index counted from January 2008.

    The month is taken in UTC so the result does not depend on the timezone
    of the machine running the script.

    Args:
        timestamp: Seconds since the Unix epoch.

    Returns:
        ``12 * (year - 2008) + month``; January 2008 is 1, December 2012 is 60.
    """
    import time

    moment = time.gmtime(timestamp)
    return 12 * (moment.tm_year - 2008) + moment.tm_mon
def frequencies(buckets):
    """Count how many times each value occurs in *buckets*.

    Returns:
        A ``defaultdict(int)`` mapping each distinct value to its count.
    """
    counts = defaultdict(int)
    for key in buckets:
        counts[key] = counts[key] + 1
    return counts
def sparse_array(freqs, size=60):
    """Expand a ``{bucket: count}`` mapping into a fixed-length list of counts.

    Bucket ``b`` is stored at index ``b - 1``; positions with no entry stay 0.
    Buckets outside ``1..size`` are ignored: previously a bucket above the
    list length raised IndexError and a bucket of 0 or less silently wrapped
    around to the end of the list.

    Args:
        freqs: Mapping from 1-based bucket number to count.
        size: Length of the returned list.  The default of 60 covers January
            2008 through December 2012 when the keys come from ``bucket()``.

    Returns:
        A list of *size* integers.
    """
    result = [0] * size
    for f in freqs:
        if 1 <= f <= size:
            result[f - 1] += freqs[f]
    return result
def trends():
    """Return, for each broker tag, its list of per-month question counts.

    The aggregation unwinds the ``tags`` array, groups the ``creation_date``
    values by tag and keeps only the four message-broker tags.  Each group's
    dates are then bucketed by month and expanded into a list of counts.

    Returns:
        A dict mapping tag name to the list built by ``sparse_array``.
    """
    brokers = ['activemq', 'rabbitmq', 'zeromq', 'hornetq']
    pipeline = [
        {'$unwind': '$tags'},
        {'$group': {'_id': '$tags', 'published': {'$push': '$creation_date'}}},
        {'$match': {'_id': {'$in': brokers}}},
    ]

    def monthly_counts(timestamps):
        # timestamps -> month buckets -> counts per bucket -> dense list
        return sparse_array(frequencies([bucket(t) for t in timestamps]))

    return dict((row['_id'], monthly_counts(row['published']))
                for row in aggregate(pipeline))
def plot(trends):
    """Draw one line per tag and show the figure.

    Args:
        trends: Mapping from tag name to a list of per-month counts.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for e in trends:
        # The final bucket is left out of the plot.
        # NOTE(review): presumably because the last month is incomplete - confirm.
        series = trends[e][:-1]
        # Derive the x range from the data instead of assuming 59 points,
        # so series of any length plot without a shape mismatch.
        ax.plot(range(len(series)), series, label=e)
    handles, labels = ax.get_legend_handles_labels()
    # loc=2 is matplotlib's numeric code for 'upper left'.
    ax.legend(handles, labels, loc=2)
    plt.show()


plot(trends())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment