Last active
August 26, 2019 00:28
-
-
Save zacharysyoung/43558ee676bb4f7d366c02840323f228 to your computer and use it in GitHub Desktop.
Get revisions for a Wiki article, in this case, 'Once Upon a Time in Hollywood'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
I got really interested in how many edits 'Once Upon a Time in Hollywood' got after
reading about the contentious issue of whether or not the Plot section should include
information deemed to be a spoiler, and the kinda-heated debate on the Talk page.
This gist also serves as my own documentation for using the MediaWiki query/revisions API.
""" | |
import json | |
import pickle | |
import pprint | |
from urllib import parse, request | |
from collections import Counter | |
# Local cache file: download() pickles the list of revision timestamps here,
# and the script below reads it back instead of hitting the API again.
revs_pkl = 'revs.pkl'
# Once Upon a Time in Hollywood
# Kept as a string: it is sent as the `pageids` query parameter and is also
# used as the key into the `pages` mapping of the JSON response.
pageid = '56717294'
def download():
    """Fetch every revision timestamp for ``pageid`` and pickle them to ``revs_pkl``.

    Pages through the ``action=query&prop=revisions`` API on en.wikipedia.org,
    requesting only the ``timestamp`` property, and follows the ``continue``
    object returned with each batch until the server stops sending one.  The
    collected list of timestamp strings is then written to ``revs_pkl``.

    A running request number is printed before each request as progress output.

    Raises:
        RuntimeError: if the API replies with an ``error`` object, or the
            reply has no ``revisions`` for ``pageid`` (e.g. unknown page ID).
        urllib.error.URLError: if an HTTP request fails or times out.
    """
    base_path = 'https://en.wikipedia.org/w/api.php?%s'
    params = {
        'action': 'query',
        'prop': 'revisions',
        'pageids': pageid,
        'rvprop': 'timestamp',
        'rvlimit': 'max',
        'format': 'json',
    }
    # Identify the client explicitly: urllib's generic default User-Agent is
    # the kind Wikimedia's User-Agent policy says may be blocked.
    headers = {
        'User-Agent': 'wiki-revisions-gist/1.0 '
                      '(https://gist.github.com/zacharysyoung/'
                      '43558ee676bb4f7d366c02840323f228)',
    }

    revs = []
    i = 1
    while True:
        print(i)
        req = request.Request(base_path % parse.urlencode(params),
                              headers=headers)
        # Bound the wait so a stalled connection cannot hang the script.
        with request.urlopen(req, timeout=30) as f:
            d = json.load(f)

        if 'error' in d:
            err = d['error']
            info = err.get('info', err) if isinstance(err, dict) else err
            raise RuntimeError('API error: %s' % info)

        page = d.get('query', {}).get('pages', {}).get(pageid, {})
        if 'revisions' not in page:
            raise RuntimeError('No revisions returned for page id %s' % pageid)
        revs.extend(x['timestamp'] for x in page['revisions'])

        if 'continue' not in d:
            break
        # Merge the whole continuation object (not just 'rvcontinue') into the
        # next request, as the API's continuation protocol specifies.
        params.update(d['continue'])
        i += 1

    with open(revs_pkl, 'wb') as f:
        pickle.dump(revs, f)
def _load_revs():
    """Return the list of revision timestamps unpickled from ``revs_pkl``."""
    print('Trying to read %s' % revs_pkl)
    # Open read-only ('rb'): reading the cache must not require write
    # permission on the file.
    # NOTE: pickle is only safe here because revs_pkl is a local cache written
    # by download(); never point this at a file from an untrusted source.
    with open(revs_pkl, 'rb') as f:
        return pickle.load(f)


# Use the cached revisions when available; otherwise download them once and
# read the freshly written cache.
try:
    revs = _load_revs()
except OSError:
    print('Couldn\'t find %s, downloading...' % revs_pkl)
    download()
    revs = _load_revs()

print('%d revisions' % len(revs))

# For an RFC 3339 datetime, the first 13 characters are the full date plus the
# hour, so counting on that prefix buckets the edits per hour.
counter = Counter(ts[:13] for ts in revs)

print('10 edits/hour, or more')
pprint.pprint([(hour, n) for hour, n in counter.most_common() if n >= 10])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment