Skip to content

Instantly share code, notes, and snippets.

@zacharysyoung
Last active August 26, 2019 00:28
Show Gist options
  • Save zacharysyoung/43558ee676bb4f7d366c02840323f228 to your computer and use it in GitHub Desktop.
Save zacharysyoung/43558ee676bb4f7d366c02840323f228 to your computer and use it in GitHub Desktop.
Get revisions for a Wiki article, in this case, 'Once Upon a Time in Hollywood'
"""
I got really interested in how many edits 'Once Upon a Time in Hollywood' got after
reading about the contentious issue of whether or not the Plot section should include
information deemed to be a spoiler, and the kinda-heated debate on the Talk page.
This gist also serves as my own documentation for using the MediaWiki Action API's query/revisions module.
"""
import json
import pickle
import pprint
from urllib import parse, request
from collections import Counter
# Local cache file: the pickled list of revision timestamps is written here
# after a download and read back on later runs.
revs_pkl = 'revs.pkl'
# Wikipedia page ID of 'Once Upon a Time in Hollywood'. Kept as a string
# because it is used both as the 'pageids' query parameter and as the key
# into the 'pages' mapping of the JSON response.
pageid = '56717294'
def download():
    """Fetch every revision timestamp of page ``pageid`` and cache it on disk.

    Pages through the MediaWiki ``action=query&prop=revisions`` API,
    collecting the ``timestamp`` of each revision, then pickles the resulting
    list of strings to ``revs_pkl``. Prints the 1-based request number before
    each HTTP request as a progress indicator.

    Returns:
        None. The result is written to ``revs_pkl``.

    Raises:
        RuntimeError: the API answered with an ``error`` object.
        KeyError: the response lacks the expected
            ``query -> pages -> pageid -> revisions`` structure.
        OSError: the HTTP request or the cache-file write failed
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    base_path = 'https://en.wikipedia.org/w/api.php?%s'
    params = {
        'action': 'query',
        'prop': 'revisions',
        'pageids': pageid,
        'rvprop': 'timestamp',
        'rvlimit': 'max',
        'format': 'json',
    }
    # Wikimedia's User-Agent policy asks clients to identify themselves;
    # generic library agents such as the urllib default may be rejected.
    headers = {
        'User-Agent': 'wiki-revisions-gist/1.0 '
                      '(https://gist.github.com/zacharysyoung/'
                      '43558ee676bb4f7d366c02840323f228) Python-urllib',
    }

    revs = []
    i = 1
    while True:
        print(i)
        req = request.Request(base_path % parse.urlencode(params), headers=headers)
        with request.urlopen(req) as f:
            d = json.loads(f.read())

        # Surface API-level failures explicitly instead of an opaque
        # KeyError on 'query' below.
        if 'error' in d:
            raise RuntimeError('MediaWiki API error: %r' % (d['error'],))

        revs.extend(x['timestamp'] for x in d['query']['pages'][pageid]['revisions'])

        if 'continue' not in d:
            break
        # Per the API's continuation protocol, merge *all* keys of the
        # 'continue' object (not only 'rvcontinue') into the next request.
        params.update(d['continue'])
        i += 1

    with open(revs_pkl, 'w+b') as f:
        f.write(pickle.dumps(revs))
def _load_revs():
    """Return the list of revision timestamps unpickled from ``revs_pkl``.

    Prints a progress message before opening the file.

    Raises:
        OSError: the cache file cannot be opened (e.g. it does not exist yet).
    """
    print('Trying to read %s' % revs_pkl)
    # Open read-only ('rb', not 'r+b'): the cache is never modified here, so
    # a read-only cache file must not be treated as missing.
    # NOTE: pickle is acceptable only because this file is produced locally
    # by download(); never load a revs.pkl obtained from an untrusted source.
    with open(revs_pkl, 'rb') as f:
        return pickle.loads(f.read())


# Use the cached timestamps when available; otherwise download them once
# and read the freshly written cache back.
try:
    revs = _load_revs()
except OSError:
    print('Couldn\'t find %s, downloading...' % revs_pkl)
    download()
    revs = _load_revs()

print('%d revisions' % len(revs))

# For an RFC 3339 datetime, the first 13 characters are the full date plus
# the hour, so this buckets the edits per hour.
counter = Counter(x[:13] for x in revs)

# Report only the hours that saw at least 10 edits, busiest first.
print('10 edits/hour, or more')
pprint.pprint([x for x in counter.most_common() if x[1] >= 10])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment