import requests, json, csv, urllib, argparse
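## Note: this is Python 2 code (print statements, byte-string CSV writing,
## urllib.quote_plus at the top level of urllib).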
## This is what allows the user to pass the initial Wikipedia category as an
## argument, such as '--c "History of the United States"'.
parser = argparse.ArgumentParser()
parser.add_argument('--c', dest='cat', metavar='CAT', action='store',
                    required=True)
args = parser.parse_args()
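## Example invocation (assuming the script is saved as wiki_cat_views.py):
##   python wiki_cat_views.py --c "History of the United States"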
## The script will create two CSVs: one with the articles and their page
## views, and another holding a running list of subcategories, so that the
## script can keep working down the list and take each new category in turn.
## Here, the names of the CSVs are generated from the initial category given
## by the user, and a set is created, seeded with that category, to ensure
## duplicates are not added.
category_list = set()
category = 'Category:' + str(args.cat)
category_list.add(category)
catscsv = str(args.cat).replace(' ', '_') + ' - cats.csv'
viewscsv = str(args.cat).replace(' ', '_') + ' - views.csv'
with open(catscsv, 'w') as log:
    writelog = csv.writer(log, delimiter='\t', quoting=csv.QUOTE_ALL)
    writelog.writerow((category.encode('utf-8'),))
## Re-read the category CSV on every pass, adding any new categories to the
## running list; the x-th row is the category to query next.
x = 0
while x < len(category_list):
    with open(catscsv, 'r') as log:
        readlog = list(csv.reader(log, delimiter='\t', quoting=csv.QUOTE_ALL))
        category = readlog[x][0]
        for row in readlog:
            if row[0] not in category_list:
                category_list.add(row[0])
    x = x + 1
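    ## A single categorymembers query returns at most 500 results (the
    ## cmlimit maximum for ordinary clients). Categories with more members
    ## would need cmcontinue paging, which this script does not attempt.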
    geturl = ('https://en.wikipedia.org/w/api.php?action=query&list=categorymembers'
              '&cmtitle=' + urllib.quote(category) + '&cmlimit=500&format=json')
    print geturl
    parsed = json.loads(requests.get(geturl).text)
    for member in parsed['query']['categorymembers']:
        ns = member['ns']
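        ## ns 14 is MediaWiki's Category: namespace; members in any other
        ## namespace are treated as articles whose views should be logged.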
        if ns != 14:
            print member['title']
            title = member['title'].replace(' ', '_')
            print urllib.quote_plus(title.encode('utf-8'))
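            ## The Wikimedia REST pageviews endpoint takes the path segments
            ## /{project}/{access}/{agent}/{article}/{granularity}/{start}/{end};
            ## here: daily views by ordinary users (agent 'user', i.e. no
            ## bots/spiders) on en.wikipedia over the hard-coded range
            ## 20151208-20151215.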
            viewurl = ('https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
                       'en.wikipedia/all-access/user/' + urllib.quote_plus(title.encode('utf-8')) +
                       '/daily/20151208/20151215')
            viewed = json.loads(requests.get(viewurl).text)
            try:
                ## Sum whatever daily counts the API returned for the range.
                total_views = sum(item['views'] for item in viewed['items'])
            except KeyError:
                ## No 'items' key in the response: no pageview data exists.
                total_views = 'not found'
            print total_views
            with open(viewscsv, 'a') as log:
                writelog = csv.writer(log, delimiter='\t', quoting=csv.QUOTE_ALL)
                writelog.writerow((title.encode('utf-8'), total_views))
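        ## Subcategories (ns 14) are appended to the category CSV and the set,
        ## so a later pass of the outer loop will crawl them in turn.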
        if ns == 14:
            with open(catscsv, 'a') as log:
                writelog = csv.writer(log, delimiter='\t', quoting=csv.QUOTE_ALL)
                if member['title'].encode('utf-8') not in category_list:
                    writelog.writerow((member['title'].encode('utf-8'),))
                    category_list.add(member['title'].encode('utf-8'))
            print 'CAT ' + member['title']