Created
December 18, 2015 22:18
-
-
Save DominicBM/fd7cf16cdbd1f9acb5ee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests, json, csv, urllib, argparse | |
## Command-line interface: the user names the starting Wikipedia category,
## e.g. '--c "History of the United States"'.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--c', dest='cat', metavar='CAT', action='store')
args = arg_parser.parse_args()
## The script maintains two CSVs: one with articles and their page views, and
## one that is a running list of subcategories still to be crawled. Both file
## names are derived from the user-supplied category. The set mirrors the
## category CSV in memory so duplicates are never re-queued.
category_list = set()
category = 'Category:' + str(args.cat)
category_list.add(category)
catscsv = str(args.cat).replace(' ', '_') + ' - cats.csv'
viewscsv = str(args.cat).replace(' ', '_') + ' - views.csv'
# Seed the category CSV with the starting category. The 'with' statement
# closes the file on exit; the original's explicit log.close() was redundant.
with open(catscsv, 'w') as log:
    writelog = csv.writer(log, delimiter='\t', quoting=csv.QUOTE_ALL)
    writelog.writerow((category.encode('utf-8'),))
# Read the category CSV, to add any new categories to the running list. | |
x = 0 | |
while x < len(category_list) : | |
with open(catscsv, 'r') as log : | |
readlog = list(csv.reader(log, delimiter= '\t', quoting=csv.QUOTE_ALL)) | |
category = readlog[x][0] | |
for row in readlog: | |
if row[0] not in category_list: | |
category_list.add(row[0]) | |
x = x + 1 | |
log.close() | |
geturl = 'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=' + category + '&cmlimit=500&format=json' | |
print geturl | |
parsed = json.loads(requests.get(geturl).text) | |
n = 0 | |
while n < len(parsed['query']['categorymembers']): | |
ns = parsed['query']['categorymembers'][n]['ns'] | |
if ns != 14: | |
print parsed['query']['categorymembers'][n]['title'] | |
title = parsed['query']['categorymembers'][n]['title'].replace(' ', '_') | |
print urllib.quote_plus(title.encode('utf-8')) | |
viewurl = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/' + urllib.quote_plus(title.encode('utf-8')) + '/daily/20151208/20151215' | |
viewed = json.loads(requests.get(viewurl).text) | |
try: | |
total_views = viewed['items'][0]['views'] + viewed['items'][1]['views'] + viewed['items'][2]['views'] + viewed['items'][3]['views'] + viewed['items'][4]['views'] + viewed['items'][5]['views'] + viewed['items'][6]['views'] | |
except IndexError: | |
total_views = 0 | |
except KeyError: | |
total_views = 'not found' | |
print total_views | |
with open(viewscsv, 'a') as log : | |
writelog = csv.writer(log, delimiter= '\t', quoting=csv.QUOTE_ALL) | |
writelog.writerow( (title.encode('utf-8'), total_views ) ) | |
log.close() | |
if ns == 14: | |
with open(catscsv, 'a') as log : | |
writelog = csv.writer(log, delimiter= '\t', quoting=csv.QUOTE_ALL) | |
if parsed['query']['categorymembers'][n]['title'].encode('utf-8') not in category_list: | |
writelog.writerow( (parsed['query']['categorymembers'][n]['title'].encode('utf-8'), ) ) | |
category_list.add( parsed['query']['categorymembers'][n]['title'].encode('utf-8') ) | |
log.close() | |
print 'CAT ' + parsed['query']['categorymembers'][n]['title'] | |
n = n + 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment