Created
June 26, 2014 20:45
-
-
Save jalperin/2620c7c5a80a3e21a5e5 to your computer and use it in GitHub Desktop.
Fetch all works for a CrossRef DOI prefix from the CrossRef API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""get_works.py: Fetch all the works from CrossRef API by DOI prefix.""" | |
__author__ = "Juan Pablo Alperin (@juancommander)" | |
import re | |
import urllib, urllib2 | |
import simplejson as json | |
from time import sleep | |
import datetime | |
from optparse import OptionParser | |
parser = OptionParser() | |
parser.add_option("-f", "--from", dest="f", help="specify the start date YYYY-DD") | |
parser.add_option("-D", "--dir", dest="out_dir", default=".", help="specify the directory for output") | |
(options, args) = parser.parse_args() | |
CROSSREF_PREFIX_API = 'http://api.crossref.org/prefixes/%s/works' | |
prefix = args[0] | |
if not re.match('^10\..*', prefix): | |
print "Invalid prefix: %s" % prefix | |
exit(1) | |
start_date = options.f | |
if start_date and not re.match('^\d\d\d\d\-.*', start_date): | |
print "Invalid date, use YYYY-DD: %s" % start_date | |
exit(1) | |
out_dir = options.out_dir.strip('/') | |
tries = 0 | |
offset = 0 | |
rows = 50 | |
url = CROSSREF_PREFIX_API % prefix | |
data = {} | |
if start_date: | |
data['filter'] = "from-pub-date:%s" % start_date | |
lines = [] | |
while True: | |
try: | |
data['rows'] = rows | |
data['offset'] = offset | |
content = json.load(urllib2.urlopen(url + '?' + urllib.urlencode(data))) | |
total = int(content['message']['total-results']) | |
if (offset >= total): | |
break | |
offset += rows | |
items = content['message']['items'] | |
for item in items: | |
doi = item['DOI'] | |
try: | |
title = item['title'][0] | |
except IndexError: | |
title = 'untitled' | |
# grabbing only 3 pieces of metadata for printing DOI, pub_date, and title | |
date = "-".join([str(x).zfill(2) for x in item['indexed']['date-parts'][0]]) | |
lines.append(' '.join([doi, date, title, '\n'])) | |
except Exception, e: | |
print e | |
# 3 exceptions in a row and we give up | |
if tries > 3: | |
print "failed to fetch URL after 3 tries" | |
break | |
tries += 1 | |
# just pause and the loop will try again | |
sleep(3) | |
pass | |
# this prints it out in the format DOI pub_date title | |
if len(lines): | |
with open('%s/%s_%s.txt' % (out_dir, prefix, datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S.%f")), 'w') as ofile: | |
ofile.writelines([line.encode('utf8', 'ignore') for line in lines]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment