Created
March 27, 2014 19:54
-
-
Save jalperin/9816793 to your computer and use it in GitHub Desktop.
This is some hacked-together code that won't run on its own (as it's part of a larger script that uses Python pandas). But it shows the logic and queries that can be made against the CrossRef API to search for a DOI based on some metadata and then validate it using some sanity checks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json

# urllib compatibility shim: expose urlencode / urlopen / HTTPError /
# urlparse / parse_qs under one set of names on both Python 2 and Python 3.
try:  # Python 2
    import urllib, urllib2
    import urlparse
    urlencode = urllib.urlencode
    urlopen = urllib2.urlopen
    HTTPError = urllib2.HTTPError
    parse_qs = urlparse.parse_qs
    urlparse = urlparse.urlparse  # shadow the module name with the function
except ImportError:  # Python 3
    from urllib.parse import urlencode, urlparse, parse_qs
    from urllib.request import urlopen
    from urllib.error import HTTPError

from unidecode import unidecode
CROSSREF_API_DOI = 'http://search.crossref.org/dois?' | |
def verify_doi(doi, article):
    """Sanity-check that *doi* really belongs to *article*.

    Looks the DOI up in the CrossRef search API and then tries, in order:
    (1) the article id embedded in the DOI itself, (2) the article id
    ('pid' query parameter) in the URL the DOI resolves to, and (3) the
    first author's name from the COinS metadata returned by the API.

    Returns the DOI on success, False otherwise.
    """
    doi_query_url = CROSSREF_API_DOI + urlencode({'q': doi})
    response = json.loads(urlopen(doi_query_url).read())

    # Make sure the DOI is in the API.  Should always be true, since we
    # got the DOI from the API in the first place.
    if not response:
        print('%s, doi not found in API' % doi)
        return False

    found_doi = response[0]['doi']

    # Often the article ID is part of the DOI; if that's the case, we
    # found the right DOI.
    if article.id.lower() in found_doi.lower():
        return doi

    # Most SciELO DOIs resolve to scielo.* URLs that carry the article id
    # in the 'pid' query parameter -- check the resolved URL.
    try:
        resolved_url = urlopen(found_doi).geturl()
        qs = urlparse(resolved_url).query  # grab the query string
        pids = parse_qs(qs).get('pid') if qs else None
        if pids:
            if article.id.lower() == pids[0].lower():
                return doi
            # pid present but different: fall through to the metadata check
        elif 'scielo' in resolved_url:
            # A SciELO URL without our id: definitely the wrong DOI.
            return False
    except HTTPError:
        print('HTTPError, try http://dx.doi.org/%s' % found_doi)

    # Fall back to comparing author metadata from the COinS payload.
    coins = parse_qs(response[0]['coins'])
    authors = article.authors.split(';')
    if authors and authors[0] and 'rft.aulast' in coins:
        first_author = authors[0].split(',')
        # Try comparing last names only.
        if first_author[0].lower() == coins['rft.aulast'][0].lower():
            return doi
        if 'rft.aufirst' in coins:
            # Try comparing first and last together, to handle cases where
            # last names don't get split correctly.
            first_author_article = ' '.join(first_author[::-1]).replace(' ', '').lower()
            first_author_response = (coins['rft.aufirst'][0] + coins['rft.aulast'][0]).replace(' ', '').lower()
            # FIXME: this check could be improved:
            #  - strip accents (unidecode is already imported at file level)
            #  - similarity comparison (allow a few characters difference)
            if first_author_article == first_author_response:
                return doi

    # Did not match on anything -- likely the wrong DOI.
    return False
def doi_query1(article):
    """Build a CrossRef query string from the article's title, ISSN and year.

    Returns the urlencoded query; urlencode percent-encodes non-ASCII text
    as UTF-8, so no manual .encode() is needed (the old .encode('utf8')
    calls broke string concatenation on Python 3).
    """
    data = {
        # Trailing space kept for parity with the author-based query.
        'q': '%s %s ' % (article.original_title, article.any_issn),
        # First four characters of the date string, i.e. the year.
        'year': str(article.publication_date)[0:4],
    }
    return urlencode(data)
def doi_query2(article):
    """Build a CrossRef query string from the article's authors, ISSN and year.

    Fallback query used when the title-based query (doi_query1) finds
    nothing verifiable.  urlencode handles UTF-8 percent-encoding itself,
    so the old Python 2 .encode('utf8') calls were dropped.
    """
    data = {
        'q': '%s %s ' % (article.authors, article.any_issn),
        # First four characters of the date string, i.e. the year.
        'year': str(article.publication_date)[0:4],
    }
    return urlencode(data)
def search_doi(article):
    """Try to find and verify a DOI for *article* via the CrossRef API.

    Tries a title-based query first, then an author-based one.  On a
    verified hit, stores the DOI on the article (item assignment -- the
    article is a pandas row) and returns True; otherwise returns False.
    """
    # Do nothing if we already have a DOI.
    if isinstance(article.doi, str):
        return True

    # Same fetch/verify/store logic for both query strategies.
    for build_query in (doi_query1, doi_query2):
        response = urlopen(CROSSREF_API_DOI + build_query(article)).read()
        results = json.loads(response)
        if not results:
            # Empty result list: the old code would IndexError here.
            continue
        response_doi = results[0]['doi'].replace('http://dx.doi.org/', '').lower()
        if verify_doi(response_doi, article):
            article['doi'] = response_doi
            print('found')
            return True

    return False
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment