@jalperin
Created March 27, 2014 19:54
This is some hacked-together code that won't run on its own, since it is part of a larger script that uses the Python pandas library. It shows the logic and the queries that can be made against the Crossref API to search for a DOI based on an article's metadata, and then to validate the result with some sanity checks.
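For orientation, this is roughly the kind of request the functions below build against the (legacy) search.crossref.org/dois endpoint; the query values here are made up, and the comment only names the response fields the script actually reads.

import urllib

# assumed example values: a title, an ISSN, and a publication year
params = [('q', 'Some article title 0100-1965'), ('year', '2005')]
print 'http://search.crossref.org/dois?' + urllib.urlencode(params)
# -> http://search.crossref.org/dois?q=Some+article+title+0100-1965&year=2005
# The script reads the first result's 'doi' and 'coins' fields from the JSON reply
# (the 'coins' value carries COinS metadata such as rft.aulast / rft.aufirst).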
import json
import urllib, urllib2
import urlparse
from unidecode import unidecode
CROSSREF_API_DOI = 'http://search.crossref.org/dois?'

def verify_doi(doi, article):
    doi_query_url = CROSSREF_API_DOI + urllib.urlencode({'q': doi})
    response = json.loads(urllib2.urlopen(doi_query_url).read())

    # make sure the DOI is in the API
    # should always be true, since we got the DOI from the API
    if not response or len(response) == 0:
        print '%s, doi not found in API' % doi
        return False

    found_doi = response[0]['doi']

    # often the article ID is part of the DOI;
    # if that's the case, we found the right DOI
    if article.id.lower() in found_doi.lower():
        # print 'id is part of DOI'
        return doi

    # most SciELO DOIs resolve to scielo.
    # check the resolved URL
    try:
        resolved_url = urllib2.urlopen(found_doi).geturl()
        qs = urlparse.urlparse(resolved_url).query  # grab the query string
        if qs and 'pid' in qs:
            if article.id.lower() == urlparse.parse_qs(qs)['pid'][0].lower():
                # print 'id is part of the resolved URL'
                return doi
        elif 'scielo' in resolved_url:
            # print 'id not found in URL, but it is a scielo URL'
            return False
    except urllib2.HTTPError:
        print 'HTTPError, try http://dx.doi.org/%s' % found_doi

    # check some metadata from the COinS field returned by the API
    coins = urlparse.parse_qs(response[0]['coins'])
    authors = article.authors.split(';')
    if len(authors) and 'rft.aulast' in coins:
        first_author = authors[0].split(',')
        # try comparing last names only
        if first_author[0].lower() == coins['rft.aulast'][0].lower():
            # print 'last name of first author is same on both'
            return doi
        if 'rft.aufirst' in coins:
            # try comparing first and last together
            # to handle cases where last names don't get split correctly
            first_author_article = ' '.join(first_author[::-1]).replace(' ', '').lower()
            first_author_response = (coins['rft.aufirst'][0] + coins['rft.aulast'][0]).replace(' ', '').lower()
            # FIXME: this check could be improved:
            # - strip accents
            # - similarity comparison (allow a few characters difference)
            if first_author_article == first_author_response:
                return doi

    # print 'did not match on anything. Likely wrong DOI'
    return False

def doi_query1(article):
    # first query: title + ISSN + publication year
    data = {}
    data['q'] = article.original_title.encode('utf8') + ' '
    data['q'] += str(article.any_issn).encode('utf8') + ' '
    # data['q'] += ' ' + article.html_url.encode('utf8') + ' '
    data['year'] = str(article.publication_date)[0:4]
    return urllib.urlencode(data)

def doi_query2(article):
    # fallback query: authors + ISSN + publication year
    data = {}
    data['q'] = article.authors.encode('utf8') + ' '
    data['q'] += str(article.any_issn).encode('utf8') + ' '
    data['year'] = str(article.publication_date)[0:4]
    return urllib.urlencode(data)

def search_doi(article):
    # do nothing if we already have a DOI
    if type(article.doi) == str:
        return True

    # first attempt: search by title
    response = urllib2.urlopen(CROSSREF_API_DOI + doi_query1(article)).read()
    response_doi = json.loads(response)[0]['doi'].replace('http://dx.doi.org/', '').lower()
    if verify_doi(response_doi, article):
        article['doi'] = response_doi
        print 'found'
        return True

    # second attempt: search by authors
    response = urllib2.urlopen(CROSSREF_API_DOI + doi_query2(article)).read()
    response_doi = json.loads(response)[0]['doi'].replace('http://dx.doi.org/', '').lower()
    if verify_doi(response_doi, article):
        article['doi'] = response_doi
        print 'found'
        return True

    return False
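
Since these functions come from a larger pandas-based script, the article object is assumed to be a row-like record with id, doi, original_title, authors, any_issn, and publication_date fields. A minimal, hypothetical sketch of how search_doi might be driven from a DataFrame (the column names and values are assumptions, not part of the original script):

# -*- coding: utf-8 -*-
import pandas as pd

# hypothetical data; the real script's columns and values are not shown in the gist
articles = pd.DataFrame([{
    'id': 'S0100-19652005000200001',    # a SciELO-style pid (assumed)
    'doi': None,                         # unknown, to be filled in
    'original_title': u'Some article title',
    'authors': u'Lastname, Firstname; Other, Author',
    'any_issn': '0100-1965',
    'publication_date': '2005-08-01',
}])

for idx, article in articles.iterrows():
    # each row behaves like the `article` object above: it supports both
    # attribute access (article.id) and item access (article['doi'])
    if search_doi(article):
        # copy the DOI found on the row back into the DataFrame
        articles.loc[idx, 'doi'] = article['doi']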