Created
March 27, 2014 19:54
-
-
Save jalperin/9816793 to your computer and use it in GitHub Desktop.
This is some hacked-together code that won't run on its own (as it's part of a larger script that uses Python pandas). But it shows the logic and queries that can be made against the CrossRef API to search for a DOI based on some metadata and then validate it using some sanity checks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json

# urllib compatibility shim: expose urlencode / urlopen / HTTPError /
# urlparse / parse_qs under one set of names on both Python 2 and Python 3.
try:  # Python 2
    import urllib, urllib2
    import urlparse
    urlencode = urllib.urlencode
    urlopen = urllib2.urlopen
    HTTPError = urllib2.HTTPError
    parse_qs = urlparse.parse_qs
    urlparse = urlparse.urlparse  # shadow the module name with the function
except ImportError:  # Python 3
    from urllib.parse import urlencode, urlparse, parse_qs
    from urllib.request import urlopen
    from urllib.error import HTTPError

from unidecode import unidecode
CROSSREF_API_DOI = 'http://search.crossref.org/dois?' | |
def verify_doi(doi, article):
    """Sanity-check that *doi* really belongs to *article*.

    Looks the DOI up in the CrossRef search API and then tries, in order:
    (1) the article id embedded in the DOI itself, (2) the article id
    ('pid' query parameter) in the URL the DOI resolves to, and (3) the
    first author's name from the COinS metadata returned by the API.

    Returns the DOI on success, False otherwise.
    """
    doi_query_url = CROSSREF_API_DOI + urlencode({'q': doi})
    response = json.loads(urlopen(doi_query_url).read())

    # Make sure the DOI is in the API.  Should always be true, since we
    # got the DOI from the API in the first place.
    if not response:
        print('%s, doi not found in API' % doi)
        return False

    found_doi = response[0]['doi']

    # Often the article ID is part of the DOI; if that's the case, we
    # found the right DOI.
    if article.id.lower() in found_doi.lower():
        return doi

    # Most SciELO DOIs resolve to scielo.* URLs that carry the article id
    # in the 'pid' query parameter -- check the resolved URL.
    try:
        resolved_url = urlopen(found_doi).geturl()
        qs = urlparse(resolved_url).query  # grab the query string
        pids = parse_qs(qs).get('pid') if qs else None
        if pids:
            if article.id.lower() == pids[0].lower():
                return doi
            # pid present but different: fall through to the metadata check
        elif 'scielo' in resolved_url:
            # A SciELO URL without our id: definitely the wrong DOI.
            return False
    except HTTPError:
        print('HTTPError, try http://dx.doi.org/%s' % found_doi)

    # Fall back to comparing author metadata from the COinS payload.
    coins = parse_qs(response[0]['coins'])
    authors = article.authors.split(';')
    if authors and authors[0] and 'rft.aulast' in coins:
        first_author = authors[0].split(',')
        # Try comparing last names only.
        if first_author[0].lower() == coins['rft.aulast'][0].lower():
            return doi
        if 'rft.aufirst' in coins:
            # Try comparing first and last together, to handle cases where
            # last names don't get split correctly.
            first_author_article = ' '.join(first_author[::-1]).replace(' ', '').lower()
            first_author_response = (coins['rft.aufirst'][0] + coins['rft.aulast'][0]).replace(' ', '').lower()
            # FIXME: this check could be improved:
            #  - strip accents (unidecode is already imported at file level)
            #  - similarity comparison (allow a few characters difference)
            if first_author_article == first_author_response:
                return doi

    # Did not match on anything -- likely the wrong DOI.
    return False
def doi_query1(article):
    """Build a CrossRef query string from the article's title, ISSN and year.

    Returns the urlencoded query; urlencode percent-encodes non-ASCII text
    as UTF-8, so no manual .encode() is needed (the old .encode('utf8')
    calls broke string concatenation on Python 3).
    """
    data = {
        # Trailing space kept for parity with the author-based query.
        'q': '%s %s ' % (article.original_title, article.any_issn),
        # First four characters of the date string, i.e. the year.
        'year': str(article.publication_date)[0:4],
    }
    return urlencode(data)
def doi_query2(article):
    """Build a CrossRef query string from the article's authors, ISSN and year.

    Fallback query used when the title-based query (doi_query1) finds
    nothing verifiable.  urlencode handles UTF-8 percent-encoding itself,
    so the old Python 2 .encode('utf8') calls were dropped.
    """
    data = {
        'q': '%s %s ' % (article.authors, article.any_issn),
        # First four characters of the date string, i.e. the year.
        'year': str(article.publication_date)[0:4],
    }
    return urlencode(data)
def search_doi(article):
    """Try to find and verify a DOI for *article* via the CrossRef API.

    Tries a title-based query first, then an author-based one.  On a
    verified hit, stores the DOI on the article (item assignment -- the
    article is a pandas row) and returns True; otherwise returns False.
    """
    # Do nothing if we already have a DOI.
    if isinstance(article.doi, str):
        return True

    # Same fetch/verify/store logic for both query strategies.
    for build_query in (doi_query1, doi_query2):
        response = urlopen(CROSSREF_API_DOI + build_query(article)).read()
        results = json.loads(response)
        if not results:
            # Empty result list: the old code would IndexError here.
            continue
        response_doi = results[0]['doi'].replace('http://dx.doi.org/', '').lower()
        if verify_doi(response_doi, article):
            article['doi'] = response_doi
            print('found')
            return True

    return False
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment