hornc · June 6, 2018 21:06 · tfmorris · Jun 6, 2018
diff --git a/importauthormatch.py b/importauthormatch.py
 from olclient.openlibrary import OpenLibrary
 import re
 import web
 import urllib
 import json
 ol = OpenLibrary()

 # testing OL code to determine existing author matches
 # taking old code from
 # https://github.com/internetarchive/openlibrary/blob/f8092840a77c7479a352fd83fd068d340f97d3e3/openlibrary/catalog/add_book/load_book.py
 # and
 # https://github.com/internetarchive/openlibrary/blob/c4d877ee6410df6f70ab45718baebe52fdf366ba/openlibrary/catalog/importer/load.py


 def do_flip(author):
     """Flip a "Last, First" author name to "First Last", in place.

     The record is only changed when it has a 'personal_name' equal to its
     'name', the name contains exactly one comma (followed by a space) and
     it carries no "i.e." / "i. e." marker.

     author: author record (dict); 'name' and 'personal_name' are rewritten
     returns: None, the dict is modified in place
     """
     if 'personal_name' not in author:
         return
     name = author['name']
     if author['personal_name'] != name:
         return
     first_comma = name.find(', ')
     if first_comma == -1:
         return
     # A second comma means trailing data, e.g: Harper, John Murdoch, 1845-
     if name.find(',', first_comma + 1) != -1:
         return
     if 'i.e.' in name or 'i. e.' in name:
         return
     flipped = flip_name(name)
     author['name'] = flipped
     author['personal_name'] = flipped


 # Matches a trailing dot that follows at least two non-space characters,
 # i.e. "Smith, John." but not the initial in "Smith, J."
 _re_end_dot = re.compile(r'[^ ][^ ]\.$', re.UNICODE)


 def flip_name(name):
     """Return a "Last, First" name rewritten as "First Last".

     This module called flip_name() without defining or importing it, which
     raised NameError. NOTE(review): this helper is modelled on flip_name in
     the Open Library catalog utils -- confirm against upstream.

     name: string
     returns: the flipped name, or the (dot-stripped) input when it contains
              no ', ' separator
     """
     if _re_end_dot.search(name):
         name = name[:-1]
     if ', ' not in name:
         return name
     # Split on the last ', ' so everything before it is treated as the surname part.
     last, _, first = name.rpartition(', ')
     return first + ' ' + last

 # A standalone four-digit number, used to pull a year out of a free-form date.
 re_year = re.compile(r'\b(\d{4})\b')

 # taken from: https://github.com/internetarchive/openlibrary/blob/c4d877ee6410df6f70ab45718baebe52fdf366ba/openlibrary/catalog/utils/__init__.py
 def author_dates_match(a, b):
     """Check whether the dates of two author records are compatible.

     For each of 'birth_date', 'death_date' and 'date' that is present and
     not None in both records, the values must be equal, or one must be a
     prefix of the other, or the first four-digit year found in each must
     be the same.

     a, b: author records (dict)
     returns: False as soon as one shared date field disagrees, True otherwise
     """
     for field in ('birth_date', 'death_date', 'date'):
         # Only fields both records actually carry can disagree.
         if any(field not in rec or rec[field] is None for rec in (a, b)):
             continue
         left, right = a[field], b[field]
         if left == right or left.startswith(right) or right.startswith(left):
             continue
         # Fall back to comparing the first year mentioned on each side.
         year_a = re_year.search(left)
         year_b = re_year.search(right) if year_a else None
         if not (year_a and year_b and year_a.group(1) == year_b.group(1)):
             return False
     return True

 # taken from OL catalog/utils
 # rec: {'key': '/author/OL1234A'}
 # returns: 1234
 def key_int(rec):
     """Return the numeric part of a record's key as an int.

     rec: record with a 'key' entry, e.g. {'key': '/a/OL1234A'}
     returns: the number extracted by web.numify, e.g. 1234
     """
     digits = web.numify(rec['key'])
     return int(digits)


 # This is my (new) attempt to sort matched records, uses ad-hoc weightings which will need to be tuned experimentally
 def match_sort(rec):
     """Score a candidate author record; the lowest score is the best match.

     The weightings are ad hoc and will need to be tuned experimentally.

     rec: author record (dict) with 'name' and 'key'; 'work_count' is optional
     returns: float sort score
     """
     name_penalty = len(rec['name']) * 10    # Shorter names are better
     olid_penalty = key_int(rec) / 10000.0   # Lower OLID numbers are slightly better
     # A record without 'work_count' (or with None) is scored as having
     # no works instead of raising KeyError / TypeError.
     work_bonus = (rec.get('work_count') or 0) * 5   # More works is better
     return name_penalty + olid_penalty - work_bonus

 # taken from catalog/importer/load.py
 def pick_from_matches(author, match):
     """Choose one record from several matching author records.

     Candidates carrying the same kind of date fields as `author` are
     preferred: both 'birth_date' and 'death_date' when `author` has both,
     otherwise 'date' when `author` has one. If that leaves nothing, every
     record in `match` is considered.

     author: the author being matched (dict)
     match: list of candidate author records
     returns: the sole candidate, or the one with the lowest match_sort score
     """
     if 'birth_date' in author and 'death_date' in author:
         wanted = ('birth_date', 'death_date')
     elif 'date' in author:
         wanted = ('date',)
     else:
         wanted = ()
     candidates = [m for m in match if all(f in m for f in wanted)] if wanted else []
     if not candidates:
         candidates = match
     if len(candidates) == 1:
         return candidates[0]
     #return min(candidates, key=key_int)
     return min(candidates, key=match_sort)

 # name: string
 # returns: list of authors [dict]
 def find_author(name, api='autocomplete'):
     """Search Open Library for author records matching `name`.

     name: string
     api: "autocomplete" searches through ol.Author.search() (ol-client uses
          /authors/_autocomplete?q=%s&limit=%s); any other value is treated
          as "query" and goes to /query.json
     returns: list of authors [dict]
     """
     if api == 'autocomplete':
         return ol.Author.search(name, limit=100)
     # api == 'query'
     q = {"type": "/type/author", "name": name, "limit": 100}
     # Hand the query to the session as a parameter so it does the URL
     # encoding; this avoids urllib.urlencode, which only exists on Python 2.
     # NOTE(review): assumes ol.session is a requests-style session -- confirm.
     response = ol.session.get(ol.base_url + '/query.json',
                               params={'query': json.dumps(q)})
     return response.json()


 # author: author object (dict)
 # api: which API to query, (string) "autocomplete" | "query"
 # full_load: whether to fully load each item in the results list (currently makes no difference to match quality)
 # returns: a single author object, determined to be the 'best' match, if one is found, None otherwise
 # previously named find_entity()
 def import_author_match(author, api='autocomplete', full_load=False):
     """Find the best existing author record for `author`, if any.

     author: author object (dict); needs 'name', may carry 'entity_type',
             'birth_date', 'death_date' and 'date'
     api: which API to query, (string) "autocomplete" | "query"
     full_load: whether to fully load each item in the results list
                (currently makes no difference to match quality)
     returns: a single author object, determined to be the 'best' match,
              if one is found, None otherwise
     """
     name = author['name']
     things = find_author(name, api=api)
     # things is a [dict] atm

     et = author.get('entity_type')  # NOTE(review): this path is untested
     if et and et != 'person':
         # Non-person entities: take the first search hit as-is.
         if not things:
             return None
         db_entity = things[0]
         assert db_entity['type']['key'] == '/type/author'
         return db_entity
     if ', ' in name:  # NOTE(review): this path is untested
         # Also search for the "First Last" form, through the same API as
         # the primary lookup (previously this always used the default API).
         things += find_author(flip_name(name), api=api)
     match = []
     seen = set()
     if full_load:  # load full data for each thing
         things = [ol.Author.get(t['key'].replace('/authors/', '')).json() for t in things]
     for a in things:
         try:
             key = a['key']  # for dicts
         except TypeError:
             key = a.olid  # for ol.Author objects
         if key in seen:
             continue
         seen.add(key)
         if api == 'autocomplete' and not full_load:
             # query api results do not give type as it is already filtered on type=author
             # for that matter, so are autocomplete results
             # this check only makes sense if we have loaded items to check whether they are currently redirects or deletes
             assert a['type'] == 'author'
         # A dated author is never matched to a record without a birth date.
         if 'birth_date' in author and 'birth_date' not in a:
             continue
         # The reverse check is disabled to allow test_name_only_autocomplete() to pass
         #if 'birth_date' not in author and 'birth_date' in a:
         #    continue
         if not author_dates_match(author, a):
             continue
         match.append(a)
     if not match:
         return None
     if len(match) == 1:
         return match[0]
     try:
         return pick_from_matches(author, match)
     except ValueError:
         # Dump the inputs for debugging, then let the error propagate.
         # Single-argument print() behaves the same on Python 2 and 3.
         print('author: %s' % (author,))
         print('match: %s' % (match,))
         raise
diff --git a/test_importauthormatch.py b/test_importauthormatch.py
 # -*- coding: utf-8 -*-
 import importauthormatch as iam

 def test_find_author_autocomplete():
     """The autocomplete API is expected to return 14 records for 'Mark Twain'."""
     results = iam.find_author('Mark Twain', api='autocomplete')
     assert len(results) == 14
     first = results[0]
     print(first['key'])
     # autocomplete has more results, and more data

 def test_find_author_query():
     """The query API is expected to return 3 records for 'Mark Twain'."""
     results = iam.find_author('Mark Twain', api='query')
     assert len(results) == 3
     first = results[0]
     print(first['key'])
     # query has less results, and only returns fields in the query
     # both return a list of dicts

 def test_basic():
     """Full birth and death dates resolve to the expected Mark Twain record."""
     twain = {
         'name': 'Mark Twain',
         'birth_date': '30 November 1835',
         'death_date': '21 April 1910',
     }
     found = iam.import_author_match(twain, api='autocomplete')
     print(found)
     assert found['key'] == '/authors/OL18319A'

 def test_basic_simpledates():
     """Year-only birth and death dates resolve to the same record."""
     twain = {'name': 'Mark Twain', 'birth_date': '1835', 'death_date': '1910'}
     found = iam.import_author_match(twain, api='autocomplete')
     assert found['key'] == '/authors/OL18319A'

 def test_basic_onedate():
     """A birth year alone is enough to resolve to the same record."""
     twain = {'name': 'Mark Twain', 'birth_date': '1835'}
     found = iam.import_author_match(twain, api='autocomplete')
     assert found['key'] == '/authors/OL18319A'

 def test_name_only_autocomplete():
     """A name with no dates still resolves to the expected record."""
     twain = {'name': 'Mark Twain'}
     found = iam.import_author_match(twain, api='autocomplete', full_load=False)
     print(found)
     print("Got: %s" % found['name'])
     assert found['key'] == '/authors/OL18319A'
     # The previous ImportBot code deliberately refused to match a target without
     # dates to a record that has them, which seems strange.
     # It has been changed here so that the expected match is found.

 # Query API is not good at matching by name only, deprecate it.
 def xtest_name_only_query():
     """Name-only match through the query API (the 'x' prefix keeps pytest from collecting it)."""
     twain = {'name': 'Mark Twain'}
     found = iam.import_author_match(twain, api='query')
     print(found)
     print("Got: %s" % found)
     assert found['key'] == '/authors/OL18319A'

 def test_not_found():
     """A nonsense name yields None from both APIs."""
     target = {'name': 'QwertysNotAname'}
     via_autocomplete = iam.import_author_match(target, api='autocomplete')
     via_query = iam.import_author_match(target, api='query')
     assert via_autocomplete is None
     assert via_query is None

 def test_range_of_authors():
     """Print the autocomplete match for a range of names (smoke test, no assertions)."""
     names = [
         'Plato',
         'Sappho',
         'Jean Paul Sartre',
         'Michel Foucault',
         'Enid Blyton',
         'J. K. Rowling',
         'Joanne K. Rowling',
     ]
     for author_name in names:
         found = iam.import_author_match({'name': author_name}, api='autocomplete')
         #b = iam.import_author_match(target, api='query')
         print("%s:" % author_name)
         print(" Autocomplete: %s" % found)
         #print(" Query: %s" % b)

 # Autocomplete tends to match longer names with lower ids first...
 def test_long_name_matching():
     """A short name and a longer name sharing its prefix each resolve to their own record."""
     plato = iam.import_author_match({'name': 'Plato'}, api='autocomplete')
     assert plato['key'] == '/authors/OL189658A' # Classical author Plato

     platonov = iam.import_author_match({'name': 'Platonov'}, api='autocomplete')
     assert platonov['key'] == '/authors/OL30647A' # Russian author, Andreĭ Platonovich Platonov, 121 Works
	from olclient.openlibrary import OpenLibrary
	import re
	import web
	import urllib
	import json
	ol = OpenLibrary()

	# testing OL code to determine existing author matches
	# taking old code from
	# https://github.com/internetarchive/openlibrary/blob/f8092840a77c7479a352fd83fd068d340f97d3e3/openlibrary/catalog/add_book/load_book.py
	# and
	# https://github.com/internetarchive/openlibrary/blob/c4d877ee6410df6f70ab45718baebe52fdf366ba/openlibrary/catalog/importer/load.py


	def do_flip(author):
	    """Flip a "Last, First" author name to "First Last", in place.

	    The record is only changed when it has a 'personal_name' equal to its
	    'name', the name contains exactly one comma (followed by a space) and
	    it carries no "i.e." / "i. e." marker.

	    author: author record (dict); 'name' and 'personal_name' are rewritten
	    returns: None, the dict is modified in place
	    """
	    if 'personal_name' not in author:
	        return
	    name = author['name']
	    if author['personal_name'] != name:
	        return
	    first_comma = name.find(', ')
	    if first_comma == -1:
	        return
	    # A second comma means trailing data, e.g: Harper, John Murdoch, 1845-
	    if name.find(',', first_comma + 1) != -1:
	        return
	    if 'i.e.' in name or 'i. e.' in name:
	        return
	    flipped = flip_name(name)
	    author['name'] = flipped
	    author['personal_name'] = flipped


	# Matches a trailing dot that follows at least two non-space characters,
	# i.e. "Smith, John." but not the initial in "Smith, J."
	_re_end_dot = re.compile(r'[^ ][^ ]\.$', re.UNICODE)


	def flip_name(name):
	    """Return a "Last, First" name rewritten as "First Last".

	    This module called flip_name() without defining or importing it, which
	    raised NameError. NOTE(review): this helper is modelled on flip_name in
	    the Open Library catalog utils -- confirm against upstream.

	    name: string
	    returns: the flipped name, or the (dot-stripped) input when it contains
	             no ', ' separator
	    """
	    if _re_end_dot.search(name):
	        name = name[:-1]
	    if ', ' not in name:
	        return name
	    # Split on the last ', ' so everything before it is treated as the surname part.
	    last, _, first = name.rpartition(', ')
	    return first + ' ' + last

	# A standalone four-digit number, used to pull a year out of a free-form date.
	re_year = re.compile(r'\b(\d{4})\b')

	# taken from: https://github.com/internetarchive/openlibrary/blob/c4d877ee6410df6f70ab45718baebe52fdf366ba/openlibrary/catalog/utils/__init__.py
	def author_dates_match(a, b):
	    """Check whether the dates of two author records are compatible.

	    For each of 'birth_date', 'death_date' and 'date' that is present and
	    not None in both records, the values must be equal, or one must be a
	    prefix of the other, or the first four-digit year found in each must
	    be the same.

	    a, b: author records (dict)
	    returns: False as soon as one shared date field disagrees, True otherwise
	    """
	    for k in ['birth_date', 'death_date', 'date']:
	        # Only fields both records actually carry can disagree.
	        if k not in a or a[k] is None or k not in b or b[k] is None:
	            continue
	        if a[k] == b[k] or a[k].startswith(b[k]) or b[k].startswith(a[k]):
	            continue
	        # Fall back to comparing the first year mentioned on each side.
	        m1 = re_year.search(a[k])
	        if not m1:
	            return False
	        m2 = re_year.search(b[k])
	        if m2 and m1.group(1) == m2.group(1):
	            continue
	        return False
	    return True

	# taken from OL catalog/utils
	# rec: {'key': '/author/OL1234A'}
	# returns: 1234
	def key_int(rec):
	    """Return the numeric part of a record's key as an int.

	    rec: record with a 'key' entry, e.g. {'key': '/a/OL1234A'}
	    returns: the number extracted by web.numify, e.g. 1234
	    """
	    return int(web.numify(rec['key']))


	# This is my (new) attempt to sort matched records, uses ad-hoc weightings which will need to be tuned experimentally
	def match_sort(rec):
	    """Score a candidate author record; the lowest score is the best match.

	    The weightings are ad hoc and will need to be tuned experimentally.

	    rec: author record (dict) with 'name' and 'key'; 'work_count' is optional
	    returns: float sort score
	    """
	    name_penalty = len(rec['name']) * 10    # Shorter names are better
	    olid_penalty = key_int(rec) / 10000.0   # Lower OLID numbers are slightly better
	    # A record without 'work_count' (or with None) is scored as having
	    # no works instead of raising KeyError / TypeError.
	    work_bonus = (rec.get('work_count') or 0) * 5   # More works is better
	    return name_penalty + olid_penalty - work_bonus

	# taken from catalog/importer/load.py
	def pick_from_matches(author, match):
	    """Choose one record from several matching author records.

	    Candidates carrying the same kind of date fields as `author` are
	    preferred: both 'birth_date' and 'death_date' when `author` has both,
	    otherwise 'date' when `author` has one. If that leaves nothing, every
	    record in `match` is considered.

	    author: the author being matched (dict)
	    match: list of candidate author records
	    returns: the sole candidate, or the one with the lowest match_sort score
	    """
	    maybe = []
	    if 'birth_date' in author and 'death_date' in author:
	        maybe = [m for m in match if 'birth_date' in m and 'death_date' in m]
	    elif 'date' in author:
	        maybe = [m for m in match if 'date' in m]
	    if not maybe:
	        maybe = match
	    if len(maybe) == 1:
	        return maybe[0]
	    #return min(maybe, key=key_int)
	    return min(maybe, key=match_sort)

	# name: string
	# returns: list of authors [dict]
	def find_author(name, api='autocomplete'):
	    """Search Open Library for author records matching `name`.

	    name: string
	    api: "autocomplete" searches through ol.Author.search() (ol-client uses
	         /authors/_autocomplete?q=%s&limit=%s); any other value is treated
	         as "query" and goes to /query.json
	    returns: list of authors [dict]
	    """
	    if api == 'autocomplete':
	        return ol.Author.search(name, limit=100)
	    # api == 'query'
	    q = {"type": "/type/author", "name": name, "limit": 100}
	    # Hand the query to the session as a parameter so it does the URL
	    # encoding; this avoids urllib.urlencode, which only exists on Python 2.
	    # NOTE(review): assumes ol.session is a requests-style session -- confirm.
	    response = ol.session.get(ol.base_url + '/query.json',
	                              params={'query': json.dumps(q)})
	    return response.json()


	# author: author object (dict)
	# api: which API to query, (string) "autocomplete" | "query"
	# full_load: whether to fully load each item in the results list (currently makes no difference to match quality)
	# returns: a single author object, determined to be the 'best' match, if one is found, None otherwise
	# previously named find_entity()
	def import_author_match(author, api='autocomplete', full_load=False):
	    """Find the best existing author record for `author`, if any.

	    author: author object (dict); needs 'name', may carry 'entity_type',
	            'birth_date', 'death_date' and 'date'
	    api: which API to query, (string) "autocomplete" | "query"
	    full_load: whether to fully load each item in the results list
	               (currently makes no difference to match quality)
	    returns: a single author object, determined to be the 'best' match,
	             if one is found, None otherwise
	    """
	    name = author['name']
	    things = find_author(name, api=api)
	    # things is a [dict] atm

	    et = author.get('entity_type')  # NOTE(review): this path is untested
	    if et and et != 'person':
	        # Non-person entities: take the first search hit as-is.
	        if not things:
	            return None
	        db_entity = things[0]
	        assert db_entity['type']['key'] == '/type/author'
	        return db_entity
	    if ', ' in name:  # NOTE(review): this path is untested
	        # Also search for the "First Last" form, through the same API as
	        # the primary lookup (previously this always used the default API).
	        things += find_author(flip_name(name), api=api)
	    match = []
	    seen = set()
	    if full_load:  # load full data for each thing
	        things = [ol.Author.get(t['key'].replace('/authors/', '')).json() for t in things]
	    for a in things:
	        try:
	            key = a['key']  # for dicts
	        except TypeError:
	            key = a.olid  # for ol.Author objects
	        if key in seen:
	            continue
	        seen.add(key)
	        if api == 'autocomplete' and not full_load:
	            # query api results do not give type as it is already filtered on type=author
	            # for that matter, so are autocomplete results
	            # this check only makes sense if we have loaded items to check whether they are currently redirects or deletes
	            assert a['type'] == 'author'
	        # A dated author is never matched to a record without a birth date.
	        if 'birth_date' in author and 'birth_date' not in a:
	            continue
	        # The reverse check is disabled to allow test_name_only_autocomplete() to pass
	        #if 'birth_date' not in author and 'birth_date' in a:
	        #    continue
	        if not author_dates_match(author, a):
	            continue
	        match.append(a)
	    if not match:
	        return None
	    if len(match) == 1:
	        return match[0]
	    try:
	        return pick_from_matches(author, match)
	    except ValueError:
	        # Dump the inputs for debugging, then let the error propagate.
	        # Single-argument print() behaves the same on Python 2 and 3.
	        print('author: %s' % (author,))
	        print('match: %s' % (match,))
	        raise
	# -*- coding: utf-8 -*-
	import importauthormatch as iam

	def test_find_author_autocomplete():
	    """The autocomplete API is expected to return 14 records for 'Mark Twain'."""
	    r = iam.find_author('Mark Twain', api='autocomplete')
	    assert len(r) == 14
	    print(r[0]['key'])
	    # autocomplete has more results, and more data

	def test_find_author_query():
	    """The query API is expected to return 3 records for 'Mark Twain'."""
	    r = iam.find_author('Mark Twain', api='query')
	    assert len(r) == 3
	    print(r[0]['key'])
	    # query has less results, and only returns fields in the query
	    # both return a list of dicts

	def test_basic():
	    """Full birth and death dates resolve to the expected Mark Twain record."""
	    author = {'name': 'Mark Twain', 'birth_date': '30 November 1835', 'death_date': '21 April 1910'}
	    result = iam.import_author_match(author, api='autocomplete')
	    print(result)
	    assert result['key'] == '/authors/OL18319A'

	def test_basic_simpledates():
	    """Year-only birth and death dates resolve to the same record."""
	    author = {'name': 'Mark Twain', 'birth_date': '1835', 'death_date': '1910'}
	    result = iam.import_author_match(author, api='autocomplete')
	    assert result['key'] == '/authors/OL18319A'

	def test_basic_onedate():
	    """A birth year alone is enough to resolve to the same record."""
	    author = {'name': 'Mark Twain', 'birth_date': '1835'}
	    result = iam.import_author_match(author, api='autocomplete')
	    assert result['key'] == '/authors/OL18319A'

	def test_name_only_autocomplete():
	    """A name with no dates still resolves to the expected record."""
	    author = {'name': 'Mark Twain'}
	    result = iam.import_author_match(author, api='autocomplete', full_load=False)
	    print(result)
	    print("Got: %s" % result['name'])
	    assert result['key'] == '/authors/OL18319A'
	    # The previous ImportBot code deliberately refused to match a target without
	    # dates to a record that has them, which seems strange.
	    # It has been changed here so that the expected match is found.

	# Query API is not good at matching by name only, deprecate it.
	def xtest_name_only_query():
	    """Name-only match through the query API (the 'x' prefix keeps pytest from collecting it)."""
	    author = {'name': 'Mark Twain'}
	    result = iam.import_author_match(author, api='query')
	    print(result)
	    print("Got: %s" % result)
	    assert result['key'] == '/authors/OL18319A'

	def test_not_found():
	    """A nonsense name yields None from both APIs."""
	    target = {'name': 'QwertysNotAname'}
	    a = iam.import_author_match(target, api='autocomplete')
	    b = iam.import_author_match(target, api='query')
	    assert a is None
	    assert b is None

	def test_range_of_authors():
	    """Print the autocomplete match for a range of names (smoke test, no assertions)."""
	    names = ['Plato', 'Sappho', 'Jean Paul Sartre', 'Michel Foucault', 'Enid Blyton', 'J. K. Rowling', 'Joanne K. Rowling']
	    for n in names:
	        target = {'name': n}
	        a = iam.import_author_match(target, api='autocomplete')
	        #b = iam.import_author_match(target, api='query')
	        print("%s:" % n)
	        print(" Autocomplete: %s" % a)
	        #print(" Query: %s" % b)

	# Autocomplete tends to match longer names with lower ids first...
	def test_long_name_matching():
	    """A short name and a longer name sharing its prefix each resolve to their own record."""
	    target = {'name': 'Plato'}
	    a = iam.import_author_match(target, api='autocomplete')
	    assert a['key'] == '/authors/OL189658A' # Classical author Plato

	    longer = {'name': 'Platonov'}
	    b = iam.import_author_match(longer, api='autocomplete')
	    assert b['key'] == '/authors/OL30647A' # Russian author, Andreĭ Platonovich Platonov, 121 Works