Skip to content

Instantly share code, notes, and snippets.

@hornc
Last active June 6, 2018 21:06
Show Gist options
  • Save hornc/7f6b2e42e3576bc242d9ad9d8d2039a3 to your computer and use it in GitHub Desktop.
Save hornc/7f6b2e42e3576bc242d9ad9d8d2039a3 to your computer and use it in GitHub Desktop.
OL Author Matching
from olclient.openlibrary import OpenLibrary
import re
import web
import urllib
import json
ol = OpenLibrary()
# testing OL code to determine existing author matches
# taking old code from
# https://github.com/internetarchive/openlibrary/blob/f8092840a77c7479a352fd83fd068d340f97d3e3/openlibrary/catalog/add_book/load_book.py
# and
# https://github.com/internetarchive/openlibrary/blob/c4d877ee6410df6f70ab45718baebe52fdf366ba/openlibrary/catalog/importer/load.py
def do_flip(author):
    """Flip an author's "Last, First" name in place.

    author: author object (dict); 'name' and 'personal_name' are rewritten
            only when every guard below passes.
    returns: None (the dict is mutated).

    NOTE(review): flip_name() is not defined in the visible part of this
    file -- presumably it comes from the Open Library catalog utils; confirm
    it is importable before relying on the flipping path.
    """
    # Only flip when the personal name is present and identical to the name.
    if 'personal_name' not in author or author['personal_name'] != author['name']:
        return
    original = author['name']
    head, separator, _ = original.partition(', ')
    if not separator:
        # No "Last, First" separator at all.
        return
    # A further comma after the separator means extra data,
    # e.g: Harper, John Murdoch, 1845-
    if ',' in original[len(head) + 1:]:
        return
    # Names carrying an "i.e." correction are left untouched.
    if 'i.e.' in original or 'i. e.' in original:
        return
    flipped = flip_name(original)
    author['name'] = flipped
    author['personal_name'] = flipped
# Matches a standalone four-digit year inside a free-text date string.
re_year = re.compile(r'\b(\d{4})\b')


# taken from: https://github.com/internetarchive/openlibrary/blob/c4d877ee6410df6f70ab45718baebe52fdf366ba/openlibrary/catalog/utils/__init__.py
def author_dates_match(a, b):
    """Check whether the dates of two author records are compatible.

    a, b: author objects (dict); the keys 'birth_date', 'death_date' and
          'date' are compared when both records have a non-None value.
    returns: False as soon as one shared date field disagrees, True otherwise
             (including when the records share no date fields at all).
    """
    def known(rec, field):
        return field in rec and rec[field] is not None

    for field in ('birth_date', 'death_date', 'date'):
        if not (known(a, field) and known(b, field)):
            continue
        left, right = a[field], b[field]
        # Equal strings, or one being a prefix of the other, count as a match.
        if left == right or left.startswith(right) or right.startswith(left):
            continue
        # Otherwise fall back to comparing the first four-digit year in each.
        left_year = re_year.search(left)
        right_year = re_year.search(right) if left_year else None
        if not (left_year and right_year
                and left_year.group(1) == right_year.group(1)):
            return False
    return True
# taken from OL catalog/utils
def key_int(rec):
    """Extract the number from a record key like /a/OL1234A.

    rec: {'key': '/author/OL1234A'}
    returns: 1234
    """
    digits = web.numify(rec['key'])
    return int(digits)
# This is my (new) attempt to sort matched records, uses ad-hoc weightings which will need to be tuned experimentally
def match_sort(rec):
    """Score a candidate author record for ranking; the lowest score is the better match.

    rec: author record (dict) with 'name' and 'key'; 'work_count' is optional.
    returns: float score combining three ad-hoc weighted terms.

    A record without 'work_count' (or with a None value) is scored as having
    zero works instead of raising KeyError, so records that do not carry that
    field can still be ranked.
    NOTE(review): presumably query-API and fully loaded records lack
    'work_count' -- confirm against real responses.
    """
    name_penalty = len(rec['name']) * 10        # Shorter names are better
    olid_penalty = key_int(rec) / 10000.0       # Lower OLID numbers are slightly better, ad-hoc weighting!
    works_bonus = (rec.get('work_count') or 0) * 5  # More works is better
    return name_penalty + olid_penalty - works_bonus
# taken from catalog/importer/load.py
def pick_from_matches(author, match):
    """Choose one record out of several candidate matches.

    author: author object (dict) being imported.
    match: list of candidate author records (dict).
    returns: the single preferred candidate.

    Candidates carrying the same kind of date fields as ``author`` are
    preferred; if none do, every candidate is considered. Ties are broken
    with match_sort() (this replaced an earlier min-by-key_int rule).
    """
    if 'birth_date' in author and 'death_date' in author:
        wanted = ('birth_date', 'death_date')
    elif 'date' in author:
        wanted = ('date',)
    else:
        wanted = None

    candidates = []
    if wanted is not None:
        candidates = [m for m in match if all(field in m for field in wanted)]
    # Fall back to the full list when the date-based filter left nothing.
    candidates = candidates or match

    if len(candidates) == 1:
        return candidates[0]
    return min(candidates, key=match_sort)
def find_author(name, api='autocomplete'):
    """Look up author records by name through one of two Open Library APIs.

    name: string
    api: 'autocomplete' goes through ol-client's Author.search (the
         /authors/_autocomplete?q=%s&limit=%s endpoint); any other value
         queries /query.json directly.
    returns: list of authors [dict]
    """
    if api == 'autocomplete':
        return ol.Author.search(name, limit=100)
    # api == 'query'
    q = {"type": "/type/author", "name": name, "limit": 100}
    # Let the session build the query string rather than calling
    # urllib.urlencode, which only exists on Python 2.
    # NOTE(review): assumes ol.session is a requests Session, which accepts
    # a ``params`` mapping -- confirm against the olclient version in use.
    response = ol.session.get(ol.base_url + '/query.json',
                              params={'query': json.dumps(q)})
    return response.json()
# previously named find_entity()
def import_author_match(author, api='autocomplete', full_load=False):
    """Find the existing author record that best matches ``author``.

    author: author object (dict); 'name' is required, while 'entity_type',
            'birth_date', 'death_date' and 'date' are used when present.
    api: which API to query, (string) "autocomplete" | "query"
    full_load: whether to fully load each item in the results list
               (currently makes no difference to match quality)
    returns: a single author object, determined to be the 'best' match,
             if one is found, None otherwise
    raises: ValueError re-raised from pick_from_matches(), after printing the
            author and the candidate list for debugging.
    """
    name = author['name']
    things = find_author(name, api=api)  # a list of dicts at the moment

    et = author.get('entity_type')  # unsure how well this functions, need to test
    if et and et != 'person':
        # Non-person entities take the first search hit without date checks.
        if not things:
            return None
        db_entity = things[0]
        # NOTE(review): the autocomplete branch below expects 'type' to be the
        # plain string 'author', which would not satisfy this lookup -- confirm
        # which result shape reaches this path.
        assert db_entity['type']['key'] == '/type/author'
        return db_entity

    if ', ' in name:  # unsure how well this functions, need to test
        # Also search for the "First Last" form, using the same API the
        # caller selected (this lookup previously always used autocomplete).
        # NOTE(review): flip_name() is not defined in the visible part of
        # this file -- confirm it is importable.
        things += find_author(flip_name(name), api=api)

    if full_load:  # load full data for each thing
        things = [ol.Author.get(t['key'].replace('/authors/', '')).json() for t in things]

    match = []
    seen = set()
    for a in things:
        try:
            key = a['key']  # for dicts
        except TypeError:
            key = a.olid  # for ol.Author objects
        if key in seen:
            # The plain and flipped searches can return the same record.
            continue
        seen.add(key)
        if api == 'autocomplete' and not full_load:
            # query api results do not give type as it is already filtered on type=author
            # for that matter, so are autocomplete results
            # this check only makes sense if we have loaded items to check whether they are currently redirects or deletes
            assert a['type'] == 'author'
        if 'birth_date' in author and 'birth_date' not in a:
            continue
        # The reverse rule (skip dated candidates when the target has no
        # birth date) is deliberately not applied, so that
        # test_name_only_autocomplete() can pass.
        if not author_dates_match(author, a):
            continue
        match.append(a)

    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        # Formatted so the output is the same on Python 2 and Python 3.
        print('author: %s' % (author,))
        print('match: %s' % (match,))
        raise
# -*- coding: utf-8 -*-
import importauthormatch as iam
def test_find_author_autocomplete():
    """The autocomplete API is expected to return 14 records for 'Mark Twain'."""
    found = iam.find_author('Mark Twain', api='autocomplete')
    assert len(found) == 14
    first = found[0]
    print(first['key'])
# autocomplete has more results, and more data
def test_find_author_query():
    """The query API is expected to return 3 records for 'Mark Twain'."""
    found = iam.find_author('Mark Twain', api='query')
    assert len(found) == 3
    first = found[0]
    print(first['key'])
# query has fewer results, and only returns fields in the query
# both return a list of dicts
def test_basic():
    """Full day-month-year birth and death dates should match /authors/OL18319A."""
    twain = {
        'name': 'Mark Twain',
        'birth_date': '30 November 1835',
        'death_date': '21 April 1910',
    }
    matched = iam.import_author_match(twain, api='autocomplete')
    print(matched)
    assert matched['key'] == '/authors/OL18319A'
def test_basic_simpledates():
    """Year-only birth and death dates should match /authors/OL18319A."""
    twain = {
        'name': 'Mark Twain',
        'birth_date': '1835',
        'death_date': '1910',
    }
    matched = iam.import_author_match(twain, api='autocomplete')
    assert matched['key'] == '/authors/OL18319A'
def test_basic_onedate():
    """A birth year alone should still match /authors/OL18319A."""
    twain = {'name': 'Mark Twain', 'birth_date': '1835'}
    matched = iam.import_author_match(twain, api='autocomplete')
    assert matched['key'] == '/authors/OL18319A'
def test_name_only_autocomplete():
    """A name with no dates should match /authors/OL18319A via autocomplete."""
    twain = {'name': 'Mark Twain'}
    matched = iam.import_author_match(twain, api='autocomplete', full_load=False)
    print(matched)
    print("Got: %s" % matched['name'])
    assert matched['key'] == '/authors/OL18319A'
# The previous ImportBot code deliberately fails to match a target without dates to a record that does have them, which seems strange.
# I have changed it here so that it does find the expected match
# Query API is not good at matching by name only, deprecate it.
def xtest_name_only_query():
    """Disabled (x-prefixed) name-only lookup through the query API."""
    twain = {'name': 'Mark Twain'}
    matched = iam.import_author_match(twain, api='query')
    print(matched)
    print("Got: %s" % matched)
    assert matched['key'] == '/authors/OL18319A'
def test_not_found():
    """A nonsense name should yield None from both APIs."""
    nobody = {'name': 'QwertysNotAname'}
    # Run both lookups before asserting on either result.
    via_autocomplete, via_query = [
        iam.import_author_match(nobody, api=which)
        for which in ('autocomplete', 'query')
    ]
    assert via_autocomplete is None
    assert via_query is None
def test_range_of_authors():
    """Print the autocomplete match for a handful of author names (no assertions)."""
    names = [
        'Plato',
        'Sappho',
        'Jean Paul Sartre',
        'Michel Foucault',
        'Enid Blyton',
        'J. K. Rowling',
        'Joanne K. Rowling',
    ]
    for author_name in names:
        best = iam.import_author_match({'name': author_name}, api='autocomplete')
        # The equivalent api='query' lookup and its printout are disabled.
        print("%s:" % author_name)
        print(" Autocomplete: %s" % best)
# Autocomplete tends to match longer names with lower ids first...
def test_long_name_matching():
    """Check the expected matches for a short name and a longer name sharing its prefix."""
    short_match = iam.import_author_match({'name': 'Plato'}, api='autocomplete')
    assert short_match['key'] == '/authors/OL189658A'  # Classical author Plato
    long_match = iam.import_author_match({'name': 'Platonov'}, api='autocomplete')
    assert long_match['key'] == '/authors/OL30647A'  # Russian author, Andreĭ Platonovich Platonov, 121 Works
@tfmorris
Copy link

tfmorris commented Jun 6, 2018

It seems to me that a distance or similarity metric with some range would provide more power than a binary match/no-match. This would allow you to give bonus points for matching both birth & death dates as well as extra credit for exact date match vs year match.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment