Instantly share code, notes, and snippets.
Last active
June 6, 2018 21:06
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
Save hornc/7f6b2e42e3576bc242d9ad9d8d2039a3 to your computer and use it in GitHub Desktop.
OL Author Matching
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from olclient.openlibrary import OpenLibrary | |
import re | |
import web | |
import urllib | |
import json | |
# Open Library client instance; find_author() and import_author_match()
# issue all of their requests through it.
ol = OpenLibrary()
# Testing OL code to determine existing author matches,
# taking old code from
# https://github.com/internetarchive/openlibrary/blob/f8092840a77c7479a352fd83fd068d340f97d3e3/openlibrary/catalog/add_book/load_book.py
# and
# https://github.com/internetarchive/openlibrary/blob/c4d877ee6410df6f70ab45718baebe52fdf366ba/openlibrary/catalog/importer/load.py
def do_flip(author):
    """Flip a "Last, First" author name in place.

    The record is left untouched unless ALL of the following hold:
      * it has a 'personal_name' that equals its 'name';
      * the name contains ", ";
      * no further comma follows (e.g. "Harper, John Murdoch, 1845-");
      * the name contains neither "i.e." nor "i. e.".

    When the record qualifies, both 'name' and 'personal_name' are
    overwritten with flip_name(name). Always returns None.
    """
    # NOTE(review): flip_name is neither defined nor imported in this file;
    # presumably it comes from openlibrary.catalog.utils -- confirm.
    if 'personal_name' not in author:
        return
    original = author['name']
    if author['personal_name'] != original:
        return
    comma_at = original.find(', ')
    if comma_at == -1:
        return
    # A second comma usually means trailing dates or titles: do not flip.
    has_second_comma = ',' in original[comma_at + 1:]
    if has_second_comma or 'i.e.' in original or 'i. e.' in original:
        return
    flipped = flip_name(original)
    author['name'] = flipped
    author['personal_name'] = flipped
# Matches a standalone four-digit year, e.g. the "1835" in "30 November 1835".
re_year = re.compile(r'\b(\d{4})\b')


# taken from: https://github.com/internetarchive/openlibrary/blob/c4d877ee6410df6f70ab45718baebe52fdf366ba/openlibrary/catalog/utils/__init__.py
def author_dates_match(a, b):
    """Check whether the dates of two author records are compatible.

    'birth_date', 'death_date' and 'date' are compared in turn. A field is
    ignored when it is missing or None on either side. Two values agree when
    they are equal, when one is a prefix of the other, or when the first
    four-digit year found in each is the same.

    returns: False as soon as one field disagrees, True otherwise.
    """
    for field in ('birth_date', 'death_date', 'date'):
        if field not in a or field not in b:
            continue
        ours, theirs = a[field], b[field]
        if ours is None or theirs is None:
            continue
        if ours == theirs or ours.startswith(theirs) or theirs.startswith(ours):
            continue
        # Fall back to comparing only the year embedded in each value.
        year_ours = re_year.search(ours)
        if year_ours is None:
            return False
        year_theirs = re_year.search(theirs)
        if year_theirs is None or year_ours.group(1) != year_theirs.group(1):
            return False
    return True
# taken from OL catalog/utils
# rec: {'key': '/author/OL1234A'} | |
# returns: 1234 | |
def key_int(rec):
    """Return the numeric part of a record's key as an int.

    rec: dict with a 'key' such as '/a/OL1234A'
    returns: the number extracted from the key, e.g. 1234
    """
    digits = web.numify(rec['key'])
    return int(digits)
# This is my (new) attempt to sort matched records, uses ad-hoc weightings which will need to be tuned experimentally | |
def match_sort(rec):
    """Score a candidate author record for ranking; the LOWEST score wins.

    The weightings are ad hoc and will need to be tuned experimentally.

    rec: author record (dict) with 'name' and 'key', optionally 'work_count'
    returns: float sort score
    """
    name_penalty = len(rec['name']) * 10  # Shorter names are better
    olid_penalty = key_int(rec) / 10000.0  # Lower OLID numbers are slightly better
    # Not every result carries a work count, so a missing (or None) count is
    # scored as zero works instead of raising KeyError.
    work_bonus = (rec.get('work_count') or 0) * 5  # More works is better
    return name_penalty + olid_penalty - work_bonus
# taken from catalog/importer/load.py | |
def pick_from_matches(author, match):
    """Choose one record from several matching author records.

    Candidates are first narrowed to those carrying the same kind of date
    fields as `author`: both 'birth_date' and 'death_date' when the author
    has both, otherwise 'date' when the author has one. If narrowing leaves
    nothing, every match stays in play. A lone candidate is returned as is;
    otherwise the one with the lowest match_sort() score is returned.

    author: the author being imported (dict)
    match: list of candidate records
    returns: the chosen record
    """
    if 'birth_date' in author and 'death_date' in author:
        shortlist = [m for m in match if 'birth_date' in m and 'death_date' in m]
    elif 'date' in author:
        shortlist = [m for m in match if 'date' in m]
    else:
        shortlist = []
    candidates = shortlist or match
    if len(candidates) == 1:
        return candidates[0]
    # The older code picked the lowest OLID here (key=key_int).
    return min(candidates, key=match_sort)
# name: string | |
# returns: list of authors [dict] | |
def find_author(name, api='autocomplete'):
    """Look up authors by name through the shared `ol` client.

    name: string
    api: 'autocomplete' searches via ol.Author.search(); any other value
         (intended: 'query') requests /query.json
    returns: list of authors [dict]
    """
    if api == 'autocomplete':
        # ol-client uses the autocomplete API: /authors/_autocomplete?q=%s&limit=%s
        return ol.Author.search(name, limit=100)

    # api == 'query'
    # urllib.urlencode only exists on Python 2; import from whichever
    # location this interpreter provides so the query path works on both.
    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode  # Python 2
    q = {"type": "/type/author", "name": name, "limit": 100}
    url = ol.base_url + '/query.json?' + urlencode(dict(query=json.dumps(q)))
    response = ol.session.get(url)
    return response.json()
# author: author object (dict) | |
# api: which API to query, (string) "autocomplete" | "query" | |
# full_load: whether to fully load each item in the results list (currently makes no difference to match quality) | |
# returns: a single author object, determined to be the 'best' match, if one is found, None otherwise | |
# previously named find_entity() | |
def import_author_match(author, api='autocomplete', full_load=False):
    """Find the best existing match for an author that is being imported.

    author: author object (dict); must have 'name', may have 'entity_type',
            'birth_date', 'death_date' and 'date'
    api: which API to query, (string) "autocomplete" | "query"
    full_load: whether to fully load each item in the results list
    returns: the single record judged the best match, or None if none is found
    raises: ValueError re-raised from pick_from_matches(), after printing the
            author and the candidate list
    """
    name = author['name']
    things = find_author(name, api=api)
    # things is a [dict] atm
    et = author.get('entity_type')  # unsure how well this functions, need to test
    if et and et != 'person':
        # Non-person entities simply take the first search hit.
        if not things:
            return None
        db_entity = things[0]
        assert db_entity['type']['key'] == '/type/author'
        return db_entity
    if ', ' in name:  # unsure how well this functions, need to test
        # Also search for the flipped form, using the SAME api as the first
        # lookup so both result sets have the same shape.
        # NOTE(review): flip_name is neither defined nor imported in this
        # file; presumably it comes from openlibrary.catalog.utils -- confirm.
        things += find_author(flip_name(name), api=api)
    if full_load:  # load full data for each thing
        things = [ol.Author.get(t['key'].replace('/authors/', '')).json() for t in things]
    match = []
    seen = set()
    for a in things:
        try:
            key = a['key']  # for dicts
        except TypeError:
            key = a.olid  # for ol.Author objects
        if key in seen:
            continue
        seen.add(key)
        if api == 'autocomplete' and not full_load:
            # query api results do not give type as it is already filtered on
            # type=author; for that matter, so are autocomplete results.
            # This check only makes sense if we have loaded items to check
            # whether they are currently redirects or deletes.
            assert a['type'] == 'author'
        if 'birth_date' in author and 'birth_date' not in a:
            continue
        # The reverse filter (skip candidates that have a birth_date when the
        # target has none) is deliberately disabled so that
        # test_name_only_autocomplete() passes.
        if not author_dates_match(author, a):
            continue
        match.append(a)
    if not match:
        return None
    if len(match) == 1:
        return match[0]
    try:
        return pick_from_matches(author, match)
    except ValueError:
        # Single-argument print() emits the same text on Python 2 and 3.
        print('author: %s' % (author,))
        print('match: %s' % (match,))
        raise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import importauthormatch as iam | |
def test_find_author_autocomplete():
    """Expect 14 autocomplete results for 'Mark Twain'; print the first key."""
    results = iam.find_author('Mark Twain', api='autocomplete')
    assert len(results) == 14
    first = results[0]
    print(first['key'])
# autocomplete has more results, and more data | |
def test_find_author_query():
    """Expect 3 query-API results for 'Mark Twain'; print the first key."""
    results = iam.find_author('Mark Twain', api='query')
    assert len(results) == 3
    first = results[0]
    print(first['key'])
# query has less results, and only returns fields in the query | |
# both return a list of dicts | |
def test_basic():
    """Full birth and death dates should resolve to OL18319A."""
    target = {
        'name': 'Mark Twain',
        'birth_date': '30 November 1835',
        'death_date': '21 April 1910',
    }
    matched = iam.import_author_match(target, api='autocomplete')
    print(matched)
    assert matched['key'] == '/authors/OL18319A'
def test_basic_simpledates():
    """Year-only birth and death dates should resolve to OL18319A."""
    target = {
        'name': 'Mark Twain',
        'birth_date': '1835',
        'death_date': '1910',
    }
    matched = iam.import_author_match(target, api='autocomplete')
    assert matched['key'] == '/authors/OL18319A'
def test_basic_onedate():
    """A birth year alone should still resolve to OL18319A."""
    target = {'name': 'Mark Twain', 'birth_date': '1835'}
    matched = iam.import_author_match(target, api='autocomplete')
    assert matched['key'] == '/authors/OL18319A'
def test_name_only_autocomplete():
    """A name with no dates should resolve to OL18319A via autocomplete."""
    target = {'name': 'Mark Twain'}
    matched = iam.import_author_match(target, api='autocomplete', full_load=False)
    print(matched)
    print("Got: %s" % matched['name'])
    assert matched['key'] == '/authors/OL18319A'
# The previous ImportBot code deliberately fails to match a target without dates to a record that does have dates, which seems strange.
# I have changed it here so that it does find the expected match.
# The query API is not good at matching by name only; deprecate it.
def xtest_name_only_query():
    """Name-only match through the query API.

    The 'x' prefix keeps pytest from collecting this test.
    """
    target = {'name': 'Mark Twain'}
    matched = iam.import_author_match(target, api='query')
    print(matched)
    print("Got: %s" % matched)
    assert matched['key'] == '/authors/OL18319A'
def test_not_found():
    """A nonsense name should match nothing through either API."""
    target = {'name': 'QwertysNotAname'}
    via_autocomplete = iam.import_author_match(target, api='autocomplete')
    via_query = iam.import_author_match(target, api='query')
    assert via_autocomplete is None
    assert via_query is None
def test_range_of_authors():
    """Print the autocomplete match for a spread of author names.

    Makes no assertions; the output is meant for eyeballing.
    """
    names = [
        'Plato',
        'Sappho',
        'Jean Paul Sartre',
        'Michel Foucault',
        'Enid Blyton',
        'J. K. Rowling',
        'Joanne K. Rowling',
    ]
    for author_name in names:
        target = {'name': author_name}
        via_autocomplete = iam.import_author_match(target, api='autocomplete')
        # The query-API comparison is disabled.
        print("%s:" % author_name)
        print(" Autocomplete: %s" % via_autocomplete)
# Autocomplete tends to match longer names with lower ids first... | |
def test_long_name_matching():
    """A short name and a longer name starting with it should match different authors."""
    short_target = {'name': 'Plato'}
    short_match = iam.import_author_match(short_target, api='autocomplete')
    assert short_match['key'] == '/authors/OL189658A'  # Classical author Plato
    long_target = {'name': 'Platonov'}
    long_match = iam.import_author_match(long_target, api='autocomplete')
    # Russian author, Andreĭ Platonovich Platonov, 121 Works
    assert long_match['key'] == '/authors/OL30647A'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It seems to me that a distance or similarity metric with some range would provide more power than a binary match/no-match. This would allow you to give bonus points for matching both birth & death dates as well as extra credit for exact date match vs year match.