Skip to content

Instantly share code, notes, and snippets.

@hornc
Last active November 24, 2017 08:37
Show Gist options
  • Save hornc/32a1924a96e4183308a01ea20c72b536 to your computer and use it in GitHub Desktop.
Save hornc/32a1924a96e4183308a01ea20c72b536 to your computer and use it in GitHub Desktop.
from catharbot import catharbot
# Shared CatharBot instance; full_merge() below uses it to load documents
# and to build the edition/work merge changesets.
bot = catharbot.CatharBot()
# Work In Progress
# uses the 'merging' branch of Catharbot, https://github.com/hornc/catharbot/tree/merging
def extract_olid(olid):
    """Convert a string like '/authors/OL1412764A' to just 'OL1412764A'.

    olid: a key path (str) or a bare OLID (str)

    Returns the last '/'-separated segment. A bare OLID is returned
    unchanged, and trailing slashes are ignored so that '/books/OL1M/'
    gives 'OL1M' rather than an empty string.
    """
    # Strip trailing slashes first, otherwise the last segment would be ''.
    return olid.rstrip('/').rsplit('/', 1)[-1]
def remove_editions(duplicates, docs):
    """Remove editions by [OLID str] from a merge changeset docs ([JSON dicts]).

    duplicates: OLIDs of the editions to drop ([str])
    docs: changeset documents ([dict]), each with a 'key' such as '/books/OL1M'

    Returns a new list holding the docs whose OLID is not in duplicates;
    neither argument is modified.

    reason: to prevent duplicated edition documents that merge_works() reassigns
            and merge_editions() makes into redirects.
    TODO: refactor to make this process clearer / cleaner
    """
    # Build the lookup once: constant-time membership per doc, and a one-shot
    # iterable of OLIDs is not exhausted after the first doc.
    duplicate_olids = set(duplicates)
    return [d for d in docs if extract_olid(d['key']) not in duplicate_olids]
def test_remove_editions():
    """Check that remove_editions() drops exactly the docs whose OLID is listed."""
    dupes = ['A', 'C']
    docs = [{'key': '/books/A'}, {'key': '/books/B'}, {'key': '/books/C'}]
    # `assert` is a statement, not a function: the call-style form turns into
    # an always-true tuple check as soon as a message argument is added.
    assert remove_editions(dupes, docs) == [{'key': '/books/B'}]
    # A new list is returned; the input docs are left untouched.
    assert docs == [{'key': '/books/A'}, {'key': '/books/B'}, {'key': '/books/C'}]
    # With nothing listed for removal every doc is kept.
    assert remove_editions([], docs) == docs
# TODO: change merge_works() and merge_editions() to use **kwargs
# e.g. merge_works(master="OL1234W", duplicates=["OL345W", "OL678W"])
# example changeset: https://openlibrary.org/recentchanges/2017/09/22/bulk_update/53034676
# example call: full_merge(master='OL24869802M', duplicates=['OL25426847M', 'OL24928157M'])
def full_merge(**kwargs):
    """Merge identical editions and their works.

    kwargs:
      master: Master edition OLID (str - required)
      and one of
        duplicate: Duplicate edition OLID (str)
        duplicates: list of edition OLIDs ([str])
      simple: (bool) Do not merge data, just perform redirects, defaults to False
        Simple merge is faster and can be used when merging 'bad' data into a good record.

    Returns the changeset: a list of the documents produced by the merge
    steps below. Nothing is saved by this function.

    Raises KeyError if 'master' is not supplied.
    """
    # TODO: this works, but refactor for clarity!
    master = kwargs['master']
    simple = kwargs.get('simple', False)
    # Copy the caller's list: appending `duplicate` below must not modify
    # the list object that was passed in as `duplicates`.
    duplicates = list(kwargs.get('duplicates', []))
    if 'duplicate' in kwargs:
        duplicates.append(kwargs['duplicate'])
    # Single parenthesised argument: prints the same on Python 2 and 3.
    print("Merge %s into %s" % (duplicates, master))

    master_edition = bot.load_doc(master)
    dupe_editions = [bot.load_doc(e) for e in duplicates]
    changeset = []
    if not simple:
        merged_edition = bot.merge_into_work(master_edition, dupe_editions)
        changeset.append(merged_edition)

    # are there extra works to merge?
    master_w_olid = extract_olid(master_edition['works'][0]['key'])
    dupe_w_olids = []
    for edition in dupe_editions:
        w_olid = extract_olid(edition['works'][0]['key'])
        # Skip the master's own work, and list a work only once even when
        # several duplicate editions belong to it.
        if w_olid != master_w_olid and w_olid not in dupe_w_olids:
            dupe_w_olids.append(w_olid)

    # NOTE(review): master_w_olid is filtered out of dupe_w_olids above, so
    # the second clause is always true and this branch always runs, even when
    # dupe_w_olids is empty. Presumably `if dupe_w_olids:` was intended;
    # confirm how merge_works() treats an empty list before changing it.
    if len(dupe_w_olids) > 1 or master_w_olid not in dupe_w_olids:
        master_work = bot.load_doc(master_w_olid)
        dupe_works = [bot.load_doc(w) for w in dupe_w_olids]
        if not simple:
            merged_work = bot.merge_into_work(master_work, dupe_works)
            changeset.append(merged_work)
        changeset += bot.merge_works(dupe_w_olids, master_w_olid)

    # remove reassigned duplicate editions from changeset that will be made into redirects
    changeset = remove_editions(duplicates, changeset)
    changeset += bot.merge_editions(duplicates, master)
    return changeset
# Example usage: move_editions(['OL559079M'], 'OL2420021W')
from olclient.openlibrary import OpenLibrary
# Shared Open Library client; move_editions() uses it to fetch editions and
# to save the updated records.
ol = OpenLibrary()
def move_editions(edition_list, master_work):
    """Reassign every edition in edition_list to the work master_work and save.

    edition_list: edition OLIDs ([str]), e.g. ['OL559079M']
    master_work: OLID of the destination work (str), e.g. 'OL2420021W'

    Each edition is fetched with ol.Edition.get(), its work_olid is set to
    master_work, and all of them are written in a single ol.save_many() call
    with the comment "move to work <master_work>". Returns None.
    """
    changeset = []
    for olid in edition_list:
        edition = ol.Edition.get(olid)
        edition.work_olid = master_work
        changeset.append(edition)
    # Nothing to move: do not issue a save request with an empty changeset.
    if not changeset:
        return
    ol.save_many(changeset, "move to work %s" % master_work)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment