Skip to content

Instantly share code, notes, and snippets.

@hornc
Last active September 21, 2017 12:21
Show Gist options
  • Save hornc/ee7914be72534d9d39025f0701b495e8 to your computer and use it in GitHub Desktop.
Save hornc/ee7914be72534d9d39025f0701b495e8 to your computer and use it in GitHub Desktop.
Example semi-automated bot merge
# Steps to merge a number of Kite Runner editions and works:
from catharbot import catharbot
# Shared bot instance: every load, save and merge call in this script goes through it.
bot = catharbot.CatharBot()
def merge_unique_lists(lists, hash_fn=None):
    """
    Combine several lists into one new list of unique elements.

    Ordering is preserved: elements are visited list by list, the first
    occurrence of each element is kept and later duplicates are dropped.

    :param lists: iterable of lists to merge. A ``None`` entry is treated
        as an empty list rather than raising ``TypeError``.
    :param hash_fn: optional callable that maps an element to the hashable
        key used for the uniqueness test. When omitted, the element itself
        is the key and therefore must be hashable.
    :return: a new list; none of the input lists is modified.
    """
    result = []
    seen = set()
    for lst in lists:
        if lst is None:
            # Nothing to contribute; skip instead of failing on iteration.
            continue
        for el in lst:
            hsh = hash_fn(el) if hash_fn else el
            if hsh not in seen:
                result.append(el)
                seen.add(hsh)
    return result
# Editions that get attached to the master work.
orphan_olids = ['OL24057181M', 'OL24237802M']

# Picked as master because it has the most editions, is on the most lists,
# and is on the Staff Picks list.
master_olid = 'OL5782001W'

# Duplicate works that are merged into the master at the end.
dupe_olids = [
    'OL5781992W',
    'OL17174243W',
    'OL16068777W',
    'OL17348501W',
    'OL16049475W',
]

# Step 1: move the orphan editions onto the master work and save them together.
orphans = [bot.get_move_edition(olid, master_olid) for olid in orphan_olids]
bot.save_many(orphans, "Associate with work")

# Step 2: tidy up the master record itself.
master = bot.load_doc(master_olid)
# The second orphan (OL24237802M) has a good description; reuse it on the work.
edition_with_desc = bot.load_doc(orphan_olids[1])
master['description'] = edition_with_desc['description']
# Use the title-cased form for the master work.
master['title'] = "The Kite Runner"

# Step 3: pull the list-valued properties of every duplicate into the master.
# Existing master values keep their position; only unseen values are appended.
for work_olid in dupe_olids:
    dupe = bot.load_doc(work_olid)
    for prop in ('subjects', 'covers', 'subject_places'):
        master[prop] = merge_unique_lists([
            master.get(prop, []),
            dupe.get(prop, []),
        ])

# Step 4: save the enriched master first, then merge the duplicates into it.
bot.save_one(master, "update master before merge")
bot.merge_works(dupe_olids, master_olid)
# These are the commands I used to merge
# https://openlibrary.org/works/OL7920112W?v=5 (dupe)
# and
# https://openlibrary.org/works/OL483502W/House_of_Sand_and_Fog?v=7 (master, because it was on the most lists. Lists don't like redirects)
# Uses my bot code, https://github.com/hornc/catharbot
# which requires the openlibrary-client, https://github.com/internetarchive/openlibrary-client/
#
from catharbot import catharbot
# Shared bot instance: every load, save and merge call in this script goes through it.
bot = catharbot.CatharBot()
def merge_subjects(a, b):
    """
    Return the de-duplicated subjects of two docs as a new list.

    Subjects of ``a`` come first in their existing order, followed by the
    subjects of ``b`` that ``a`` does not already have. A doc with no
    'subjects' key (or an empty/None value) contributes nothing instead of
    raising ``KeyError``.

    :param a: doc whose subjects take precedence in the ordering.
    :param b: doc whose extra subjects are appended.
    :return: list of unique subjects; neither input is modified.
    """
    merged = []
    seen = set()
    for doc in (a, b):
        # `or []` covers both a missing key and a stored None.
        for subject in doc.get('subjects') or []:
            if subject not in seen:
                seen.add(subject)
                merged.append(subject)
    return merged
def merge_covers(a, b):
    """
    Return the de-duplicated covers of two docs as a new list.

    Covers of ``a`` come first in their existing order, followed by the
    covers of ``b`` that ``a`` does not already have, so the merge never
    reshuffles the cover list of ``a``. A doc with no 'covers' key (or an
    empty/None value) contributes nothing instead of raising ``KeyError``.

    :param a: doc whose covers take precedence in the ordering.
    :param b: doc whose extra covers are appended.
    :return: list of unique covers; neither input is modified.
    """
    merged = []
    seen = set()
    for doc in (a, b):
        # `or []` covers both a missing key and a stored None.
        for cover in doc.get('covers') or []:
            if cover not in seen:
                seen.add(cover)
                merged.append(cover)
    return merged
# Work that survives the merge, and the duplicate that is merged into it.
master_olid, dupe_olid = "OL483502W", "OL7920112W"

# Load both records, master first.
master, dupe = bot.load_doc(master_olid), bot.load_doc(dupe_olid)

# Take the title and description from the dupe: its title happened to be in
# proper title case, and it had a description while the master had none.
master['title'], master['description'] = dupe['title'], dupe['description']

# Union the list-valued fields. For subjects this was a no-op (the dupe's only
# subject was already on the master); for covers the dupe added one new cover.
master['subjects'], master['covers'] = merge_subjects(master, dupe), merge_covers(master, dupe)

# Save the enriched master first, then merge. The merge moves the dupe's
# editions to the master and turns the dupe work into a redirect.
bot.save_one(master, "update master before merge")
bot.merge_works([dupe_olid], master_olid)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment