Last active
July 3, 2019 16:00
-
-
Save adjam/052cb2e4791e7caf2ba78c0dc7994d62 to your computer and use it in GitHub Desktop.
Example for merging item records spread out over multiple MARCXML into a single record.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Merges items spread over multiple MARCXML records
# into a single record. Some ILSes will not export MARC21 records
# that are "too large", and instead output the same bibliographic record
# multiple times.
# Usage: first, convert your MARC21 to MARCXML (see the marcrenaissance.sh gist).
# Ensure that all records with the same control number come out in a 'clump',
# e.g. by sorting on control number.
# Reads from STDIN and writes to STDOUT, so, e.g.
# ./item_joiner.py < multirecord.xml > joined.xml
# reads from `multirecord.xml` and writes to `joined.xml`.
# Check the id_expr and item_expr to make sure they're appropriate to your
# records.
from lxml import etree | |
import sys | |
# Namespace URI of the MARCXML ("slim") schema; every element handled by this
# script lives in it.
nsuri = "http://www.loc.gov/MARC21/slim"

# Prefix map for XPath expressions that spell the namespace as "marc:".
mns = dict(marc=nsuri)

# Prefix-less map; main() passes it as the nsmap of the output collection
# element.
nonens = {None: nsuri}

# NCSU keeps its local control number in the 918$a; adjust as appropriate.
id_expr = "marc:datafield[@tag='918']/marc:subfield[@code='a']"

# Datafield holding item data. As above, you may need to adjust this,
# e.g. if you use the 949.
item_expr = "marc:datafield[@tag='999']"
def serialize(rec):
    """Return *rec* serialized by ``etree.tostring`` with UTF-8 encoding."""
    xml_bytes = etree.tostring(rec, encoding="utf-8")
    return xml_bytes
def extract_id(rec):
    """Return the control number of *rec*, or None when it has none.

    The identifier is the text of the first node matched by ``id_expr``.
    Returning None (instead of raising IndexError on an empty match list)
    lets the caller report a missing identifier explicitly.
    """
    matches = rec.xpath(id_expr, namespaces=mns)
    if not matches:
        return None
    return matches[0].text
def merge_items(to_rec, from_rec):
    """Move every item field of *from_rec* onto the end of *to_rec*.

    Item fields are the elements matched by ``item_expr``. Appending an
    element to *to_rec* re-parents it, so the items leave *from_rec*.
    A *from_rec* without any item fields is a no-op.
    """
    # item_expr is written with the "marc:" prefix, which only `mns` binds;
    # `nonens` maps the namespace to no prefix at all and cannot resolve it.
    items = from_rec.xpath(item_expr, namespaces=mns)
    for item in items:
        to_rec.append(item)
def main(infile, outfile):
    """Merge runs of same-ID records from *infile* into single records.

    Reads MARCXML records from *infile*, folds the item fields of each
    consecutive record sharing a control number into the first record of that
    run, and writes the results to *outfile* wrapped in one ``collection``
    element. Records with the same control number must be adjacent in the
    input.

    Args:
        infile: file-like object to read MARCXML from.
        outfile: file-like object to write the merged MARCXML to.

    Raises:
        ValueError: if a record has no control number.
    """
    # lxml reads and writes bytes. Text-mode streams such as Python 3's
    # sys.stdin / sys.stdout expose the underlying binary stream as `.buffer`;
    # anything without that attribute is used as given.
    source = getattr(infile, "buffer", infile)
    sink = getattr(outfile, "buffer", outfile)

    record_tag = "{%s}record" % nsuri
    current_id = None
    current_rec = None
    with etree.xmlfile(sink, encoding="utf-8", buffered=True) as xf:
        # All writes go through `xf`; the element context manager only opens
        # and closes the wrapping <collection> tag.
        with xf.element("{%s}collection" % nsuri, nsmap=nonens):
            for _evt, rec in etree.iterparse(
                    source, events=("end",), tag=record_tag):
                this_id = extract_id(rec)
                if this_id is None:
                    raise ValueError("missing catkey on record")
                if this_id == current_id:
                    # Same bibliographic record again: keep only its items.
                    merge_items(current_rec, rec)
                    rec.clear()  # the rest of the duplicate is not needed
                    continue
                # A new control number starts: flush the finished record.
                if current_rec is not None:
                    xf.write(current_rec)
                    current_rec.clear()  # release memory once written
                current_rec = rec
                current_id = this_id
            # Flush the last pending record (none exists for empty input).
            if current_rec is not None:
                xf.write(current_rec)
# Script entry point: read MARCXML from standard input and write the merged
# collection to standard output.
if __name__ == "__main__":
    main(sys.stdin, sys.stdout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment