adjam · July 3, 2019 16:00
diff --git a/item_joiner.py b/item_joiner.py
 #!/usr/bin/env python

 # Merges items spread over multiple MARC XML records
 # into a single record.  Some ILSes will not export MARC21
 # that is "too large", and will output the same bibliographic record
 # multiple times.

 # usage: first, convert your marc21 to MARCXML (see marcrenaissance.sh gist)
 # ensure that all records with the same control number come out in a 'clump',
 # e.g. by sorting on control number

 # reads from STDIN and outputs to STDOUT, so, e.g.
 # ./item_joiner.py < multirecord.xml > joined.xml

 # reads from 'multirecord.xml' and outputs to `joined.xml`.

 # Check the id_expr and item_expr to make sure they're appropriate to your
 # records

 from lxml import etree
 import sys

 # Namespace URI bound to the MARCXML elements handled by this script.
 nsuri = "http://www.loc.gov/MARC21/slim"

 # Prefixed map: lets the XPath expressions below address elements as 'marc:...'.
 mns = {'marc': nsuri}
 # Un-prefixed (default namespace) map: used as the nsmap of the output
 # <collection> element.
 nonens = {None: nsuri}

 # NCSU keeps local control number in the 918$a; adjust as appropriate
 id_expr = "marc:datafield[@tag='918']/marc:subfield[@code='a']"

 # As above, you may need to adjust if you use the 949, e.g.
 item_expr = "marc:datafield[@tag='999']"


 def serialize(rec):
    """Serialize the element `rec` with etree.tostring, using UTF-8 encoding."""
    xml_out = etree.tostring(rec, encoding="utf-8")
    return xml_out


 def extract_id(rec):
    """Return the control number of `rec`, or None when it has none.

    The control number is the text of the first node matching `id_expr`
    (evaluated relative to `rec` with the 'marc:' prefix map).

    Returns None both when no node matches and when the matching node has
    no text, so callers can treat "no usable id" uniformly instead of
    getting an IndexError for a record that lacks the field.
    """
    matches = rec.xpath(id_expr, namespaces=mns)
    if not matches:
        return None
    return matches[0].text


 def merge_items(to_rec, from_rec):
    """Move the item fields of `from_rec` to the end of `to_rec`.

    Item fields are the children of `from_rec` selected by `item_expr`.
    Appending an lxml element to a new parent re-parents it, so the fields
    are removed from `from_rec` as a side effect.

    If `from_rec` has no matching fields, `to_rec` is left unchanged.
    """
    # item_expr uses the 'marc:' prefix, so it has to be evaluated with the
    # prefixed map; XPath has no default namespace and lxml rejects a
    # None-keyed namespace map.
    items = from_rec.xpath(item_expr, namespaces=mns)
    to_rec.extend(items)


 def main(infile, outfile):
    """Join runs of MARCXML records that share a control number.

    Streams <record> elements from `infile`, and writes a single
    <collection> document to `outfile`. Consecutive records with the same
    id (as returned by extract_id) are collapsed into the first record of
    the run: the item fields of the later records are moved onto it with
    merge_items. Records with the same id must therefore be adjacent in
    the input.

    Parameters:
        infile:  file (or path) to read MARCXML from.
        outfile: file (or path) to write the joined MARCXML to.

    Raises:
        ValueError: if extract_id returns None for a record.
    """
    current_id = None
    current_rec = None
    record_tag = "{%s}record" % nsuri
    with etree.xmlfile(outfile, encoding="utf-8", buffered=True) as xf:
        # The element() context manager only opens/closes the tag; it does
        # not hand back a writer, so every record is written through xf.
        with xf.element("{%s}collection" % nsuri, nsmap=nonens):
            for _evt, rec in etree.iterparse(infile, events=('end',),
                                             tag=record_tag):
                this_id = extract_id(rec)
                if this_id is None:
                    raise ValueError("missing catkey on record")
                if this_id != current_id:
                    # A new run starts: flush the finished record first.
                    if current_rec is not None:
                        xf.write(current_rec)
                    current_rec = rec
                    current_id = this_id
                else:
                    merge_items(current_rec, rec)
            # Flush the last run; nothing to write when the input was empty.
            if current_rec is not None:
                xf.write(current_rec)


 if __name__ == '__main__':
    # lxml's xmlfile writes encoded bytes, so hand over the binary layer of
    # the standard streams where one exists (Python 3); Python 2 streams have
    # no .buffer attribute and are passed through unchanged.
    main(getattr(sys.stdin, 'buffer', sys.stdin),
         getattr(sys.stdout, 'buffer', sys.stdout))
	#!/usr/bin/env python

	# Merges items spread over multiple MARC XML records
	# into a single record. Some ILSes will not export MARC21
	# that is "too large", and will output the same bibliographic record
	# multiple times.

	# usage: first, convert your marc21 to MARCXML (see marcrenaissance.sh gist)
	# ensure that all records with the same control number come out in a 'clump',
	# e.g. by sorting on control number

	# reads from STDIN and outputs to STDOUT, so, e.g.
	# ./item_joiner.py < multirecord.xml > joined.xml

	# reads from 'multirecord.xml' and outputs to `joined.xml`.

	# Check the id_expr and item_expr to make sure they're appropriate to your
	# records

	from lxml import etree
	import sys

	# Namespace URI bound to the MARCXML elements handled by this script.
	nsuri = "http://www.loc.gov/MARC21/slim"

	# Prefixed map: lets the XPath expressions below address elements as 'marc:...'.
	mns = {'marc': nsuri}
	# Un-prefixed (default namespace) map: used as the nsmap of the output
	# <collection> element.
	nonens = {None: nsuri}

	# NCSU keeps local control number in the 918$a; adjust as appropriate
	id_expr = "marc:datafield[@tag='918']/marc:subfield[@code='a']"

	# As above, you may need to adjust if you use the 949, e.g.
	item_expr = "marc:datafield[@tag='999']"


	def serialize(rec):
	    """Serialize the element `rec` with etree.tostring, using UTF-8 encoding."""
	    xml_out = etree.tostring(rec, encoding="utf-8")
	    return xml_out


	def extract_id(rec):
	    """Return the control number of `rec`, or None when it has none.

	    The control number is the text of the first node matching `id_expr`
	    (evaluated relative to `rec` with the 'marc:' prefix map).

	    Returns None both when no node matches and when the matching node has
	    no text, so callers can treat "no usable id" uniformly instead of
	    getting an IndexError for a record that lacks the field.
	    """
	    matches = rec.xpath(id_expr, namespaces=mns)
	    if not matches:
	        return None
	    return matches[0].text


	def merge_items(to_rec, from_rec):
	    """Move the item fields of `from_rec` to the end of `to_rec`.

	    Item fields are the children of `from_rec` selected by `item_expr`.
	    Appending an lxml element to a new parent re-parents it, so the fields
	    are removed from `from_rec` as a side effect.

	    If `from_rec` has no matching fields, `to_rec` is left unchanged.
	    """
	    # item_expr uses the 'marc:' prefix, so it has to be evaluated with the
	    # prefixed map; XPath has no default namespace and lxml rejects a
	    # None-keyed namespace map.
	    items = from_rec.xpath(item_expr, namespaces=mns)
	    to_rec.extend(items)


	def main(infile, outfile):
	    """Join runs of MARCXML records that share a control number.

	    Streams <record> elements from `infile`, and writes a single
	    <collection> document to `outfile`. Consecutive records with the same
	    id (as returned by extract_id) are collapsed into the first record of
	    the run: the item fields of the later records are moved onto it with
	    merge_items. Records with the same id must therefore be adjacent in
	    the input.

	    Parameters:
	        infile:  file (or path) to read MARCXML from.
	        outfile: file (or path) to write the joined MARCXML to.

	    Raises:
	        ValueError: if extract_id returns None for a record.
	    """
	    current_id = None
	    current_rec = None
	    record_tag = "{%s}record" % nsuri
	    with etree.xmlfile(outfile, encoding="utf-8", buffered=True) as xf:
	        # The element() context manager only opens/closes the tag; it does
	        # not hand back a writer, so every record is written through xf.
	        with xf.element("{%s}collection" % nsuri, nsmap=nonens):
	            for _evt, rec in etree.iterparse(infile, events=('end',),
	                                             tag=record_tag):
	                this_id = extract_id(rec)
	                if this_id is None:
	                    raise ValueError("missing catkey on record")
	                if this_id != current_id:
	                    # A new run starts: flush the finished record first.
	                    if current_rec is not None:
	                        xf.write(current_rec)
	                    current_rec = rec
	                    current_id = this_id
	                else:
	                    merge_items(current_rec, rec)
	            # Flush the last run; nothing to write when the input was empty.
	            if current_rec is not None:
	                xf.write(current_rec)


	if __name__ == '__main__':
	    # lxml's xmlfile writes encoded bytes, so hand over the binary layer of
	    # the standard streams where one exists (Python 3); Python 2 streams have
	    # no .buffer attribute and are passed through unchanged.
	    main(getattr(sys.stdin, 'buffer', sys.stdin),
	         getattr(sys.stdout, 'buffer', sys.stdout))