Last active
January 10, 2024 16:20
-
-
Save bertsky/76365fc92d7476218a5d12549c83a840 to your computer and use it in GitHub Desktop.
dump METS files from an OAI harvest (metha-cat output after running metha-sync), with recursive METS downloads for multipart works
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
from lxml import etree as ET | |
from ocrd_models.constants import NAMESPACES | |
NAMESPACES['oai'] = "http://www.openarchives.org/OAI/2.0/"
for prefix, uri in NAMESPACES.items():
    ET.register_namespace(prefix, uri)
# A full harvest dump can be too large for one-shot in-memory parsing
# (e.g. tree.parse(sys.stdin, parser=ET.ETCompatXMLParser(...)) followed by
# root.iterfind('oai:record', NAMESPACES)),
# so use incremental parsing (iterparse + fast_iter) instead:
def fast_iter(context, func, *args, **kwargs):
    """Iterate an lxml iterparse *context*, calling *func* on each element.

    Keeps memory bounded by clearing each element after processing and
    pruning already-seen siblings from every ancestor, so the partial tree
    never grows.

    Based on Liza Daly's fast_iter:
    http://lxml.de/parsing.html#modifying-the-tree
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm
    """
    for _event, element in context:
        func(element, *args, **kwargs)
        # Safe to clear here: func never needs descendants afterwards.
        element.clear()
        # Drop now-empty preceding siblings along the ancestor chain so the
        # root does not accumulate references to processed elements.
        for ancestor in element.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
    del context
def write_mets(mets, name):
    """Serialize *mets* (an lxml element or tree) to the file ``<name>.xml``.

    The document is pretty-printed and UTF-8 encoded.
    """
    # Use a distinct name for the serialized bytes instead of rebinding the
    # `mets` parameter, and avoid shadowing the builtin `file`.
    data = ET.tostring(mets, pretty_print=True, encoding='utf-8')
    with open(name + '.xml', 'wb') as out:
        out.write(data)
def download_mets(url):
    """Fetch the METS document at *url* and return it as a parsed lxml element.

    Retries up to 5 times on (hopefully) transient HTTP status codes and
    raises requests.HTTPError for any final non-success response.
    """
    # Bug fix: Session/Retry/HTTPAdapter were used but never imported anywhere
    # in the file, so the first call raised NameError. Import locally to keep
    # the fix self-contained (requests is required at runtime anyway).
    from requests import Session
    from requests.adapters import HTTPAdapter, Retry
    session = Session()
    retries = Retry(total=5, status_forcelist=[
        # only transient failures (probably too wide):
        408, 409, 412, 417, 423, 424, 425, 426, 428, 429, 440, 500, 503, 504, 509, 529, 598, 599])
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    response = session.get(url, timeout=3)
    response.raise_for_status()
    return ET.fromstring(response.content)
def dive_mets(mets, oai_id, level):
    """Write *mets* to ``<oai_id>.xml``, recursing into multipart works.

    If the deepest logical structMap divs carry mets:mptr pointers, download
    each referenced sub-METS and descend (multivolume works, periodicals,
    newspapers); otherwise serialize the current document as-is.

    Raises an Exception when recursion exceeds 2 levels.
    """
    # NOTE: mets may be an ElementTree (from process_record) or an Element
    # (from download_mets); truth-testing only catches empty Elements here.
    if not mets:
        print("empty METS result for %s" % oai_id, file=sys.stderr)
        return
    if level > 2:
        # Bug fix: the id was passed as a second argument ("... %s", oai_id)
        # and never interpolated into the message.
        raise Exception("unexpected depth of METS recursion for %s" % oai_id)
    # assert oai_id == mets.find('./mets:dmdSec/mets:mdWrap/mets:xmlData/mods:mods/mods:recordInfo/mods:recordIdentifier[@source="http://digital.slub-dresden.de/oai/"]', namespaces=NAMESPACES).text
    logmap = mets.find('./mets:structMap[@TYPE="LOGICAL"]', namespaces=NAMESPACES)
    # as long as the deepest mets:div have mets:mptr, go recursive on them (up to 2 times)
    mptrs = logmap.xpath('.//mets:div[not(mets:div)]/mets:mptr/@xlink:href', namespaces=NAMESPACES)
    if len(mptrs):
        # a multipart anchor record should not carry page images itself
        assert mets.find('./mets:fileSec/mets:fileGrp[@USE="DEFAULT"]', namespaces=NAMESPACES) is None, mptrs
        for url in mptrs:
            mets = download_mets(url)
            if not len(mets):
                print("empty METS result for %s" % url, file=sys.stderr)
                continue
            oai_id = mets.find('./mets:dmdSec/mets:mdWrap[@MDTYPE="MODS"]/mets:xmlData/mods:mods/mods:recordInfo/mods:recordIdentifier[@source="http://digital.slub-dresden.de/oai/"]', namespaces=NAMESPACES).text
            print("recursive %s" % oai_id)
            dive_mets(mets, oai_id, level + 1)
    else:
        write_mets(mets, oai_id)
def process_record(record):
    """Handle one OAI record: pull its identifier and embedded METS document,
    then write it out (or recursively resolve multipart works)."""
    identifier = record.find('oai:header/oai:identifier', NAMESPACES).text
    print("processing %s" % identifier)
    embedded = record.find('oai:metadata/mets:mets', NAMESPACES)
    dive_mets(ET.ElementTree(embedded), identifier, 0)
# Process each harvest dump named on the command line, or stdin when none given.
files = sys.argv[1:] or [sys.stdin.buffer]
for source in files:
    context = ET.iterparse(source,
                           encoding='utf-8',
                           recover=True,
                           tag='{%s}record' % NAMESPACES['oai'])
    fast_iter(context, process_record)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
now correctly handles the recursive case: if the deepest mets:div entries
carry a mets:mptr, then instead of writing the current METS, download each
METS referenced by an @xlink:href and continue with that —
multivolume_work → multivolume_work/volume,
periodical → periodical/volume,
newspaper → newspaper/year → newspaper/month/day/issue