Skip to content

Instantly share code, notes, and snippets.

@kschlottmann
Last active May 24, 2020 12:54
Show Gist options
  • Save kschlottmann/bdc37d64687fcf3814c97861352e9717 to your computer and use it in GitHub Desktop.
Save kschlottmann/bdc37d64687fcf3814c97861352e9717 to your computer and use it in GitHub Desktop.
Iterates over many large MARCXML collection files, and pulls out certain fields using xmltodict streaming, based on matching certain holdings
import contextlib
import datetime
import json
import os
import sys
from timeit import default_timer as timer

import xmltodict
#this script will iterate over the entire CLIO corpus and return the leaders, 245 $a values, and 035s for all RBML records
#debugging helper: dumps a whole record to stdout
def handle_record(_, record):
    """Print ``record`` and return True.

    Takes the same two positional arguments as ``record_handling``; the
    first one is ignored. Nothing in the visible code calls this function.
    """
    print(record)
    keep_going = True
    return keep_going
#this function takes an area of the MARC record (controlfield or datafield) and returns all elements with the matching tag
def search_tag(tagname, setOfFields):
    """Return every field in ``setOfFields`` whose ``@tag`` equals ``tagname``.

    Args:
        tagname: tag to look for, e.g. ``'852'``.
        setOfFields: the ``controlfield`` or ``datafield`` value of a parsed
            record. Usually a list of dicts, but a single dict (the element
            occurred only once) and ``None`` are accepted as well.

    Returns:
        A list of the matching field dicts, in document order; empty when
        nothing matches.
    """
    if setOfFields is None:
        return []
    #xmltodict hands back a bare dict instead of a one-item list when an
    #element is not repeated; iterating that dict would walk its keys
    if isinstance(setOfFields, dict):
        setOfFields = [setOfFields]
    return [element for element in setOfFields if element.get('@tag') == tagname]
#this function checks for one of the RBML holdings
def check_holdings(oneOrMore852s):
    """Report whether the first 852 field mentions one of the desired holdings.

    Args:
        oneOrMore852s: list of 852 field dicts, as produced by ``search_tag``.
            Only the first entry is inspected.

    Returns:
        True when the text of any subfield of that first 852 contains one of
        the ``desiredHoldings`` codes (substring match); False otherwise,
        including when the list is empty or the field has no subfields.
    """
    #all rbml
    #desiredHoldings = ['bar,rare','clm', 'gax', 'glx,rare', 'oral', 'rbms', 'rbx', 'uacl', 'oral', 'rbi', 'rbx']
    #rbml archival only
    desiredHoldings = ['clm', 'oral', 'rbms', 'uacl']
    #all archival
    #desiredHoldings = ['clm', 'oral', 'rbms', 'uacl']
    if not oneOrMore852s:
        return False
    subfields = oneOrMore852s[0].get('subfield')
    if subfields is None:
        return False
    #a field with exactly one subfield is parsed as a bare dict, not a list
    if not isinstance(subfields, list):
        subfields = [subfields]
    for subfield in subfields:
        text = subfield.get('#text') if isinstance(subfield, dict) else subfield
        #a subfield without text cannot match, but later subfields still can
        if not isinstance(text, str):
            continue
        if any(code in text for code in desiredHoldings):
            return True
    return False
def parse245(recordSection):
    """Return the title text from the first 245 field.

    Args:
        recordSection: list of 245 field dicts, as produced by ``search_tag``.

    Returns:
        ``True`` as a placeholder when there is no 245 at all; otherwise the
        text of subfield ``a``. When the field has no subfield coded ``a``
        the first subfield's text is used instead, and a subfield without
        text yields an empty string.
    """
    if not recordSection:
        return True
    subfields = recordSection[0].get('subfield')
    if subfields is None:
        return ''
    #a lone subfield arrives as a dict rather than a one-item list
    if not isinstance(subfields, list):
        subfields = [subfields]
    if not subfields:
        return ''
    #the first subfield is not guaranteed to be $a (another subfield, such as
    #a $6 linkage, may come first), so look the code up explicitly
    chosen = next((sf for sf in subfields if sf.get('@code') == 'a'), subfields[0])
    return str(chosen.get('#text', ''))
def parse035(recordSection):
    """Scan the 035 fields for a CULASPC identifier.

    Returns:
        ``True`` when ``recordSection`` is empty; ``"Check 035"`` as soon as
        a field with several subfields is met; the subfield text of the first
        single-subfield field containing ``"CULASPC"``; ``None`` when the
        fields run out without either happening.
    """
    if not len(recordSection):
        return True
    for field in recordSection:
        subfield = field['subfield']
        #several subfields: not parsed here, flagged for a manual look
        #NOTE(review): this fires before any later 035 is examined, so the
        #result depends on field order - confirm that is intended
        if isinstance(subfield, list):
            return "Check 035"
        text = subfield['#text']
        #test for CULASPC in the 035, to set Aspace flag
        if "CULASPC" in text:
            return text
    return None
def parse001(recordSection):
    """Return the text of the first 001 control field.

    Args:
        recordSection: list of 001 field dicts, as produced by ``search_tag``.

    Returns:
        The ``#text`` of the first entry, or ``True`` when the list is empty.
        ``True`` is the same "field missing" placeholder that ``parse245``
        and ``parse035`` return, so a record without an 001 no longer raises
        IndexError.
    """
    if not recordSection:
        return True
    return recordSection[0]['#text']
#this is the main function
def record_handling(_, record):
    """Print one pipe-delimited output row for a record with a wanted holding.

    Passed to ``xmltodict.parse`` as the ``item_callback`` by ``main``.

    Args:
        _: first callback argument; unused.
        record: parsed record dict; ``datafield``, ``controlfield`` and
            ``leader`` are read from it.

    Returns:
        Always True, whether or not a row was printed.

    Note:
        The last column is the module-level name ``filename``, which the
        driver loop at the bottom of the file assigns before calling ``main``.
    """
    holdings = search_tag('852', record["datafield"])
    #a record with no 852 at all cannot match; test for that first so
    #check_holdings is never asked to index into an empty list
    if holdings and check_holdings(holdings):
        #format output: bib | leader | 245 | test for CULASPC | source file
        print(
            parse001(search_tag('001', record["controlfield"])), "|",
            str(record["leader"]), "|",
            parse245(search_tag('245', record["datafield"])), "|",
            parse035(search_tag('035', record["datafield"])), "|",
            filename,
        )
    return True
def main(file):
    """Stream-parse one XML file, sending each record to ``record_handling``.

    Args:
        file: the XML input handed straight to ``xmltodict.parse``; the
            driver loop passes a file object opened in binary mode.
    """
    #xmltodict will stream each element at the 2nd level of the XML file, in this case collection/record
    #all the work happens in the callback, so the return value of parse()
    #is deliberately not kept
    xmltodict.parse(file, item_depth=2, item_callback=record_handling)
#for the full run:
#NOTE: `filename` has to stay a module-level name - record_handling() reads it
#when it prints the last column of each row
#sorted so the files are processed in the same order on every run
filenames = sorted(os.listdir("data"))
start = timer()
print ("Running...")
for filename in filenames:
    #the with-blocks close both files and restore stdout even if parsing fails
    with open(os.path.join("data", filename), 'rb') as xmlfile:
        print(filename)
        print(datetime.datetime.now())
        #redirect stdout to utf-8 file, to avoid encoding issues when writing the output
        with open("results.txt", "a", encoding="utf-8") as f, contextlib.redirect_stdout(f):
            main(xmlfile)
end = timer()
print(end - start) # Time in seconds, e.g. 5.38091952400282
'''
#for testing
##define file or files to run across
#filename = 'data/extract-042_cleaned.xml'
filename = 'data/extract-021.xml'
#filename = 'cleaningTesting/cleaned-083.xml'
#filename = 'cleaned-004.xml'
#open the file, IN BINARY MODE see martinblech/xmltodict/issues/77
xmlfile = open(filename, 'rb')
main(xmlfile)
'''
#!/bin/bash
# For every cleaning/*.xml file, write a copy named <name>_cleaned.xml that
# keeps only tab, LF, CR and the bytes in the octal range 40-176; tr -cd
# deletes everything else.
FILES=(cleaning/*.xml)
for f in "${FILES[@]}"
do
  # when nothing matches, the glob pattern itself is left in the array; skip it
  [ -e "$f" ] || continue
  echo "Processing $f file..."
  # take action on each file. $f store current file name
  # quoted so paths containing whitespace stay a single argument
  tr -cd '\11\12\15\40-\176' < "$f" > "${f%.xml}_cleaned.xml"
done
#!/bin/bash
# Run xmllint over every regular file in data/ with --noout and --stream
# (presumably a well-formedness check before the Python run - confirm).
FILES=(data/*)
for f in "${FILES[@]}"
do
  # skip sub-directories and the literal pattern left by an unmatched glob
  [ -f "$f" ] || continue
  echo "Processing $f file..."
  # take action on each file. $f store current file name
  # quoted so paths containing whitespace stay a single argument
  xmllint --noout --stream "$f"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment