Last active
May 24, 2020 12:54
-
-
Save kschlottmann/bdc37d64687fcf3814c97861352e9717 to your computer and use it in GitHub Desktop.
Iterates over many large MARCXML collection files, and pulls out certain fields using xmltodict streaming, based on matching certain holdings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xmltodict, json | |
from timeit import default_timer as timer | |
import os | |
import sys | |
import datetime | |
#this script will iterate over the entire CLIO corpus and return the leaders, 245 $a values, and 035s for all RBML records
def handle_record(_, record):
    """Print one record and return True.

    Takes the same two positional arguments as record_handling; the first
    one is ignored. NOTE(review): nothing in this file references this
    function -- it looks like a debugging stand-in for record_handling.
    """
    print(record)
    keep_going = True
    return keep_going
def search_tag(tagname, setOfFields):
    """Return every field in one area of a MARC record whose tag matches.

    Args:
        tagname: MARC tag to look for, e.g. ``'852'``.
        setOfFields: the record's ``controlfield`` or ``datafield`` value as
            produced by xmltodict. That is a list of dicts when the element
            repeats but a single dict when it occurs only once. None is
            treated as "no fields".

    Returns:
        A list (possibly empty) of the field dicts whose ``'@tag'`` equals
        ``tagname``.
    """
    if setOfFields is None:
        return []
    if isinstance(setOfFields, dict):
        # A lone element is not wrapped in a list by xmltodict; iterating it
        # directly would walk the dict's keys instead of the fields.
        setOfFields = [setOfFields]
    return [
        element
        for element in setOfFields
        # skip anything that is not a tagged field instead of raising
        if isinstance(element, dict)
        and '@tag' in element
        and element['@tag'] == tagname
    ]
def check_holdings(oneOrMore852s):
    """Report whether any 852 field carries one of the desired holdings codes.

    Args:
        oneOrMore852s: list of 852 field dicts (as returned by search_tag).
            May be empty.

    Returns:
        True if the text of any subfield of any of the 852 fields contains
        one of the codes in ``desiredHoldings`` (substring match), otherwise
        False.
    """
    #all rbml
    #desiredHoldings = ['bar,rare','clm', 'gax', 'glx,rare', 'oral', 'rbms', 'rbx', 'uacl', 'oral', 'rbi', 'rbx']
    #rbml archival only
    desiredHoldings = ['clm', 'oral', 'rbms', 'uacl']
    #all archival
    #desiredHoldings = ['clm', 'oral', 'rbms', 'uacl']
    # Look at every 852, not just the first: a record can have several
    # holdings and the wanted one is not necessarily listed first.
    for field in oneOrMore852s:
        if not isinstance(field, dict):
            continue
        subfields = field.get('subfield')
        if subfields is None:
            continue
        if not isinstance(subfields, list):
            # a single subfield arrives as a bare dict rather than a list
            subfields = [subfields]
        for subfield in subfields:
            if isinstance(subfield, dict):
                text = subfield.get('#text')
            else:
                text = subfield
            if not isinstance(text, str):
                # a subfield without text must not end the search early
                continue
            if any(code in text for code in desiredHoldings):
                return True
    return False
def parse245(recordSection):
    """Return the $a text of the first 245 field.

    Args:
        recordSection: list of 245 field dicts (as returned by search_tag).

    Returns:
        True when there is no 245 at all (kept as-is so existing output does
        not change). Otherwise the text of the first 245's subfield with
        code ``a``; if no subfield is coded ``a``, the text of the first
        subfield. An absent text yields ``''``.
    """
    if not recordSection:
        return True
    subfields = recordSection[0].get('subfield')
    if subfields is None:
        return ''
    #multiple subfields arrive as a list, a single subfield as a bare dict
    if not isinstance(subfields, list):
        subfields = [subfields]
    if not subfields:
        return ''
    # Select $a by its code rather than by position: the first subfield of a
    # 245 is not guaranteed to be $a.
    chosen = subfields[0]
    for subfield in subfields:
        if isinstance(subfield, dict) and subfield.get('@code') == 'a':
            chosen = subfield
            break
    text = chosen.get('#text') if isinstance(chosen, dict) else chosen
    return '' if text is None else str(text)
def parse035(recordSection):
    """Extract the CULASPC identifier from a record's 035 fields.

    Args:
        recordSection: list of 035 field dicts (as returned by search_tag).

    Returns:
        * True when there are no 035 fields (kept as-is so existing output
          does not change).
        * The subfield text of the first single-subfield 035 that contains
          "CULASPC".
        * "Check 035" when no such field exists but at least one 035 has
          multiple subfields and needs a manual look.
        * None otherwise.
    """
    if not recordSection:
        return True
    needsManualCheck = False
    for d in recordSection:
        subfield = d.get('subfield') if isinstance(d, dict) else None
        #test for case where 035 has multiple subfields
        if isinstance(subfield, list):
            # Remember it, but keep scanning: a clean CULASPC match in
            # another 035 should win regardless of field order.
            needsManualCheck = True
            continue
        text = subfield.get('#text') if isinstance(subfield, dict) else subfield
        #test for CULASPC in the 035, to set Aspace flag
        if isinstance(text, str) and "CULASPC" in text:
            return text
    if needsManualCheck:
        return "Check 035"
    return None
def parse001(recordSection):
    """Return the text of the first 001 control field.

    recordSection is the list of 001 field dicts; the caller prints the
    result as the "bib" column. Raises IndexError when the list is empty.
    """
    first_field = recordSection[0]
    return first_field['#text']
def record_handling(_, record):
    """Per-record callback: print one output row for records with wanted holdings.

    Records whose 852 fields do not pass check_holdings are skipped. For the
    rest, one row is printed in the format
    bib | leader | 245 | test for CULASPC | filename,
    where filename is read from the module-level variable of that name.
    Always returns True, whether or not a row was printed.
    """
    datafields = record["datafield"]
    if not check_holdings(search_tag('852', datafields)):
        return True

    bib = parse001(search_tag('001', record["controlfield"]))
    leader = str(record["leader"])
    title = parse245(search_tag('245', record["datafield"]))
    aspace_flag = parse035(search_tag('035', record["datafield"]))
    print(bib, "|", leader, "|", title, "|", aspace_flag, "|", filename)
    return True
def main(file):
    """Stream-parse one XML file, passing each record to record_handling.

    file is whatever xmltodict.parse accepts; the driver below passes a file
    object opened in binary mode.
    """
    # item_depth=2 makes xmltodict stream each element at the 2nd level of
    # the XML file, in this case collection/record; the return value of
    # parse() is not needed.
    xmltodict.parse(file, item_depth=2, item_callback=record_handling)
#for the full run:
# sorted() gives a reproducible processing order; os.listdir() returns
# entries in arbitrary order.
filenames = sorted(os.listdir("data"))
start = timer()
print("Running...")
# NOTE: the loop variable must stay named `filename` at module level, because
# record_handling reads it as a global when it prints each output row.
for filename in filenames:
    # progress goes to the real stdout, before the redirect below
    print(filename)
    print(datetime.datetime.now())
    # open the input IN BINARY MODE, see martinblech/xmltodict/issues/77
    with open(os.path.join("data", filename), 'rb') as xmlfile:
        #redirect stdout to a utf-8 file, to avoid encoding issues when writing the output
        with open("results.txt", "a", encoding="utf-8") as f:
            original = sys.stdout
            sys.stdout = f
            try:
                main(xmlfile)
            finally:
                # always restore stdout, even if parsing one file fails
                sys.stdout = original
end = timer()
print(end - start)  # Time in seconds, e.g. 5.38091952400282
'''
#for testing
##define file or files to run across
#filename = 'data/extract-042_cleaned.xml'
filename = 'data/extract-021.xml'
#filename = 'cleaningTesting/cleaned-083.xml'
#filename = 'cleaned-004.xml'
#open the file, IN BINARY MODE see martinblech/xmltodict/issues/77
xmlfile = open(filename, 'rb')
main(xmlfile)
'''
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Strip every byte that is not tab, LF, CR or printable ASCII from each
# cleaning/*.xml file, writing the result next to it as <name>_cleaned.xml.
for f in cleaning/*.xml
do
    # an unmatched glob is left as the literal pattern; skip it
    [ -e "$f" ] || continue
    # do not re-clean the output of an earlier run
    case "$f" in
        *_cleaned.xml) continue ;;
    esac
    echo "Processing $f file..."
    # take action on each file. $f stores the current file name
    # kept bytes (octal): \11 tab, \12 LF, \15 CR, \40-\176 space through ~
    tr -cd '\11\12\15\40-\176' < "$f" > "${f%.xml}_cleaned.xml"
done
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Run xmllint over every file in data/ so parse errors are reported per file.
for f in data/*
do
    # an unmatched glob is left as the literal pattern; skip it
    [ -e "$f" ] || continue
    echo "Processing $f file..."
    # take action on each file. $f stores the current file name
    # --noout: report errors only; --stream: use xmllint's streaming API
    xmllint --noout --stream "$f"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment