Skip to content

Instantly share code, notes, and snippets.

@kschlottmann
Created May 29, 2020 00:25
Show Gist options
  • Save kschlottmann/3f5de1c1f6ec44b7599e265ebb9e5083 to your computer and use it in GitHub Desktop.
Save kschlottmann/3f5de1c1f6ec44b7599e265ebb9e5083 to your computer and use it in GitHub Desktop.
This script will iterate over the entire CLIO corpus and write out every RBML record as an individual MARCXML file, using the bib ID as the filename.
import xmltodict, json
from timeit import default_timer as timer
import os
import sys
import datetime
#this script will iterate over the entire CLIO corpus and write out every RBML record as an individual marcxml file, using the bib id as the filename
#this function wraps the json in a dict with a record key, and casts it to an individual marcxml record
def write_marcxml_record(record, output_dir="C:\\Users\\kevin\\Desktop\\cul\\clioFull\\allRBML"):
    """Serialize one parsed record to ``<bib id>.xml`` inside *output_dir*.

    The record (the dict xmltodict produced for one ``record`` element) is
    wrapped under a single ``record`` key so that ``xmltodict.unparse`` has
    exactly one root element, then written out as UTF-8 XML.

    Args:
        record: parsed record; must contain a ``controlfield`` entry that
            includes the 001 field, whose text becomes the file name.
        output_dir: directory the file is created in. Defaults to the
            location the script has always written to.

    Returns:
        True once the file has been written.

    Raises:
        FileExistsError: if a file for this bib id already exists (the file
            is opened in exclusive-create mode so nothing is overwritten).
    """
    xmlrecord = xmltodict.unparse({'record': record}, encoding='utf-8', pretty=True)
    bib = parse001(search_tag('001', record["controlfield"]))
    # Build the full path instead of chdir-ing into the output directory and
    # back: changing the working directory is process-wide state and would
    # silently redirect every other relative path used by the script.
    filename_marcxml = os.path.join(output_dir, bib + '.xml')
    # The context manager closes the handle even if the write fails.
    with open(filename_marcxml, 'x', encoding='utf-8') as f:
        f.write(xmlrecord)
    return True
#this function takes an area of the MARC record (controlfield or datafield) and returns all elements with the matching tag
def search_tag(tagname, setOfFields):
    """Return the fields in *setOfFields* whose ``@tag`` equals *tagname*.

    Args:
        tagname: tag to look for, e.g. ``'001'`` or ``'852'``.
        setOfFields: the ``controlfield`` or ``datafield`` area of a parsed
            record. Normally a list of dicts; a single dict or ``None`` is
            also accepted.

    Returns:
        A list (possibly empty) of the matching field dicts, in input order.
    """
    if setOfFields is None:
        return []
    if isinstance(setOfFields, dict):
        # xmltodict represents an element that occurs only once as a bare
        # dict rather than a one-item list; iterating that dict would yield
        # its keys and break the '@tag' lookup below.
        setOfFields = [setOfFields]
    # .get() so a field without a tag attribute is skipped, not a KeyError.
    return [field for field in setOfFields if field.get('@tag') == tagname]
#this function checks for one of the RBML holdings
def check_holdings(oneOrMore852s):
    """Report whether any of the given 852 fields carries an RBML holding code.

    Every 852 passed in is inspected, not just the first one, and a subfield
    without text is skipped rather than ending the whole check.

    Args:
        oneOrMore852s: list of parsed 852 field dicts (a single dict or an
            empty/None value is tolerated). Each field's ``subfield`` entry
            may be a list of subfield dicts or a single dict.

    Returns:
        True if the text of any subfield contains one of the desired holding
        codes, otherwise False.
    """
    #all rbml
    desiredHoldings = ('bar,rare', 'clm', 'gax', 'glx,rare', 'oral', 'rbi', 'rbms', 'rbx', 'uacl')
    if isinstance(oneOrMore852s, dict):
        oneOrMore852s = [oneOrMore852s]
    for field in oneOrMore852s or ():
        subfields = field.get('subfield') if isinstance(field, dict) else None
        if subfields is None:
            continue
        if isinstance(subfields, dict):
            # A field with exactly one subfield is parsed as a bare dict.
            subfields = [subfields]
        for subfield in subfields:
            text = subfield.get('#text') if isinstance(subfield, dict) else subfield
            if not isinstance(text, str):
                # Empty subfield: nothing to match, keep looking.
                continue
            # Substring match against every subfield's text, as before.
            # NOTE(review): this can also hit unrelated text (e.g. a call
            # number containing "oral") - confirm whether only the location
            # subfield should be examined.
            if any(code in text for code in desiredHoldings):
                return True
    return False
#grab the bib from the 001
def parse001(recordSection):
    """Return the text of the first field in *recordSection*.

    Used to pull the bib id out of the list of 001 control fields.

    Args:
        recordSection: list of parsed field dicts, each holding its text
            under the ``#text`` key.

    Returns:
        The ``#text`` value of the first field.

    Raises:
        IndexError: if *recordSection* is empty (no 001 field was found).
        KeyError: if the first field has no ``#text`` entry.
    """
    if not recordSection:
        # Same exception type as plain indexing would raise, but with a
        # message that says what is actually missing.
        raise IndexError("record has no 001 control field to take the bib id from")
    return recordSection[0]['#text']
#this is the function that selects only RBML records
def record_handling(_, record):
    """xmltodict item callback: write *record* out if it has an RBML holding.

    Args:
        _: the element path xmltodict passes to the callback (unused).
        record: the parsed record dict.

    Returns:
        Always True. xmltodict interrupts parsing as soon as the callback
        returns a falsy value, so True is returned whether or not the record
        was selected.
    """
    # A record without any datafield simply has no holdings to check.
    holdings = search_tag('852', record.get("datafield") or [])
    # Skip the holdings check entirely when there is no 852 at all.
    if holdings and check_holdings(holdings):
        write_marcxml_record(record)
    return True
def main(file):
    """Stream-parse one XML file and hand each record to ``record_handling``.

    Args:
        file: an open file object for the XML to process (the script opens
            these in binary mode).
    """
    #xmltodict will stream each element at the 2nd level of the XML file, in this case collection/record
    # All the work happens in the callback, so the parse result is not kept.
    xmltodict.parse(file, item_depth=2, item_callback=record_handling)
#for the full run, iterate across all data files
# Guarded so that importing this module does not start a full run.
if __name__ == "__main__":
    # Sorted so the progress log follows a predictable order.
    filenames = sorted(os.listdir("data"))
    start = timer()
    print ("Running...")
    for filename in filenames:
        # Opened in binary mode (see the testing note below); the context
        # manager closes each input file before the next one is opened.
        with open(os.path.join("data", filename), 'rb') as xmlfile:
            print(filename)
            print(datetime.datetime.now())
            main(xmlfile)
    end = timer()
    print(end - start) # Time in seconds, e.g. 5.38091952400282
'''
#for testing
##define file or files to run across
#filename = 'data/extract-042_cleaned.xml'
#open the file, IN BINARY MODE see martinblech/xmltodict/issues/77
xmlfile = open(filename, 'rb')
main(xmlfile)
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment