Created
May 29, 2020 00:25
-
-
Save kschlottmann/3f5de1c1f6ec44b7599e265ebb9e5083 to your computer and use it in GitHub Desktop.
This script will iterate over the entire CLIO corpus and return all RBML records as individual MARCXML records, with the bib ID as the filename.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xmltodict, json | |
from timeit import default_timer as timer | |
import os | |
import sys | |
import datetime | |
#this script will iterate over the entire CLIO corpus and return all RBML records as individual marcxml records with bib id as filename
#this function wraps the json in a dict with a record key, and casts it to an individual marcxml record | |
# Default destination for the per-record MARCXML files.
_DEFAULT_MARCXML_DIR = "C:\\Users\\kevin\\Desktop\\cul\\clioFull\\allRBML"


def write_marcxml_record(record, output_dir=_DEFAULT_MARCXML_DIR):
    """Write one parsed MARC record to ``<bib id>.xml`` as standalone MARCXML.

    The record dict is wrapped under a top-level ``record`` key so the
    serialized document has a single ``<record>`` root element. The file name
    is the bib id read from the record's 001 control field.

    Args:
        record: One record dict as delivered by ``xmltodict``; it must have a
            ``controlfield`` entry that includes the 001 field.
        output_dir: Directory the file is written into. Defaults to the
            script's original output location.

    Returns:
        True once the file has been written.

    Raises:
        FileExistsError: If a file for this bib id already exists; the file
            is opened in exclusive-creation mode (``'x'``) so an existing
            record is never overwritten.
    """
    xmlrecord = xmltodict.unparse({'record': record}, encoding='utf-8', pretty=True)
    bib = parse001(search_tag('001', record["controlfield"]))
    # Build the full path rather than os.chdir()-ing into the output directory
    # and back: changing the process-wide working directory silently redirects
    # every relative path used elsewhere, and leaves the process stranded in
    # the output directory if open() or write() raises.
    filename_marcxml = os.path.join(output_dir, bib + '.xml')
    # The context manager closes the handle even when the write fails.
    with open(filename_marcxml, 'x', encoding='utf-8') as f:
        f.write(xmlrecord)
    return True
#this function takes an area of the MARC record (controlfield or datafield) and returns all elements with the matching tag | |
def search_tag(tagname, setOfFields):
    """Return every field in *setOfFields* whose ``@tag`` equals *tagname*.

    Args:
        tagname: MARC tag to look for, e.g. ``'001'`` or ``'852'``.
        setOfFields: One area of a record (``controlfield`` or ``datafield``):
            a list of field dicts, a single field dict, or None.

    Returns:
        A list (possibly empty) of the matching field dicts, in input order.
    """
    if setOfFields is None:
        return []
    # xmltodict produces a bare dict, not a one-element list, when an element
    # occurs only once under its parent. Iterating that dict would yield its
    # key strings and make the '@tag' lookup raise TypeError.
    if isinstance(setOfFields, dict):
        setOfFields = [setOfFields]
    return [
        element
        for element in setOfFields
        if isinstance(element, dict) and element.get('@tag') == tagname
    ]
#this function checks for one of the RBML holdings | |
def check_holdings(oneOrMore852s):
    """Report whether any 852 field mentions one of the RBML holdings codes.

    Every subfield of every supplied 852 field is examined. A field matches
    when the text of one of its subfields contains one of the desired codes
    as a substring (case-sensitive).

    Args:
        oneOrMore852s: The record's 852 fields: a list of field dicts, a
            single field dict, or an empty list / None when there are none.

    Returns:
        True if at least one subfield text contains a desired code,
        otherwise False.
    """
    # all rbml
    desiredHoldings = ('bar,rare', 'clm', 'gax', 'glx,rare', 'oral', 'rbms', 'rbx', 'uacl', 'rbi')

    if not oneOrMore852s:
        # No 852 at all: nothing can match (previously an IndexError).
        return False
    if isinstance(oneOrMore852s, dict):
        oneOrMore852s = [oneOrMore852s]

    # Look at every 852, not only the first one, so a record whose RBML
    # holding is listed after another location is still selected.
    for field in oneOrMore852s:
        if not isinstance(field, dict):
            continue
        subfields = field.get('subfield')
        if subfields is None:
            continue
        # A field with a single subfield is parsed as a bare value, not a list.
        if not isinstance(subfields, list):
            subfields = [subfields]
        for subfield in subfields:
            if not isinstance(subfield, dict):
                continue
            text = subfield.get('#text')
            # Skip empty subfields instead of abandoning the whole record.
            if not isinstance(text, str):
                continue
            if any(code in text for code in desiredHoldings):
                return True
    return False
#grab the bib from the 001 | |
def parse001(recordSection):
    """Return the ``#text`` value of the first field in *recordSection*.

    Intended to be given the list of 001 control fields, in which case the
    value returned is the record's bib id.
    """
    first_field = recordSection[0]
    bib_id = first_field['#text']
    return bib_id
#this is the function that selects only RBML records | |
def record_handling(_, record):
    """xmltodict streaming callback: write the record out if it is an RBML record.

    A record is selected when ``check_holdings`` accepts its 852 data fields;
    selected records are passed to ``write_marcxml_record``.

    Args:
        _: The element path supplied by xmltodict (unused).
        record: The parsed item at the streaming depth.

    Returns:
        Always True. xmltodict stops parsing when the callback returns a
        falsy value, so True is returned whether or not the record was
        written.
    """
    # Anything that is not a parsed element with children cannot hold an 852.
    if not isinstance(record, dict):
        return True
    # A record without data fields, or without any 852, is skipped rather
    # than allowed to raise and abort the whole streaming run.
    datafields = record.get("datafield") or []
    holdings_fields = search_tag('852', datafields)
    if holdings_fields and check_holdings(holdings_fields):
        write_marcxml_record(record)
    return True
def main(file):
    """Stream-parse one MARCXML input and run ``record_handling`` on each record.

    Args:
        file: The XML input, passed straight through to ``xmltodict.parse``.
    """
    # With item_depth=2 xmltodict streams every element found at the second
    # level of the document (collection/record here) to the callback.
    xmltodict.parse(file, item_callback=record_handling, item_depth=2)
#for the full run, iterate across all data files
# Resolve the data directory once, up front, so the listing and every later
# open() refer to the same directory even if the working directory changes
# while records are being written.
data_dir = os.path.abspath("data")
filenames = os.listdir(data_dir)
start = timer()
print ("Running...")
for filename in filenames:
    # Open in binary mode (see martinblech/xmltodict/issues/77); the context
    # manager closes each input file as soon as it has been processed.
    with open(os.path.join(data_dir, filename), 'rb') as xmlfile:
        print(filename)
        print(datetime.datetime.now())
        main(xmlfile)
end = timer()
print(end - start) # Time in seconds, e.g. 5.38091952400282
'''
#for testing
##define file or files to run across
#filename = 'data/extract-042_cleaned.xml'
#open the file, IN BINARY MODE see martinblech/xmltodict/issues/77
xmlfile = open(filename, 'rb')
main(xmlfile)
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment