kschlottmann · May 29, 2020 00:25
diff --git a/allRbml.py b/allRbml.py
 import xmltodict, json
 from timeit import default_timer as timer
 import os 
 import sys
 import datetime

 #this script will iterate over the entire CLIO corpus and return all RBML records as individual marcxml records with bib id as filename

 #this function wraps the json in a dict with a record key, and casts it to an individual marcxml record
 def write_marcxml_record(record):
     """Write one parsed record to its own marcxml file named after its bib id.

     The record is wrapped in a dict under a 'record' key, serialised with
     xmltodict.unparse, and written to '<bib id>.xml' inside the allRBML
     output directory.

     Args:
         record: the dict for one record; its "controlfield" section is
             searched for the 001 field that supplies the file name.

     Returns:
         True once the file has been written.

     Raises:
         FileExistsError: if a file for this bib id already exists (the file
             is opened in exclusive-create mode).
     """
     xmlrecord = xmltodict.unparse({'record': record}, encoding='utf-8', pretty=True)
     bib = parse001(search_tag('001', record["controlfield"]))
     filename_marcxml = bib + '.xml'
     os.chdir("C:\\Users\\kevin\\Desktop\\cul\\clioFull\\allRBML")
     try:
         #the with-block closes the file even when the write fails
         with open(filename_marcxml, 'x', encoding='utf-8') as f:
             f.write(xmlrecord)
     finally:
         #always step back out so later relative paths keep resolving
         os.chdir("C:\\Users\\kevin\\Desktop\\cul\\clioFull")
     return True

 #this function takes an area of the MARC record (controlfield or datafield) and returns all elements with the matching tag
 def search_tag(tagname, setOfFields):
     """Return every field in setOfFields whose '@tag' equals tagname.

     Args:
         tagname: the tag to look for, e.g. '001' or '852'.
         setOfFields: the controlfield or datafield section of a record,
             either a list of field dicts or a single field dict.

     Returns:
         A list (possibly empty) of the matching field dicts.
     """
     #xmltodict hands back a bare dict when the section holds only one field
     if isinstance(setOfFields, dict):
         setOfFields = [setOfFields]
     return [element for element in setOfFields if element['@tag'] == tagname]

 #this function checks for one of the RBML holdings
 def check_holdings(oneOrMore852s):
     """Report whether the first 852 field carries one of the RBML holding codes.

     Args:
         oneOrMore852s: list of 852 field dicts; only the first entry is
             inspected.
             NOTE(review): later 852s are ignored, as before -- confirm that
             this is intended.

     Returns:
         True if the text of any subfield of that field contains one of the
         desired holding codes (substring match), otherwise False.
     """
     #all rbml
     desiredHoldings = ('bar,rare', 'clm', 'gax', 'glx,rare', 'oral', 'rbms', 'rbi', 'rbx', 'uacl')
     #a record without any 852 has nothing to match
     if not oneOrMore852s:
         return False
     subfields = oneOrMore852s[0].get('subfield', [])
     #a lone subfield arrives as a single dict rather than a list of dicts
     if isinstance(subfields, dict):
         subfields = [subfields]
     for subfield in subfields:
         text = subfield.get('#text') if isinstance(subfield, dict) else subfield
         #skip subfields without text instead of giving up on the whole field
         if not isinstance(text, str):
             continue
         if any(code in text for code in desiredHoldings):
             return True
     return False

 #grab the bib from the 001
 def parse001(recordSection):
     """Return the '#text' value of the first field in recordSection (the bib id)."""
     first_field = recordSection[0]
     return first_field['#text']

 #this is the function that selects only RBML records
 def record_handling(_, record):
     """xmltodict item callback: write the record out when it has an RBML holding.

     Args:
         _: first argument supplied by xmltodict; unused.
         record: the dict for one record.

     Returns:
         True in every case, whether or not the record was written, because
         xmltodict stops parsing when the callback returns a false value.
     """
     holdings = search_tag('852', record.get("datafield", []))
     #records with no 852 at all are skipped rather than aborting the run
     if holdings and check_holdings(holdings):
         write_marcxml_record(record)
     return True

 def main(file):
     """Stream one XML file through xmltodict, passing each record to record_handling.

     file: the open file object to parse.
     """
     #item_depth=2 makes xmltodict emit each element at the 2nd level of the XML file, in this case collection/record
     xmltodict.parse(file, item_callback=record_handling, item_depth=2)


 #for the full run, iterate across all data files
 filenames = os.listdir("data")

 start = timer()
 print ("Running...")

 for filename in filenames:
     #opened in binary mode for xmltodict; the with-block closes each input once it has been parsed
     with open(os.path.join("data", filename), 'rb') as xmlfile:
         print(filename)
         print(datetime.datetime.now())
         main(xmlfile)

 end = timer()

 print(end - start) # Time in seconds, e.g. 5.38091952400282

 '''
 #for testing

 ##define file or files to run across
 #filename = 'data/extract-042_cleaned.xml'

 #open the file, IN BINARY MODE see martinblech/xmltodict/issues/77
 xmlfile = open(filename, 'rb')

 main(xmlfile)
 '''
	import xmltodict, json
	from timeit import default_timer as timer
	import os
	import sys
	import datetime

	#this script will iterate over the entire CLIO corpus and return all RBML records as individual marcxml records with bib id as filename

	#this function wraps the json in a dict with a record key, and casts it to an individual marcxml record
	def write_marcxml_record(record):
	    """Write one parsed record to its own marcxml file named after its bib id.

	    The record is wrapped in a dict under a 'record' key, serialised with
	    xmltodict.unparse, and written to '<bib id>.xml' inside the allRBML
	    output directory.

	    Args:
	        record: the dict for one record; its "controlfield" section is
	            searched for the 001 field that supplies the file name.

	    Returns:
	        True once the file has been written.

	    Raises:
	        FileExistsError: if a file for this bib id already exists (the file
	            is opened in exclusive-create mode).
	    """
	    xmlrecord = xmltodict.unparse({'record': record}, encoding='utf-8', pretty=True)
	    bib = parse001(search_tag('001', record["controlfield"]))
	    filename_marcxml = bib + '.xml'
	    os.chdir("C:\\Users\\kevin\\Desktop\\cul\\clioFull\\allRBML")
	    try:
	        #the with-block closes the file even when the write fails
	        with open(filename_marcxml, 'x', encoding='utf-8') as f:
	            f.write(xmlrecord)
	    finally:
	        #always step back out so later relative paths keep resolving
	        os.chdir("C:\\Users\\kevin\\Desktop\\cul\\clioFull")
	    return True

	#this function takes an area of the MARC record (controlfield or datafield) and returns all elements with the matching tag
	def search_tag(tagname, setOfFields):
	    """Return every field in setOfFields whose '@tag' equals tagname.

	    Args:
	        tagname: the tag to look for, e.g. '001' or '852'.
	        setOfFields: the controlfield or datafield section of a record,
	            either a list of field dicts or a single field dict.

	    Returns:
	        A list (possibly empty) of the matching field dicts.
	    """
	    #xmltodict hands back a bare dict when the section holds only one field
	    if isinstance(setOfFields, dict):
	        setOfFields = [setOfFields]
	    return [element for element in setOfFields if element['@tag'] == tagname]

	#this function checks for one of the RBML holdings
	def check_holdings(oneOrMore852s):
	    """Report whether the first 852 field carries one of the RBML holding codes.

	    Args:
	        oneOrMore852s: list of 852 field dicts; only the first entry is
	            inspected.
	            NOTE(review): later 852s are ignored -- confirm that this is
	            intended.

	    Returns:
	        True if the text of any subfield of that field contains one of the
	        desired holding codes (substring match), otherwise False.
	    """
	    #all rbml
	    desiredHoldings = ('bar,rare', 'clm', 'gax', 'glx,rare', 'oral', 'rbms', 'rbi', 'rbx', 'uacl')
	    #a record without any 852 has nothing to match
	    if not oneOrMore852s:
	        return False
	    subfields = oneOrMore852s[0].get('subfield', [])
	    #a lone subfield arrives as a single dict rather than a list of dicts
	    if isinstance(subfields, dict):
	        subfields = [subfields]
	    for subfield in subfields:
	        text = subfield.get('#text') if isinstance(subfield, dict) else subfield
	        #skip subfields without text instead of giving up on the whole field
	        if not isinstance(text, str):
	            continue
	        if any(code in text for code in desiredHoldings):
	            return True
	    return False

	#grab the bib from the 001
	def parse001(recordSection):
	    """Return the '#text' value of the first field in recordSection (the bib id)."""
	    return recordSection[0]['#text']

	#this is the function that selects only RBML records
	def record_handling(_, record):
	    """xmltodict item callback: write the record out when it has an RBML holding.

	    Args:
	        _: first argument supplied by xmltodict; unused.
	        record: the dict for one record.

	    Returns:
	        True in every case, whether or not the record was written, because
	        xmltodict stops parsing when the callback returns a false value.
	    """
	    holdings = search_tag('852', record.get("datafield", []))
	    #records with no 852 at all are skipped rather than aborting the run
	    if holdings and check_holdings(holdings):
	        write_marcxml_record(record)
	    return True

	def main(file):
	    """Stream one XML file through xmltodict, passing each record to record_handling.

	    file: the open file object to parse.
	    """
	    #xmltodict will stream each element at the 2nd level of the XML file, in this case collection/record
	    xmltodict.parse(file, item_depth=2, item_callback=record_handling)


	#for the full run, iterate across all data files
	filenames = os.listdir("data")

	start = timer()
	print ("Running...")

	for filename in filenames:
	    #opened in binary mode for xmltodict; the with-block closes each input once it has been parsed
	    with open(os.path.join("data", filename), 'rb') as xmlfile:
	        print(filename)
	        print(datetime.datetime.now())
	        main(xmlfile)

	end = timer()

	print(end - start) # Time in seconds, e.g. 5.38091952400282

	'''
	#for testing

	##define file or files to run across
	#filename = 'data/extract-042_cleaned.xml'

	#open the file, IN BINARY MODE see martinblech/xmltodict/issues/77
	xmlfile = open(filename, 'rb')

	main(xmlfile)
	'''