kschlottmann · May 24, 2020 12:54
diff --git a/leader_search.py b/leader_search.py
 import xmltodict, json
 from timeit import default_timer as timer
 import os 
 import sys
 import datetime

 #this script iterates over the entire CLIO corpus and prints the leader, 245 $a, and 035 for every RBML record

 #this function prints a record and returns True
 def handle_record(_, record):
     """Print one parsed record and report success.

     Takes the same two positional arguments as record_handling; the
     first one is ignored.

     Returns:
         Always True.
     """
     keep_going = True
     print(record)
     return keep_going

 #this function takes an area of the MARC record (controlfield or datafield) and returns all elements with the matching tag
 def search_tag(tagname, setOfFields):
     """Return every field in setOfFields whose '@tag' equals tagname.

     Args:
         tagname: tag value to match, e.g. '852'.
         setOfFields: one area of the parsed record (record["controlfield"]
             or record["datafield"]). Normally a list of field dicts; a
             single field dict or None is also accepted.

     Returns:
         A list (possibly empty) of the matching field dicts, in their
         original order.
     """
     #nothing to search
     if setOfFields is None:
         return []
     #xmltodict gives a bare dict, not a one-item list, when an element occurs only once
     if isinstance(setOfFields, dict):
         setOfFields = [setOfFields]
     return [element for element in setOfFields if element['@tag'] == tagname]

 #this function checks for one of the RBML holdings
 def check_holdings(oneOrMore852s):
     """Report whether the first 852 field carries one of the wanted holdings codes.

     Args:
         oneOrMore852s: list of 852 field dicts, as produced by
             search_tag('852', ...). May be empty.

     Returns:
         True when the text of any subfield of the first 852 contains one
         of the codes in desiredHoldings (substring match); False otherwise.
     """
     #rbml archival only
     #the wider "all rbml" list was:
     #['bar,rare','clm', 'gax', 'glx,rare', 'oral', 'rbms', 'rbx', 'uacl', 'oral', 'rbi', 'rbx']
     desiredHoldings = ('clm', 'oral', 'rbms', 'uacl')

     #a record without any 852 has no holdings to match
     if not oneOrMore852s:
         return False

     #NOTE(review): only the first 852 is inspected, as before -- confirm later 852s can be ignored
     subfields = oneOrMore852s[0].get('subfield')
     if subfields is None:
         return False
     #an 852 with exactly one subfield arrives as a single item, not a list
     if not isinstance(subfields, list):
         subfields = [subfields]

     for subfield in subfields:
         text = subfield.get('#text') if isinstance(subfield, dict) else subfield
         #skip a subfield without text instead of abandoning the rest of the field
         if not isinstance(text, str):
             continue
         if any(code in text for code in desiredHoldings):
             return True
     return False

 def parse245(recordSection):
     """Return the $a text of the first 245 field.

     Args:
         recordSection: list of 245 field dicts, as produced by
             search_tag('245', ...). May be empty.

     Returns:
         True when there is no 245 (kept so existing output stays the
         same); otherwise the text of the subfield coded 'a'. When no
         subfield is coded 'a' the first subfield is used. A subfield
         without text yields ''.
     """
     if not recordSection:
         return True

     subfields = recordSection[0]['subfield']
     #a 245 with a single subfield arrives as one dict, not a list
     if not isinstance(subfields, list):
         subfields = [subfields]

     #$a is not guaranteed to be first (another subfield such as $6 can precede it),
     #so pick it by code and only fall back to position
     chosen = next((s for s in subfields if s.get('@code') == 'a'), subfields[0])
     return str(chosen.get('#text', ''))

 def parse035(recordSection):
     """Scan the 035 fields for a CULASPC value (used to set the Aspace flag).

     Args:
         recordSection: list of 035 field dicts.

     Returns:
         True when recordSection is empty; the string "Check 035" as soon
         as a field whose 'subfield' is a list is met; the '#text' of the
         first single-subfield 035 containing "CULASPC"; None when nothing
         matches.
     """
     if not len(recordSection):
         return True

     for field in recordSection:
         sub = field['subfield']
         #several subfields in one 035: flag it for a manual look
         if isinstance(sub, list):
             return "Check 035"
         if "CULASPC" in sub['#text']:
             return sub['#text']

     return None

 def parse001(recordSection):
     """Return the '#text' value of the first field in recordSection."""
     first_field = recordSection[0]
     return first_field['#text']

 #this is the main function
 def record_handling(_, record):
     """Print one pipe-delimited line for a record whose 852 passes check_holdings.

     Handed to xmltodict.parse as item_callback by main(); the first
     argument is not used. The last output column is the module-level
     name filename, set by the driver loop.

     Returns:
         Always True, whether or not a line was printed.
     """
     holdings = search_tag('852', record["datafield"])
     if check_holdings(holdings) == True:
         #output columns: 001 | leader | 245 | 035 result | filename
         columns = [
             parse001(search_tag('001', record["controlfield"])),
             str(record["leader"]),
             parse245(search_tag('245', record["datafield"])),
             parse035(search_tag('035', record["datafield"])),
             filename,
         ]
         print(*columns, sep=" | ")
     return True


 def main(file):
     """Stream-parse file with xmltodict, passing each depth-2 item to record_handling."""
     #depth 2 is collection/record in these files, so the callback sees one record at a time
     xmltodict.parse(file, item_callback=record_handling, item_depth=2)

 #for the full run:

 #process every file in data/ in a stable (sorted) order
 filenames = sorted(os.listdir("data"))

 start = timer()
 print ("Running...")

 #filename stays a module-level name because record_handling prints it
 for filename in filenames:
     #binary mode, see martinblech/xmltodict/issues/77
     with open(os.path.join("data", filename), 'rb') as xmlfile:
         print(filename)
         print(datetime.datetime.now())

         #redirect stdout to a utf-8 file, to avoid encoding issues when writing the output
         with open("results.txt", "a", encoding="utf-8") as f:
             original = sys.stdout
             sys.stdout = f
             try:
                 main(xmlfile)
             finally:
                 #always give the console back, even when parsing fails part-way
                 sys.stdout = original

 end = timer()

 print(end - start) # Time in seconds, e.g. 5.38091952400282

 '''

 #for testing

 ##define file or files to run across
 #filename = 'data/extract-042_cleaned.xml'
 filename = 'data/extract-021.xml'
 #filename = 'cleaningTesting/cleaned-083.xml'
 #filename = 'cleaned-004.xml'

 #open the file, IN BINARY MODE see martinblech/xmltodict/issues/77
 xmlfile = open(filename, 'rb')

 main(xmlfile)
 '''
diff --git a/tr-clean.sh b/tr-clean.sh
 #!/bin/bash
 # Write a copy of each cleaning/*.xml as <name>_cleaned.xml, keeping only
 # tab, LF, CR and the bytes in octal range 40-176 (space through ~).

 for f in cleaning/*.xml
 do
  # an unmatched glob stays literal; skip it instead of failing on a missing file
  [ -f "$f" ] || continue
  # do not re-clean the output of an earlier run
  case "$f" in
   *_cleaned.xml) continue ;;
  esac
  echo "Processing $f file..."
  # quoted so that names containing spaces survive
  tr -cd '\11\12\15\40-\176' < "$f" > "${f%.xml}_cleaned.xml"
 done
diff --git a/xmllint.sh b/xmllint.sh
 #!/bin/bash
 # Run xmllint (--noout --stream) over every regular file in data/.

 for f in data/*
 do
  # skip the literal pattern left by an empty directory, and any sub-directories
  [ -f "$f" ] || continue
  echo "Processing $f file..."
  # quoted so that names containing spaces survive
  xmllint --noout --stream "$f"
 done
	import xmltodict, json
	from timeit import default_timer as timer
	import os
	import sys
	import datetime

	#this script iterates over the entire CLIO corpus and prints the leader, 245 $a, and 035 for every RBML record

	#this function prints a record and returns True
	def handle_record(_, record):
		"""Print one parsed record and report success.

		Takes the same two positional arguments as record_handling; the
		first one is ignored.

		Returns:
			Always True.
		"""
		print(record)
		return True

	#this function takes an area of the MARC record (controlfield or datafield) and returns all elements with the matching tag
	def search_tag(tagname, setOfFields):
		"""Return every field in setOfFields whose '@tag' equals tagname.

		Args:
			tagname: tag value to match, e.g. '852'.
			setOfFields: one area of the parsed record (record["controlfield"]
				or record["datafield"]). Normally a list of field dicts; a
				single field dict or None is also accepted.

		Returns:
			A list (possibly empty) of the matching field dicts, in their
			original order.
		"""
		#nothing to search
		if setOfFields is None:
			return []
		#xmltodict gives a bare dict, not a one-item list, when an element occurs only once
		if isinstance(setOfFields, dict):
			setOfFields = [setOfFields]
		return [element for element in setOfFields if element['@tag'] == tagname]

	#this function checks for one of the RBML holdings
	def check_holdings(oneOrMore852s):
		"""Report whether the first 852 field carries one of the wanted holdings codes.

		Args:
			oneOrMore852s: list of 852 field dicts, as produced by
				search_tag('852', ...). May be empty.

		Returns:
			True when the text of any subfield of the first 852 contains one
			of the codes in desiredHoldings (substring match); False otherwise.
		"""
		#rbml archival only
		#the wider "all rbml" list was:
		#['bar,rare','clm', 'gax', 'glx,rare', 'oral', 'rbms', 'rbx', 'uacl', 'oral', 'rbi', 'rbx']
		desiredHoldings = ('clm', 'oral', 'rbms', 'uacl')

		#a record without any 852 has no holdings to match
		if not oneOrMore852s:
			return False

		#NOTE(review): only the first 852 is inspected -- confirm later 852s can be ignored
		subfields = oneOrMore852s[0].get('subfield')
		if subfields is None:
			return False
		#an 852 with exactly one subfield arrives as a single item, not a list
		if not isinstance(subfields, list):
			subfields = [subfields]

		for subfield in subfields:
			text = subfield.get('#text') if isinstance(subfield, dict) else subfield
			#skip a subfield without text instead of abandoning the rest of the field
			if not isinstance(text, str):
				continue
			if any(code in text for code in desiredHoldings):
				return True
		return False

	def parse245(recordSection):
		"""Return the $a text of the first 245 field.

		Args:
			recordSection: list of 245 field dicts, as produced by
				search_tag('245', ...). May be empty.

		Returns:
			True when there is no 245 (kept so existing output stays the
			same); otherwise the text of the subfield coded 'a'. When no
			subfield is coded 'a' the first subfield is used. A subfield
			without text yields ''.
		"""
		if not recordSection:
			return True

		subfields = recordSection[0]['subfield']
		#a 245 with a single subfield arrives as one dict, not a list
		if not isinstance(subfields, list):
			subfields = [subfields]

		#$a is not guaranteed to be first (another subfield such as $6 can precede it),
		#so pick it by code and only fall back to position
		chosen = next((s for s in subfields if s.get('@code') == 'a'), subfields[0])
		return str(chosen.get('#text', ''))

	def parse035(recordSection):
		"""Scan the 035 fields for a CULASPC value (used to set the Aspace flag).

		Args:
			recordSection: list of 035 field dicts.

		Returns:
			True when recordSection is empty; the string "Check 035" as soon
			as a field whose 'subfield' is a list is met; the '#text' of the
			first single-subfield 035 containing "CULASPC"; None when nothing
			matches.
		"""
		if len(recordSection) == 0:
			return True

		for d in recordSection:
			#several subfields in one 035: flag it for a manual look
			if isinstance(d['subfield'], list):
				return "Check 035"
			#test for CULASPC in the 035, to set Aspace flag
			if "CULASPC" in d['subfield']['#text']:
				return d['subfield']['#text']

		return None

	def parse001(recordSection):
		"""Return the '#text' value of the first field in recordSection."""
		return recordSection[0]['#text']

	#this is the main function
	def record_handling(_, record):
		"""Print one pipe-delimited line for a record whose 852 passes check_holdings.

		Handed to xmltodict.parse as item_callback by main(); the first
		argument is not used. The last output column is the module-level
		name filename, set by the driver loop.

		Returns:
			Always True, whether or not a line was printed.
		"""
		if check_holdings(search_tag('852', record["datafield"])):
			#format output: bib | leader | 245 | test for CULASPC | filename
			print(parse001(search_tag('001', record["controlfield"])), "|", str(record["leader"]), "|", parse245(search_tag('245', record["datafield"])), "|", parse035(search_tag('035', record["datafield"])), "|", filename)
		return True


	def main(file):
		"""Stream-parse file with xmltodict, passing each depth-2 item to record_handling."""
		#xmltodict will stream each element at the 2nd level of the XML file, in this case collection/record
		xmltodict.parse(file, item_depth=2, item_callback=record_handling)

	#for the full run:

	#process every file in data/ in a stable (sorted) order
	filenames = sorted(os.listdir("data"))

	start = timer()
	print ("Running...")

	#filename stays a module-level name because record_handling prints it
	for filename in filenames:
		#binary mode, see martinblech/xmltodict/issues/77
		with open(os.path.join("data", filename), 'rb') as xmlfile:
			print(filename)
			print(datetime.datetime.now())

			#redirect stdout to a utf-8 file, to avoid encoding issues when writing the output
			with open("results.txt", "a", encoding="utf-8") as f:
				original = sys.stdout
				sys.stdout = f
				try:
					main(xmlfile)
				finally:
					#always give the console back, even when parsing fails part-way
					sys.stdout = original

	end = timer()

	print(end - start) # Time in seconds, e.g. 5.38091952400282

	'''

	#for testing

	##define file or files to run across
	#filename = 'data/extract-042_cleaned.xml'
	filename = 'data/extract-021.xml'
	#filename = 'cleaningTesting/cleaned-083.xml'
	#filename = 'cleaned-004.xml'

	#open the file, IN BINARY MODE see martinblech/xmltodict/issues/77
	xmlfile = open(filename, 'rb')

	main(xmlfile)
	'''
	#!/bin/bash
	# Write a copy of each cleaning/*.xml as <name>_cleaned.xml, keeping only
	# tab, LF, CR and the bytes in octal range 40-176 (space through ~).

	for f in cleaning/*.xml
	do
		# an unmatched glob stays literal; skip it instead of failing on a missing file
		[ -f "$f" ] || continue
		# do not re-clean the output of an earlier run
		case "$f" in
			*_cleaned.xml) continue ;;
		esac
		echo "Processing $f file..."
		# quoted so that names containing spaces survive
		tr -cd '\11\12\15\40-\176' < "$f" > "${f%.xml}_cleaned.xml"
	done
	#!/bin/bash
	# Run xmllint (--noout --stream) over every regular file in data/.

	for f in data/*
	do
		# skip the literal pattern left by an empty directory, and any sub-directories
		[ -f "$f" ] || continue
		echo "Processing $f file..."
		# quoted so that names containing spaces survive
		xmllint --noout --stream "$f"
	done