Skip to content

Instantly share code, notes, and snippets.

@MrJeremyHobbs
Created February 28, 2020 20:58
Show Gist options
  • Save MrJeremyHobbs/39bfb2f8774b30b37c850f6b091e28ae to your computer and use it in GitHub Desktop.
Save MrJeremyHobbs/39bfb2f8774b30b37c850f6b091e28ae to your computer and use it in GitHub Desktop.
PyMARC clean-up example
#!/usr/bin/python3
import contextlib
import logging
import os
import shutil

from pymarc import MARCReader
from tqdm import tqdm
# clean-up: start each run with a clear console, no old log and an empty
# output folder.

# Clear the console with whichever command the current platform provides.
os.system('cls' if os.name == 'nt' else 'clear')

# Delete the previous log directly instead of shelling out to the
# Windows-only `del` command.
try:
    os.remove('errors.log')
except FileNotFoundError:
    pass  # nothing to delete on a first run
except OSError as error:
    # Best effort: a locked or read-only log must not stop the run.
    print(f"Could not remove errors.log: {error}")

# os.remove() cannot delete a directory, so the old reports are removed
# with shutil.rmtree(); a stray plain file called "output" is still handled.
if os.path.isdir('output'):
    shutil.rmtree('output', ignore_errors=True)
elif os.path.exists('output'):
    os.remove('output')

# Re-create the folder; exist_ok covers a tree that could not be fully removed.
os.makedirs('output', exist_ok=True)
# logging
# Route every message of level DEBUG and above to the file errors.log.
logging.basicConfig(
    level=logging.DEBUG,
    filename='errors.log',
)
# fields
# MARC tags to look for in every record; each tag gets its own report file.
fields = (
    '009 011 039 087 211 212 214 241 265 301 '
    '302 303 304 305 308 315 350 359 440 503 '
    '512 517 523 527 537 543 570 582 652 680 '
    '681 683 705 715 751 755 840 851 870 871 '
    '872 873'
).split()
# output
# Give every tag a fresh report: truncate "<tag>s.txt" in the output folder
# and write the column header as its first line.
for field in fields:
    report_path = f'.//output//{field}s.txt'
    with open(report_path, mode='w', encoding="utf-8") as report:
        report.write("MMS Id\n")
# progressbar
# Running count of records read; printed once all files are processed.
counter = 0

# Total handed to tqdm so the bar can show how far along the run is.
# (previous value, kept for reference: 3715163)
total_records = 3_187_887
pbar = tqdm(total=total_records)
# open marc files and read contents


def _mms_id(record):
    """Return the record's 001 control number as text, or None if absent.

    When a record carries several 001 fields the last one is used.
    Raises UnicodeDecodeError if the raw value is not valid UTF-8.
    """
    control_fields = record.get_fields('001')
    if not control_fields:
        return None
    value = control_fields[-1].data
    # The reader is created with to_unicode=False, so the value may be bytes.
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    return value


# Only regular files are read (a sub-folder inside data/ would make open()
# fail). Sorting makes the processing order reproducible.
# To process just part of the data while testing, slice this list.
data_files = sorted(
    name for name in os.listdir('.//data')
    if os.path.isfile(os.path.join('data', name))
)

with contextlib.ExitStack() as stack:
    # Open each report once for the whole run instead of once per matching
    # record; the stack closes (and flushes) all of them on exit.
    outputs = {
        tag: stack.enter_context(
            open(f'.//output//{tag}s.txt', 'a', encoding="utf-8")
        )
        for tag in fields
    }

    for filename in data_files:
        pbar.set_description(f"Reading {filename}")
        with open(os.path.join('data', filename), 'rb') as fh:
            reader = MARCReader(fh, to_unicode=False, utf8_handling='ignore')
            try:
                for record in reader:
                    counter += 1
                    pbar.update(1)

                    # Newer pymarc versions yield None for a chunk they
                    # could not parse instead of raising.
                    if record is None:
                        logging.error(
                            "%s: unreadable record #%d: %s",
                            filename,
                            counter,
                            getattr(reader, 'current_exception', None),
                        )
                        continue

                    # One pass over the record collects every wanted tag.
                    # get_fields() is used because record[tag] raises
                    # KeyError for a missing tag on newer pymarc versions.
                    present = {hit.tag for hit in record.get_fields(*fields)}
                    if not present:
                        continue

                    try:
                        mms_id = _mms_id(record)
                    except UnicodeDecodeError:
                        logging.exception(
                            "%s: record #%d has an undecodable 001",
                            filename, counter,
                        )
                        continue
                    if mms_id is None:
                        # Never reuse the id of an earlier record.
                        logging.warning(
                            "%s: record #%d has no 001 field; skipped",
                            filename, counter,
                        )
                        continue

                    # Keep the order of `fields` when writing.
                    for obsolete_field in fields:
                        if obsolete_field in present:
                            outputs[obsolete_field].write(mms_id + "\n")
            except Exception:
                # A failure raised by the reader itself ends this file only;
                # the traceback goes to the log and the next file is read.
                logging.exception(
                    "%s: stopped reading after record #%d", filename, counter
                )
                pbar.update(1)

# finish
pbar.set_description("Finished")
pbar.close()
print(counter)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment