Created
February 28, 2020 20:58
-
-
Save MrJeremyHobbs/39bfb2f8774b30b37c850f6b091e28ae to your computer and use it in GitHub Desktop.
PyMARC clean-up example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
"""Report which MARC records still carry obsolete fields.

Every file in ``data/`` is read with pymarc.  For each tag listed in
``fields``, the 001 value of every record containing that tag is appended
to ``output/<tag>s.txt`` (one id per line, under an "MMS Id" header).
Problems encountered while reading are written to ``errors.log``.
"""
import logging
import os
import shutil
from contextlib import ExitStack, suppress

from pymarc import MARCReader
from tqdm import tqdm


def _mms_id(record):
    """Return the record's 001 value as text, or None if it has no 001.

    When several 001 fields exist the last one is used, matching what the
    previous loop-and-overwrite implementation ended up with.
    """
    control_fields = record.get_fields('001')
    if not control_fields:
        return None
    data = control_fields[-1].data
    # The reader is opened with to_unicode=False, so the data is expected
    # to be bytes; tolerate str so a reader change does not break this.
    return data.decode('utf-8') if isinstance(data, bytes) else data


# clean-up: start each run with a clear console, a fresh log and an empty
# output folder
os.system('cls' if os.name == 'nt' else 'clear')
with suppress(OSError):  # best effort: the log may simply not exist yet
    os.remove('errors.log')
# os.remove() cannot delete a directory; rmtree + makedirs guarantees that
# 'output' exists and holds no files from an earlier run.
shutil.rmtree('output', ignore_errors=True)
os.makedirs('output', exist_ok=True)

# logging
logging.basicConfig(filename='errors.log', level=logging.DEBUG)

# fields: the tags to look for in every record
fields = ['009', '011', '039', '087', '211', '212', '214', '241', '265', '301',
          '302', '303', '304', '305', '308', '315', '350', '359', '440', '503',
          '512', '517', '523', '527', '537', '543', '570', '582', '652', '680',
          '681', '683', '705', '715', '751', '755', '840', '851', '870', '871',
          '872', '873']

# progressbar: total_records only scales the bar, it does not limit reading
total_records = 3187887
pbar = tqdm(total=total_records)
counter = 0

with ExitStack() as stack:
    # output: open each report once (instead of once per hit) and write the
    # header; ExitStack closes every handle even if the run is interrupted.
    outputs = {}
    for field in fields:
        handle = stack.enter_context(
            open(f'.//output//{field}s.txt', 'w', encoding="utf-8"))
        handle.write("MMS Id" + "\n")
        outputs[field] = handle

    # open marc files and read contents
    # (slice the sorted list, e.g. [:5], to limit how many files are read)
    for filename in sorted(os.listdir('.//data')):
        path = os.path.join('data', filename)
        if not os.path.isfile(path):
            continue  # sub-directories cannot be opened as MARC files

        with open(path, 'rb') as fh:
            reader = MARCReader(fh, to_unicode=False, utf8_handling='ignore')
            pbar.set_description(f"Reading {filename}")

            try:
                for record in reader:
                    counter += 1
                    pbar.update(1)

                    # pymarc may yield None for a record it could not parse
                    if record is None:
                        logging.error(
                            "%s: unreadable record #%d: %s", filename, counter,
                            getattr(reader, 'current_exception', None))
                        continue

                    # get_fields() returns an empty list for an absent tag,
                    # whereas record[tag] may raise KeyError.
                    found = [tag for tag in fields if record.get_fields(tag)]
                    if not found:
                        continue

                    try:
                        mms_id = _mms_id(record)
                    except UnicodeDecodeError as error:
                        logging.error("%s: record #%d: %s",
                                      filename, counter, error)
                        continue
                    if mms_id is None:
                        # never fall back to the previous record's id
                        logging.error("%s: record #%d has no 001 field",
                                      filename, counter)
                        continue

                    for obsolete_field in found:
                        outputs[obsolete_field].write(mms_id + "\n")
            except Exception as error:
                # The reader itself failed; the rest of this file cannot be
                # read, so log it and move on to the next file.
                logging.error("%s: reading stopped: %s", filename, error)
                pbar.update(1)

# finish
pbar.set_description("Finished")
pbar.close()
print(counter)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment