Last active
June 19, 2018 18:07
-
-
Save clingerman/75b4255bbe6dcc6431f9dd70d37786dd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reformat partner (Ancestry) scan metadata into DAS XML.
# Construct python command like: python reformat_partner_xml.py --series 2668739 --pub A3604
import argparse
import csv
import datetime
import os
import re
import time
import xml
import xml.etree.ElementTree as ET

# Command-line options. Defaults live in add_argument() instead of being
# patched in afterwards with "if args.x is not None" checks; the CLI is
# unchanged (--series / --objects / --pub, all optional).
parser = argparse.ArgumentParser()
parser.add_argument('--series', metavar='SERIES', default=2668739,
                    help='naId of the parent series (default: 2668739)')
parser.add_argument('--objects', metavar='OBJECTS',
                    help='digital-objects CSV name (currently unused; --pub drives file names)')
parser.add_argument('--pub', metavar='PUB', default='A3604',
                    help='microfilm publication number (default: A3604)')
args = parser.parse_args()

series = args.series        # parent series naId (int default, str when given on the CLI — both only formatted)
objectfile = args.objects   # NOTE(review): parsed but never referenced below — confirm before removing the flag
pub = args.pub              # publication number
sequence_order = 1          # running <sequenceOrder> counter, bumped once per new file unit
file = str(pub)             # base name for metadata/<file>.xml, objects/<file>.csv, <file>_output.xml
                            # (shadows the 'file' builtin; kept because the rest of the script uses it)
## Load the partner XML (element-style, per http://www.ibm.com/developerworks/library/x-eleatt/ —
## reformatted copies are saved with "_(reformatted)" appended so the original is untouched).
tree = ET.parse('metadata/' + file + '.xml')
root = tree.getroot()

Publication_Number = 'A3604'
Publication_Title = 'Passenger Lists of Vessels Arriving at Boston, Massachusetts, 1944-1954'

# Progress stamp, e.g. "12:34:56.789: A3604 Passenger Lists ..."
timestamp = str(datetime.datetime.now().time())
print(timestamp + ': ' + Publication_Number, Publication_Title)
## Build the DAS XML: one <fileUnit> per unique generated title, each holding
## one <digitalObject> per partner-XML row that shares that title.

# Month abbreviation -> full month name (used in generated titles).
# '' maps to '' so an empty ArrivalMonth still produces a title.
MONTH_FULL = {
    '': '',
    'Jan': 'January', 'Feb': 'February', 'Mar': 'March',
    'Apr': 'April', 'May': 'May', 'Jun': 'June',
    'Jul': 'July', 'Aug': 'August', 'Sep': 'September',
    'Oct': 'October', 'Nov': 'November', 'Dec': 'December',
}

# Month abbreviation -> month number string (used in <coverageDates>).
MONTH_NUM = {
    '': '',
    'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4', 'May': '5', 'Jun': '6',
    'Jul': '7', 'Aug': '8', 'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}


def element_text(parent, tag, default=''):
    """Return the text of child <tag> under *parent*, or *default* if absent.

    Mirrors the original "row.find(tag) is not None" pattern exactly: an
    element that is present but empty yields None, as before.
    """
    child = parent.find(tag)
    return child.text if child is not None else default


# Everything above the digital objects; written once per file unit.
FILEUNIT_OPEN = """
<fileUnit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<sequenceOrder>{sequence_order}</sequenceOrder>
<title>{title}</title>
<parentSeries><naId>{series}</naId></parentSeries>
<coverageDates>
<coverageStartDate><year>{year}</year><month>{month}</month></coverageStartDate>
<coverageEndDate><year>{year}</year><month>{month}</month></coverageEndDate>
</coverageDates>
<generalRecordsTypeArray><generalRecordsType><termName>Textual Records</termName></generalRecordsType></generalRecordsTypeArray>
<onlineResourceArray><onlineResource><termName>{stable_url}</termName><description>Ancestry</description><note>This file was scanned as part of a collaboration effort between Ancestry and the National Archives.</note></onlineResource></onlineResourceArray>
<variantControlNumberArray><variantControlNumber><number>Ancestry 2017</number><type><termName>Search Identifier</termName></type></variantControlNumber></variantControlNumberArray>
<microformPublicationArray><microformPublication><note>The start of this file can be found on Roll {roll_number}.</note><publication><termName>A3604 Passenger Lists of Vessels Arriving at Boston, Massachusetts, 1944-1954</termName></publication></microformPublication></microformPublicationArray>
<dataControlGroup><groupCd>RDTP1</groupCd><groupId>ou=RDTP1,ou=groups</groupId></dataControlGroup>
<accessRestriction><status><termName>Unrestricted</termName></status></accessRestriction>
<useRestriction><status><termName>Unrestricted</termName></status></useRestriction>
<physicalOccurrenceArray><fileUnitPhysicalOccurrence>
<copyStatus><termName>Preservation-Reproduction-Reference</termName> </copyStatus><referenceUnitArray><referenceUnit><termName>National Archives at Washington, DC - Textual Reference</termName> </referenceUnit></referenceUnitArray>
<locationArray><location><facility><termName>National Archives Building - Archives I (Washington, DC)</termName> </facility></location></locationArray>
<mediaOccurrenceArray><mediaOccurrence><containerId></containerId><specificMediaType><termName>Microfilm</termName></specificMediaType>
<generalMediaTypeArray><generalMediaType><termName>Microform</termName></generalMediaType></generalMediaTypeArray>
</mediaOccurrence></mediaOccurrenceArray>
</fileUnitPhysicalOccurrence></physicalOccurrenceArray>
<digitalObjectArray>
"""

# One <digitalObject>; repeated for every row sharing the current title.
DIGITAL_OBJECT = """<digitalObject><objectType><termName>Image (JPG)</termName></objectType><labelFlag>{label_flag}</labelFlag>
<objectDescription>Ship Name: {ship_name}. Image provided by Ancestry.</objectDescription>
<accessFilename>https://s3.amazonaws.com/NARAprodstorage/{file_path}</accessFilename><accessFileSize>{file_size}</accessFileSize>
<thumbnailFilename>http://media.nara.gov/dc-metro/jpg_t.jpg</thumbnailFilename><thumbnailFileSize>1234</thumbnailFileSize></digitalObject>
"""

# End tags completing one file unit.
FILEUNIT_CLOSE = """</digitalObjectArray>
</fileUnit>
"""

if __name__ == '__main__':
    for row in root.findall('row'):
        # Title components default to '[BLANK]' (fixes the original 'BLANK]'
        # typo on ArrivalMonth); identifiers default to ''.
        file_name = element_text(row, 'ImageFileName')
        stable_url = element_text(row, 'StableURL')
        arrival_year = element_text(row, 'ArrivalYear', '[BLANK]')
        arrival_month = element_text(row, 'ArrivalMonth', '[BLANK]')
        roll_number = element_text(row, 'NARARollNumber', '[BLANK]')
        ship_name = element_text(row, 'ShipName', '[BLANK]')
        # Parsed for completeness; not referenced in the A3604 output below.
        airline = element_text(row, 'Airline', '[BLANK]')
        flight_no = element_text(row, 'FlightNo', '[BLANK]')
        port_of_arrival = element_text(row, 'PortofArrival', '[BLANK]')
        arrival_day = element_text(row, 'ArrivalDay', '[BLANK]')

        # Defaults in case no images-CSV row matches (file_path and label_flag
        # were previously left unbound in that case -> NameError).
        file_size = ''
        file_path = ''
        label_flag = ''

        ## Partner data names .jp2 files; the images CSV lists the .jpg copies.
        ## Look the renamed file up to get file size, path, and label flag.
        new_file_name = file_name[:-4] + '.jpg'
        with open('objects/' + file + '.csv', 'r') as images:
            # 'image_row', not 'row': the original inner loop clobbered the
            # outer partner-XML row variable.
            for image_row in csv.reader(images, delimiter=','):
                try:
                    if new_file_name == image_row[3]:
                        file_size = str(image_row[1])
                        file_path = image_row[0]
                        # NOTE(review): same column as the matched file name —
                        # confirm the label flag really lives in column 3.
                        label_flag = image_row[3]
                except IndexError:
                    pass  # short row: skip, as before

        ## Month conversions. Unrecognized abbreviations now fall back to ''
        ## instead of leaving the variables unbound from a prior iteration.
        arrival_month_full = MONTH_FULL.get(arrival_month, '')
        arrival_month_num = MONTH_NUM.get(arrival_month, '')

        ## Generate the title based on the established formula.
        title = ('Passenger lists of vessels arriving at Boston, Massachusetts in '
                 + arrival_month_full + ' ' + arrival_year)

        DASxml_top = FILEUNIT_OPEN.format(
            sequence_order=sequence_order,
            title=title,
            series=series,
            year=arrival_year,
            month=arrival_month_num,
            stable_url=stable_url,
            roll_number=roll_number,
        )
        digital_objects = DIGITAL_OBJECT.format(
            label_flag=label_flag,
            ship_name=ship_name,
            file_path=file_path,
            file_size=file_size,
        )

        ## uniquetest.csv logs every title seen so far. A repeated title means
        ## this row belongs to the current file unit (append its digital object
        ## only); a new title starts a new file unit.
        seen = False
        try:
            with open('uniquetest.csv', 'r', encoding='utf-8') as title_log:
                for logged in csv.reader(title_log, delimiter='\t',
                                         quoting=csv.QUOTE_ALL):
                    if title == logged[0]:
                        seen = True
                        with open(file + '_output.xml', 'a',
                                  encoding='utf-8') as out:
                            out.write(digital_objects)
        except IOError:
            pass  # first run: no title log yet, so every title counts as new

        if not seen:
            with open('uniquetest.csv', 'a', encoding='utf-8') as title_log:
                csv.writer(title_log, delimiter='\t',
                           quoting=csv.QUOTE_ALL).writerow((title,))
            # If output already exists, close the previous file unit first.
            # (The original probed existence by opening 'r' and leaked that
            # handle; os.path.exists avoids the leak.)
            starting_fresh = not os.path.exists(file + '_output.xml')
            with open(file + '_output.xml', 'a', encoding='utf-8') as out:
                if starting_fresh:
                    out.write(DASxml_top + digital_objects)
                else:
                    out.write(FILEUNIT_CLOSE + DASxml_top + digital_objects)
            sequence_order = sequence_order + 1

    # Close the final file unit and empty the title log for the next run.
    with open(file + '_output.xml', 'a', encoding='utf-8') as out:
        out.write(FILEUNIT_CLOSE)
    with open('uniquetest.csv', 'w') as title_log:
        title_log.write('')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment