#!/usr/bin/env python
# -*- coding: utf-8 -*-
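
# Export item descriptions from a National Archives (NARA) catalog series to a
# tab-delimited .csv file named <series NAID>.csv, using the
# catalog.archives.gov API. Pass the series NAID with --series_NAID, and pass
# --file_units y if the items are filed inside file units rather than sitting
# directly under the series.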
import requests, json, csv, argparse

parser = argparse.ArgumentParser()
parser.add_argument('--series_NAID', dest='series_NAID', metavar='SERIES_NAID',
                    action='store')
parser.add_argument('--file_units', dest='file_units', metavar='FILE_UNITS',
                    action='store')
args = parser.parse_args()

series_NAID = args.series_NAID
file_units = args.file_units
logfile = str(series_NAID) + '.csv'
offset = 0
file_unit = ''

# with --file_units y, search for items nested inside file units that belong
# to the series; otherwise match items sitting directly under the series
if file_units == 'y':
    file_unit = '.parentFileUnit'
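
# build the catalog API search URL: description records sorted by NAID,
# 200 rows per page, filtered to the given parent series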
geturl = 'https://catalog.archives.gov/api/v1?type=description&sort=naId asc&rows=200&description.item' + file_unit + '.parentSeries=' + str(series_NAID)
parsed = json.loads(requests.get(geturl).text)
total = parsed['opaResponse']['results']['total']

if total == 0:
    print '\n ** No valid items found! Try changing file_units setting. **\n'
else:
    with open(logfile, 'w') as log:
        writelog = csv.writer(log, delimiter='\t', quoting=csv.QUOTE_ALL)
        writelog.writerow(('NAID', 'URL', 'Title', 'Local identifier', 'Parent file unit NAID', 'Parent file unit title', 'Parent series NAID', 'Parent series title', 'Container ID', 'Scope and content note', 'File URL'))
        log.close()
    # page through the search results 200 rows at a time
    while offset < total:
        parsed = json.loads(requests.get(geturl + '&offset=' + str(offset)).text)
        row = 0
        rowmax = 200
        if (total - offset) < 200:
            rowmax = total - offset
        while row < rowmax:
            try:
                naid = parsed['opaResponse']['results']['result'][row]['naId']
            except:
                print parsed['opaResponse']['results']['result'][row]
                break
            url = 'https://catalog.archives.gov/id/' + str(naid)
            title = parsed['opaResponse']['results']['result'][row]['description']['item']['title']
            local_identifier = parsed['opaResponse']['results']['result'][row]['description']['item']['localIdentifier']
            try:
                parent_file_unit = parsed['opaResponse']['results']['result'][row]['description']['item']['parentFileUnit']['title']
                parent_file_unit_naid = parsed['opaResponse']['results']['result'][row]['description']['item']['parentFileUnit']['naId']
                parent_series_naid = parsed['opaResponse']['results']['result'][row]['description']['item']['parentFileUnit']['parentSeries']['naId']
                parent_series = parsed['opaResponse']['results']['result'][row]['description']['item']['parentFileUnit']['parentSeries']['title']
            except KeyError:
                # item sits directly under the series, with no parent file unit
                parent_file_unit = ''
                parent_file_unit_naid = ''
                parent_series_naid = parsed['opaResponse']['results']['result'][row]['description']['item']['parentSeries']['naId']
                parent_series = parsed['opaResponse']['results']['result'][row]['description']['item']['parentSeries']['title']
            try:
                container_id = parsed['opaResponse']['results']['result'][row]['description']['item']['physicalOccurrenceArray']['itemPhysicalOccurrence']['mediaOccurrenceArray']['itemMediaOccurrence']['containerId']
            except KeyError:
                container_id = ''
            try:
                scope_and_content_note = parsed['opaResponse']['results']['result'][row]['description']['item']['scopeAndContentNote']
            except KeyError:
                scope_and_content_note = ''
            try:
                file_url = parsed['opaResponse']['results']['result'][row]['objects']['object']['file']['@url']
            except KeyError:
                try:
                    file_url = parsed['opaResponse']['results']['result'][row]['objects'][0]['object']['file']['@url']
                except:
                    file_url = ''
            print ' Writing "' + title.encode('utf-8') + '" (NAID ' + str(naid) + ') to ' + logfile + '.\n'
            with open(logfile, 'a') as log:
                writelog = csv.writer(log, delimiter='\t', quoting=csv.QUOTE_ALL)
                writelog.writerow((naid, url, title.encode('utf-8'), local_identifier, parent_file_unit_naid, parent_file_unit.encode('utf-8'), parent_series_naid, parent_series.encode('utf-8'), container_id, scope_and_content_note.encode('utf-8'), file_url))
                log.close()
            row = row + 1
        offset = offset + 200
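
# Example invocation (the script file name and series NAID below are
# placeholders; substitute your own):
#   python nara_series_export.py --series_NAID 123456 --file_units y
#
# With --file_units y the search goes through each item's parent file unit;
# without it the search matches items listed directly under the series.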