Skip to content

Instantly share code, notes, and snippets.

@DominicBM
Last active October 21, 2015 22:02
Show Gist options
  • Save DominicBM/302206ec51c31119ded4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Export item descriptions from a National Archives (NARA) catalog series
to a tab-separated CSV file (one row per item) using the
catalog.archives.gov API v1.

Usage:
    python script.py --series_NAID 12345 [--file_units y]
"""
import requests, json, csv, argparse

parser = argparse.ArgumentParser()
# NAID of the series whose items should be exported.
parser.add_argument('--series_NAID', dest='series_NAID', metavar='SERIES_NAID',
                    action='store')
# Pass 'y' when items are nested under file units instead of hanging
# directly off the series; this inserts '.parentFileUnit' into the query.
parser.add_argument('--file_units', dest='file_units', metavar='FILE_UNITS',
                    action='store')
args = parser.parse_args()
series_NAID = args.series_NAID
file_units = args.file_units
# Output TSV is named after the series NAID, e.g. '12345.csv'.
logfile = str(series_NAID) + '.csv'
offset = 0
file_unit = ''
# BUG FIX: the original tested "file_units is 'y'", which compares object
# identity, not string equality; it only works by accident of string
# interning. Use == for value comparison.
if file_units == 'y':
    file_unit = '.parentFileUnit'
geturl = 'https://catalog.archives.gov/api/v1?type=description&sort=naId asc&rows=200&description.item' + file_unit + '.parentSeries=' + str(series_NAID)
# Fetch the total hit count first, then page through the result set 200
# rows at a time, appending one TSV row per item.
parsed = json.loads(requests.get(geturl).text)
total = parsed['opaResponse']['results']['total']
if total == 0:
    print('\n ** No valid items found! Try changing file_units setting. **\n')
else:
    # Write the header row once; detail rows are appended per item below.
    with open(logfile, 'w') as log:
        writelog = csv.writer(log, delimiter='\t', quoting=csv.QUOTE_ALL)
        writelog.writerow(('NAID', 'URL', 'Title', 'Local identifier',
                           'Parent file unit NAID', 'Parent file unit title',
                           'Parent series NAID', 'Parent series title',
                           'Container ID', 'Scope and content note',
                           'File URL'))
    while offset < total:
        parsed = json.loads(requests.get(geturl + '&offset=' + str(offset)).text)
        results = parsed['opaResponse']['results']['result']
        # The final page may hold fewer than 200 results.
        rowmax = min(200, total - offset)
        for row in range(rowmax):
            result = results[row]
            try:
                naid = result['naId']
            except KeyError:
                # Unexpected record shape: dump it for inspection and skip
                # the rest of this page.
                print(result)
                break
            url = 'https://catalog.archives.gov/id/' + str(naid)
            item = result['description']['item']
            title = item['title']
            local_identifier = item['localIdentifier']
            try:
                # BUG FIX: the original read the 'title' key for the
                # *_naid fields too (copy-paste error); NAIDs live under
                # the 'naId' key.
                parent_file_unit = item['parentFileUnit']['title']
                parent_file_unit_naid = item['parentFileUnit']['naId']
                parent_series_naid = item['parentFileUnit']['parentSeries']['naId']
                parent_series = item['parentFileUnit']['parentSeries']['title']
            except KeyError:
                # Item hangs directly off the series (no intermediate
                # file unit).
                parent_file_unit = ''
                parent_file_unit_naid = ''
                parent_series_naid = item['parentSeries']['naId']
                parent_series = item['parentSeries']['title']
            try:
                container_id = item['physicalOccurrenceArray']['itemPhysicalOccurrence']['mediaOccurrenceArray']['itemMediaOccurrence']['containerId']
            except KeyError:
                container_id = ''
            try:
                scope_and_content_note = item['scopeAndContentNote']
            except KeyError:
                scope_and_content_note = ''
            try:
                file_url = result['objects']['object']['file']['@url']
            except KeyError:
                # 'objects' may be a list when an item carries several
                # digital files; fall back to the first entry.
                try:
                    file_url = result['objects'][0]['object']['file']['@url']
                except (KeyError, IndexError, TypeError):
                    file_url = ''
            print(' Writing "' + title + '" (NAID ' + naid + ') to ' + logfile + '.\n')
            # Re-open in append mode per row so progress survives a crash
            # mid-run (same behavior as the original script).
            with open(logfile, 'a') as log:
                writelog = csv.writer(log, delimiter='\t', quoting=csv.QUOTE_ALL)
                writelog.writerow((naid, url, title.encode('utf-8'),
                                   local_identifier, parent_file_unit_naid,
                                   parent_file_unit.encode('utf-8'),
                                   parent_series_naid,
                                   parent_series.encode('utf-8'),
                                   container_id,
                                   scope_and_content_note.encode('utf-8'),
                                   file_url))
        offset = offset + 200
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment