Skip to content

Instantly share code, notes, and snippets.

@jackflaps
Created September 26, 2019 21:32
Show Gist options
  • Save jackflaps/d8dd2ab03a6f01cbd17170017c36e6cd to your computer and use it in GitHub Desktop.
Save jackflaps/d8dd2ab03a6f01cbd17170017c36e6cd to your computer and use it in GitHub Desktop.
Draft script for updating ArchivesSpace records from CSV
#!/usr/bin/env python
# this only works on archival_object records for now
import argparse, csv, json, os, re, sys
from asnake.aspace import ASpace
# Command-line interface: an optional dry-run switch and the path of the CSV.
parser = argparse.ArgumentParser(
    description="Add or update ArchivesSpace metadata properties from CSV input",
)
parser.add_argument(
    "-d",
    "--dry-run",
    action="store_true",
    help="Show the updates to be made without making them",
)
parser.add_argument(
    "-f",
    "--file",
    help="The CSV file containing the metadata to add",
)
args = parser.parse_args()

# ArchivesSnake entry point used for every GET/POST below.
# NOTE(review): presumably picks up connection settings from ASnake's own
# configuration, since none are passed here -- confirm.
AS = ASpace()

# Column names accepted in the CSV header (after the leading "uri" column).
allowed_fields = [
    "title",
    "component_id",
    "subjects",
    "creators",
    "abstract",
    "scopecontent",
    "repository_link",
]
# Columns whose cell may hold several ";"-separated references.
repeatable_fields = ["subjects", "creators"]
# Columns that are written into the record's "notes" array.
note_types = ["abstract", "scopecontent"]
# Note types stored as note_singlepart; other note types use note_multipart.
singlepart_notes = ["abstract"]
class CSVValidationError(Exception):
    """Signal that the CSV header row failed validation.

    The explanatory payload is stored on ``value``; converting the exception
    to a string yields the ``repr`` of that payload.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return "{!r}".format(self.value)
class PatternValidationError(Exception):
    """Signal that a reference string did not match its required pattern.

    The explanatory payload is stored on ``value``; converting the exception
    to a string yields the ``repr`` of that payload.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return "{!r}".format(self.value)
def get_json(uri):
    """GET ``uri`` through the ASnake client and return the decoded JSON body.

    Args:
        uri: The URI to request (taken from the first column of the CSV).

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        Whatever ``raise_for_status()`` raises for a 4xx/5xx response, or
        ``RuntimeError`` for any other non-200 status.
    """
    response = AS.client.get(uri)
    if response.status_code != 200:
        # Error statuses raise here with the HTTP library's own exception.
        response.raise_for_status()
        # A non-200 status that is not an error used to fall through and
        # return None, which only blew up later when the caller indexed it.
        raise RuntimeError(
            "Unexpected HTTP status {} for {}".format(response.status_code, uri)
        )
    return json.loads(response.text)
def post_json(uri, data):
    """POST ``data`` as JSON to ``uri`` and print the outcome.

    On HTTP 200 the ``status`` and ``uri`` fields of the response are printed;
    otherwise an ``Error: ...`` line is printed. Nothing is returned.

    Args:
        uri: The URI to post to.
        data: A JSON-serialisable object sent as the request body.
    """
    response = AS.client.post(uri, json=data)
    try:
        message = json.loads(response.text)
    except ValueError:
        # The body is not JSON (json.JSONDecodeError subclasses ValueError);
        # report it instead of crashing before the status is even checked.
        print("Error: HTTP {} from {} with a non-JSON response body".format(
            response.status_code, uri))
        return

    if response.status_code == 200:
        print("{}: {}".format(message['status'], message['uri']))
    elif isinstance(message, dict):
        # Fall back to the whole payload when there is no 'error' key.
        print("Error: {}".format(message.get('error', message)))
    else:
        print("Error: {}".format(message))
def validate_csv(headers):
    """Validate the CSV header row and return the usable field names.

    The first column must be ``uri``; every remaining column must be one of
    ``allowed_fields`` once surrounding whitespace is removed.

    Args:
        headers: The header row as a list of strings. ``None`` or an empty
            list (an empty file or a blank first line) is rejected.

    Returns:
        A new list holding the whitespace-stripped field names that follow
        the ``uri`` column. The argument itself is left untouched.

    Raises:
        CSVValidationError: If the first column is not ``uri`` or any other
            column is not an allowed field.
    """
    if not headers or headers[0].strip() != 'uri':
        raise CSVValidationError("First column of CSV must be a URI")

    # Return the stripped names, not just check them: callers compare these
    # against the field lists, so " subjects" must come back as "subjects".
    fields = [header.strip() for header in headers[1:]]
    for field in fields:
        if field not in allowed_fields:
            raise CSVValidationError("Invalid header in CSV: {}".format(field))
    return fields
def validate_pattern(ref, pattern):
    """Return ``ref`` unchanged when it matches ``pattern`` from its start.

    Raises:
        PatternValidationError: If ``pattern`` does not match ``ref``; the
            message quotes the pattern between slashes.
    """
    matched = re.match(pattern, ref)
    if matched is None:
        raise PatternValidationError("Term must match the pattern: /{}/".format(pattern))
    return ref
def update(msg):
    """Prompt with ``msg`` until the user gives a yes/no answer.

    The reply is lower-cased before comparison. Returns ``True`` for
    ``yes``/``y``/``ye`` and ``False`` for ``no``/``n``; any other reply
    prints a reminder and asks again.
    """
    affirmative = {'yes', 'y', 'ye'}
    negative = {'no', 'n'}
    while True:
        answer = input(msg).lower()
        if answer in affirmative:
            return True
        if answer in negative:
            return False
        print("Please respond with yes or no")
def handle_repeatable_fields(obj, k, v):
    """Add the ";"-separated references in ``v`` to a repeatable field of ``obj``.

    ``subjects`` references are appended to ``obj['subjects']``; ``creators``
    references are appended to ``obj['linked_agents']`` with role
    ``"creator"``. References that are already present are reported and
    skipped, and references that fail pattern validation are reported and
    skipped. Progress is printed for every reference.

    Args:
        obj: The record dict to modify (modified in place).
        k: The field name, ``"subjects"`` or ``"creators"``.
        v: The raw CSV cell; references are separated by ``;``.

    Returns:
        The same ``obj`` that was passed in.

    Raises:
        ValueError: If ``k`` is not a supported repeatable field.
    """
    # Raw strings keep the pattern text byte-identical (it is echoed in the
    # validation error message) while avoiding the invalid "\/" escape.
    patterns = {
        "subjects": r"^\/subjects\/[0-9]+$",
        "creators": r"^\/agents\/(people|corporate_entities|families)\/[0-9]+$",
    }
    if k not in patterns:
        # Previously the pattern variable was simply left unbound here.
        raise ValueError("Unsupported repeatable field: {}".format(k))
    p = patterns[k]

    for ref in v.split(";"):
        ref = ref.strip()
        if not ref:
            # An empty cell or a trailing ";" is not a reference at all.
            continue
        try:
            ref = validate_pattern(ref, p)
        except PatternValidationError as e:
            print("Couldn't add {} to {}: ".format(ref, k), e.value)
            continue

        if k == "subjects":
            subjects = obj.setdefault(k, [])
            if any(n['ref'] == ref for n in subjects):
                print("{} already in {}".format(ref, k))
            else:
                print("add {} to {}".format(ref, k))
                subjects.append({'ref': ref})
        else:
            agents = obj.setdefault('linked_agents', [])
            # The same agent may be linked under another role; only an
            # existing "creator" link counts as a duplicate.
            if any(n['ref'] == ref and n.get('role') == "creator" for n in agents):
                print("{} already in linked_agents".format(ref))
            else:
                print("add {} to linked_agents".format(ref))
                agents.append({'ref': ref, 'role': "creator"})
    return obj
def _set_note(obj, note_type, text):
    """Overwrite every note of ``note_type`` on ``obj`` with ``text``, or add one."""
    singlepart = note_type in singlepart_notes
    matching = [n for n in obj['notes'] if n.get('type') == note_type]

    if not matching:
        if singlepart:
            obj['notes'].append({
                'jsonmodel_type': "note_singlepart",
                'type': note_type,
                'content': [text],
                'publish': True
            })
        else:
            obj['notes'].append({
                'jsonmodel_type': "note_multipart",
                'type': note_type,
                'subnotes': [{'jsonmodel_type': "note_text", 'content': text, 'publish': True}],
                'publish': True
            })
        return

    for note in matching:
        if singlepart:
            note['content'] = [text]
        elif note.get('subnotes'):
            # Only the first subnote is rewritten; any others are kept.
            note['subnotes'][0]['content'] = text
        else:
            # A multipart note with no subnotes has nothing to overwrite.
            note['subnotes'] = [{'jsonmodel_type': "note_text", 'content': text, 'publish': True}]


def _apply_field(obj, field, val):
    """Apply one CSV cell (``field`` = ``val``) to ``obj`` and return the record."""
    if field in repeatable_fields:
        return handle_repeatable_fields(obj, field, val)

    # NOTE(review): an empty cell overwrites the existing value with an empty
    # string -- confirm that this is intended.
    print("* {}: {} (update)".format(field, val))
    if field in note_types:
        _set_note(obj, field, val)
    elif field == "repository_link":
        # Existing external documents are left untouched; a link is only
        # added when the record has none.
        documents = obj.setdefault('external_documents', [])
        if not documents:
            documents.append({'title': "Special Collections @ DU", 'location': val})
    elif obj.get(field) != val:
        # .get(): the record may not carry the property at all yet.
        obj[field] = val
    return obj


# first, ask for the file
if args.file:
    file = args.file
else:
    file = input("Tell me where your file lives: ")

# only run the script if the file exists (and is a regular file)
if os.path.isfile(file):
    # newline='' is what the csv module requires for correct handling of
    # newlines embedded in quoted fields.
    with open(file, 'r', newline='') as f:
        reader = csv.reader(f)
        headers = next(reader, None)
        try:
            if not headers:
                raise CSVValidationError("CSV file has no header row")
            # Strip here as well so the field-type lookups below always see
            # clean names.
            headers = [h.strip() for h in validate_csv(headers)]
        except CSVValidationError as e:
            print("CSV validation error:", e.value)
        else:
            for row in reader:
                # csv.reader yields [] for blank lines, and spreadsheet
                # exports often end with rows of empty cells.
                if not row or not row[0].strip():
                    continue
                uri = row[0]
                obj = get_json(uri)
                # zip() ignores surplus cells beyond the declared headers.
                for field, val in zip(headers, row[1:]):
                    obj = _apply_field(obj, field, val)
                if args.dry_run:
                    print("dry run: {}".format(uri))
                else:
                    post_json(uri, obj)
else:
    print("File not found: {}".format(file))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment