Draft script for updating ArchivesSpace records from CSV
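The script expects a CSV whose first column is uri (the ArchivesSpace URI of each archival object), followed by any of the allowed headers: title, component_id, subjects, creators, abstract, scopecontent, repository_link. The subjects and creators columns take semicolon-separated refs that must match /subjects/<id> or /agents/(people|corporate_entities|families)/<id> respectively. A hypothetical input (record URI and values invented for illustration):

uri,title,subjects
/repositories/2/archival_objects/1234,Correspondence 1901-1910,/subjects/12;/subjects/34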
#!/usr/bin/env python
# this only works on archival_object records for now
import argparse, csv, json, os, re, sys
from asnake.aspace import ASpace

parser = argparse.ArgumentParser(description='Add or update ArchivesSpace metadata properties from CSV input')
parser.add_argument('-d', '--dry-run', help='Show the updates to be made without making them', action='store_true')
parser.add_argument('-f', '--file', help='The CSV file containing the metadata to add')
args = parser.parse_args()

AS = ASpace()

allowed_fields = ['title', 'component_id', 'subjects', 'creators', 'abstract', 'scopecontent', 'repository_link']
repeatable_fields = ['subjects', 'creators']
note_types = ['abstract', 'scopecontent']
singlepart_notes = ['abstract']
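# Custom exceptions raised when the CSV headers or a subject/agent ref fail validation.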
class CSVValidationError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)

class PatternValidationError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
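# Thin wrappers around the ASnake client: GET a record as parsed JSON, POST an updated record back.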
def get_json(uri):
    r = AS.client.get(uri)
    if r.status_code == 200:
        return json.loads(r.text)
    else:
        r.raise_for_status()

def post_json(uri, data):
    r = AS.client.post(uri, json=data)
    message = json.loads(r.text)
    if r.status_code == 200:
        print("{}: {}".format(message['status'], message['uri']))
    else:
        print("Error: {}".format(message['error']))
def validate_csv(headers):
    headers = [h.strip() for h in headers]
    if headers[0] != 'uri':
        raise CSVValidationError("First column of CSV must be 'uri'")
    else:
        headers.pop(0)
    for header in headers:
        if header not in allowed_fields:
            raise CSVValidationError("Invalid header in CSV: {}".format(header))
    return headers

def validate_pattern(ref, pattern):
    if not re.compile(pattern).match(ref):
        raise PatternValidationError("Term must match the pattern: /{}/".format(pattern))
    else:
        return ref
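# Interactive yes/no confirmation prompt; defined but not called anywhere in this draft.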
def update(msg):
    yes = set(['yes', 'y', 'ye'])
    no = set(['no', 'n'])
    while True:
        choice = input(msg).lower()
        if choice in yes:
            return True
        elif choice in no:
            return False
        else:
            print("Please respond with yes or no")
def handle_repeatable_fields(obj, k, v):
    if k == "subjects":
        p = r"^/subjects/[0-9]+$"
    elif k == "creators":
        p = r"^/agents/(people|corporate_entities|families)/[0-9]+$"
    refs = v.split(";")
    for ref in refs:
        ref = ref.strip()
        try:
            ref = validate_pattern(ref, p)
        except PatternValidationError as e:
            print("Couldn't add {} to {}: ".format(ref, k), e.value)
        else:
            if k == "subjects":
                if [n for n in obj[k] if n['ref'] == ref]:
                    print("{} already in {}".format(ref, k))
                else:
                    print("add {} to {}".format(ref, k))
                    obj[k].append({'ref': ref})
            elif k == "creators":
                if [n for n in obj['linked_agents'] if n['ref'] == ref and n['role'] == "creator"]:
                    print("{} already in linked_agents".format(ref))
                else:
                    print("add {} to linked_agents".format(ref))
                    obj['linked_agents'].append({'ref': ref, 'role': "creator"})
    return obj
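# Main routine: read the CSV, validate its headers, then fetch each record, apply the row's
# values, and post the result (or just report it under --dry-run).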
# first, ask for the file
if args.file:
    file = args.file
else:
    file = input("Tell me where your file lives: ")

# only run the script if the file exists
if os.path.exists(file):
    with open(file, 'r') as f:
        reader = csv.reader(f)
        headers = next(reader, None)
        try:
            headers = validate_csv(headers)
        except CSVValidationError as e:
            print("CSV validation error:", e.value)
        else:
            for row in reader:
                uri = row.pop(0)
                obj = get_json(uri)
                for idx, val in enumerate(row):
                    if headers[idx] in repeatable_fields:
                        obj = handle_repeatable_fields(obj, headers[idx], val)
                    else:
                        print("* {}: {} (update)".format(headers[idx], val))
                        if headers[idx] in note_types:
                            notes = [n for n in obj['notes'] if n['type'] == headers[idx]]
                            if headers[idx] in singlepart_notes:
                                if notes:
                                    for note in obj['notes']:
                                        if note['type'] == headers[idx]:
                                            note['content'] = [val]
                                else:
                                    obj['notes'].append({
                                        'jsonmodel_type': "note_singlepart",
                                        'type': headers[idx],
                                        'content': [val],
                                        'publish': True
                                    })
                            else:
                                if notes:
                                    for note in obj['notes']:
                                        if note['type'] == headers[idx]:
                                            note['subnotes'][0]['content'] = val
                                else:
                                    obj['notes'].append({
                                        'jsonmodel_type': "note_multipart",
                                        'type': headers[idx],
                                        'subnotes': [{'jsonmodel_type': "note_text", 'content': val, 'publish': True}],
                                        'publish': True
                                    })
                        elif headers[idx] == "repository_link":
                            if not obj['external_documents']:
                                obj['external_documents'].append({'title': "Special Collections @ DU", 'location': val})
                        else:
                            if obj[headers[idx]] != val:
                                obj[headers[idx]] = val
                if args.dry_run:
                    print("dry run: {}".format(uri))
                else:
                    post_json(uri, obj)
else:
    print("File not found: {}".format(file))