Draft script for updating ArchivesSpace records from CSV
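The script expects a CSV whose first column is uri (the ArchivesSpace URI of each archival object), followed by any of the allowed headers: title, component_id, subjects, creators, abstract, scopecontent, repository_link. The subjects and creators columns take semicolon-separated refs that must match /subjects/<id> or /agents/(people|corporate_entities|families)/<id> respectively. A hypothetical input (record URI and values invented for illustration):

uri,title,subjects
/repositories/2/archival_objects/1234,Correspondence 1901-1910,/subjects/12;/subjects/34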
#!/usr/bin/env python
# this only works on archival_object records for now
import argparse, csv, json, os, re, sys
from asnake.aspace import ASpace

parser = argparse.ArgumentParser(description='Add or update ArchivesSpace metadata properties from CSV input')
parser.add_argument('-d', '--dry-run', help='Show the updates to be made without making them', action='store_true')
parser.add_argument('-f', '--file', help='The CSV file containing the metadata to add')
args = parser.parse_args()

AS = ASpace()

allowed_fields = ['title', 'component_id', 'subjects', 'creators', 'abstract', 'scopecontent', 'repository_link']
repeatable_fields = ['subjects', 'creators']
note_types = ['abstract', 'scopecontent']
singlepart_notes = ['abstract']
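# Custom exceptions raised when the CSV headers or a subject/agent ref fail validation.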
class CSVValidationError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)

class PatternValidationError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
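# Thin wrappers around the ASnake client: GET a record as parsed JSON, POST an updated record back.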
def get_json(uri):
    r = AS.client.get(uri)
    if r.status_code == 200:
        return json.loads(r.text)
    else:
        r.raise_for_status()

def post_json(uri, data):
    r = AS.client.post(uri, json=data)
    message = json.loads(r.text)
    if r.status_code == 200:
        print("{}: {}".format(message['status'], message['uri']))
    else:
        print("Error: {}".format(message['error']))
def validate_csv(headers):
    headers = [h.strip() for h in headers]
    if headers[0] != 'uri':
        raise CSVValidationError("First column of CSV must be 'uri'")
    else:
        headers.pop(0)
    for header in headers:
        if header not in allowed_fields:
            raise CSVValidationError("Invalid header in CSV: {}".format(header))
    return headers

def validate_pattern(ref, pattern):
    if not re.compile(pattern).match(ref):
        raise PatternValidationError("Term must match the pattern: /{}/".format(pattern))
    else:
        return ref
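# Interactive yes/no confirmation prompt; defined but not called anywhere in this draft.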
def update(msg):
    yes = set(['yes', 'y', 'ye'])
    no = set(['no', 'n'])
    while True:
        choice = input(msg).lower()
        if choice in yes:
            return True
        elif choice in no:
            return False
        else:
            print("Please respond with yes or no")
def handle_repeatable_fields(obj, k, v):
    if k == "subjects":
        p = r"^/subjects/[0-9]+$"
    elif k == "creators":
        p = r"^/agents/(people|corporate_entities|families)/[0-9]+$"
    refs = v.split(";")
    for ref in refs:
        ref = ref.strip()
        try:
            ref = validate_pattern(ref, p)
        except PatternValidationError as e:
            print("Couldn't add {} to {}: ".format(ref, k), e.value)
        else:
            if k == "subjects":
                if [n for n in obj[k] if n['ref'] == ref]:
                    print("{} already in {}".format(ref, k))
                else:
                    print("add {} to {}".format(ref, k))
                    obj[k].append({'ref': ref})
            elif k == "creators":
                if [n for n in obj['linked_agents'] if n['ref'] == ref and n['role'] == "creator"]:
                    print("{} already in linked_agents".format(ref))
                else:
                    print("add {} to linked_agents".format(ref))
                    obj['linked_agents'].append({'ref': ref, 'role': "creator"})
    return obj
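# Main routine: read the CSV, validate its headers, then fetch each record, apply the row's
# values, and post the result (or just report it under --dry-run).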
# first, ask for the file
if args.file:
    file = args.file
else:
    file = input("Tell me where your file lives: ")

# only run the script if the file exists
if os.path.exists(file):
    with open(file, 'r') as f:
        reader = csv.reader(f)
        headers = next(reader, None)
        try:
            headers = validate_csv(headers)
        except CSVValidationError as e:
            print("CSV validation error:", e.value)
        else:
            for row in reader:
                uri = row.pop(0)
                obj = get_json(uri)
                for idx, val in enumerate(row):
                    if headers[idx] in repeatable_fields:
                        obj = handle_repeatable_fields(obj, headers[idx], val)
                    else:
                        print("* {}: {} (update)".format(headers[idx], val))
                        if headers[idx] in note_types:
                            notes = [n for n in obj['notes'] if n['type'] == headers[idx]]
                            if headers[idx] in singlepart_notes:
                                if notes:
                                    for note in obj['notes']:
                                        if note['type'] == headers[idx]:
                                            note['content'] = [val]
                                else:
                                    obj['notes'].append({
                                        'jsonmodel_type': "note_singlepart",
                                        'type': headers[idx],
                                        'content': [val],
                                        'publish': True
                                    })
                            else:
                                if notes:
                                    for note in obj['notes']:
                                        if note['type'] == headers[idx]:
                                            note['subnotes'][0]['content'] = val
                                else:
                                    obj['notes'].append({
                                        'jsonmodel_type': "note_multipart",
                                        'type': headers[idx],
                                        'subnotes': [{'jsonmodel_type': "note_text", 'content': val, 'publish': True}],
                                        'publish': True
                                    })
                        elif headers[idx] == "repository_link":
                            if not obj['external_documents']:
                                obj['external_documents'].append({'title': "Special Collections @ DU", 'location': val})
                        else:
                            if obj[headers[idx]] != val:
                                obj[headers[idx]] = val
                if args.dry_run:
                    print("dry run: {}".format(uri))
                else:
                    post_json(uri, obj)
else:
    print("File not found: {}".format(file))