ngeraci · March 4, 2021 01:02 · ngeraci · Mar 3, 2021
diff --git a/terms_of_aggrandizement.py b/terms_of_aggrandizement.py
 """ Adaptation of Kelly Bolding's Terms of Aggrandizement xquery script to report on aggrandizing
    language in archival finding aid "Biography or History" (bioghist) notes.

    The original script uses xquery on a directory of EAD XML files, and produces an XML report.
    This version uses Python to query the ArchivesSpace REST API, and produces a JSON report.
 """

 import re
 import json
 from asnake.aspace import ASpace


 def main(repo_id=5, terms_path="terms_of_aggrandizement.txt"):
    """ report on aggrandizing language in the bioghist notes of one repository

        walks every resource in the repository, searches the text content of each
        "bioghist" note for the terms listed in terms_path, and writes one JSON
        entry per matching subnote to "<repo slug>_bioghist_report.json" in the
        current working directory.

        takes:
            - repo_id: ArchivesSpace repository id passed to aspace.repositories()
            - terms_path: path to the term list file read by term_list()
        returns nothing
    """

    # ASpace() is called with no arguments, so connection settings are whatever
    # ArchivesSnake resolves by default
    aspace = ASpace()
    # single repository
    repo = aspace.repositories(repo_id)

    # read list of terms to report on from a file & compile regular expression
    aggrandizement_re = term_list(terms_path)

    results = []
    # iterate over all collections in a repository
    for resource in repo.resources():
        for note in resource.notes:
            if note.json().get("type") != "bioghist":
                continue
            for subnote in note.subnotes:
                # subnotes without a "content" key (or with empty content) are skipped
                content = subnote.json().get("content")
                if not content:
                    continue

                # finditer + group(0) always yields the full matched text.
                # findall returns plain strings when the pattern has a single
                # group, so indexing each result with [0] would keep only the
                # first character. Empty matches are dropped so they can never
                # cause a collection to be reported.
                matching_terms = [
                    found.group(0)
                    for found in aggrandizement_re.finditer(content)
                    if found.group(0)
                ]
                if not matching_terms:
                    continue

                # append brief collection info to "results" list for report
                results.append({
                    "collectioncode": unitid(resource),
                    "title": resource.title,
                    "origination": origination(resource),
                    "matching_terms": matching_terms,
                    "bioghist": content
                })

    # write json report to file
    report_filename = f"{repo.slug}_bioghist_report.json"
    with open(report_filename, "w", encoding="utf-8") as outfile:
        json.dump(results, outfile, indent=4)


 def term_list(filepath):
    """ takes a path to a text file containing a list of regular expressions (one per line)
        returns one big, compiled, case-insensitive regex matching any of those expressions/terms

        surrounding whitespace and blank lines are ignored, so a trailing newline
        at the end of the file does not add an empty alternative (which would
        match at every word boundary). if the file holds no terms at all, the
        returned pattern never matches anything.
    """

    with open(filepath, "r", encoding="utf-8") as infile:
        stripped_lines = [line.strip() for line in infile.read().splitlines()]
    terms = [term for term in stripped_lines if term]

    if not terms:
        # "(?!)" is an empty negative lookahead: it fails at every position
        return re.compile(r"(?!)")

    alternation = "|".join(terms)
    return re.compile(fr"\b({alternation})\b", re.IGNORECASE)


 def unitid(resource):
    """ build the "unitid" string for a resource

        joins the resource's id_0, id_1, id_2 and id_3 values with ".", leaving
        out any that are missing or empty. per the original author this mirrors
        the ASpace default EAD export
        (backend/app/exporters/serializers/ead.rb#L193)

        takes ArchivesSnake resource object
        returns unitid string
    """

    id_parts = (resource.json().get(f"id_{position}") for position in range(4))
    return ".".join(part for part in id_parts if part)


 def origination(resource):
    """ approximates contents of EAD <origination> field for JSON output

        produces one dictionary per (linked agent, role) pair where the role is
        "creator" or "source"; an agent carrying both roles appears twice.
        each dictionary has the following keys:
            - name (string, the display name's sort_name)
            - source (string, the display name's source)
            - role (string, equal to "creator" or "source")
            - authority_id (string or None, may represent a URI)

        NOTE(review): roles are read from agent.linked_agent_roles - confirm this
        reflects the agent's role on this resource rather than on any record.

        takes an ArchivesSnake resource object
        returns a list of dicts
    """

    return [
        {
            "name": agent.display_name.sort_name,
            "source": agent.display_name.source,
            "role": role,
            "authority_id": agent.display_name.json().get("authority_id")
        }
        for agent in resource.linked_agents
        for role in agent.linked_agent_roles
        if role in ("creator", "source")
    ]


 if __name__ == "__main__":
    # build the report only when executed as a script, not when imported
    main()
	""" Adaptation of Kelly Bolding's Terms of Aggrandizement xquery script to report on aggrandizing
	language in archival finding aid "Biography or History" (bioghist) notes.

	The original script uses xquery on a directory of EAD XML files, and produces an XML report.
	This version uses Python to query the ArchivesSpace REST API, and produces a JSON report.
	"""

	import re
	import json
	from asnake.aspace import ASpace


	def main(repo_id=5, terms_path="terms_of_aggrandizement.txt"):
	    """ report on aggrandizing language in the bioghist notes of one repository

	        walks every resource in the repository, searches the text content of each
	        "bioghist" note for the terms listed in terms_path, and writes one JSON
	        entry per matching subnote to "<repo slug>_bioghist_report.json" in the
	        current working directory.

	        takes:
	            - repo_id: ArchivesSpace repository id passed to aspace.repositories()
	            - terms_path: path to the term list file read by term_list()
	        returns nothing
	    """

	    # ASpace() is called with no arguments, so connection settings are whatever
	    # ArchivesSnake resolves by default
	    aspace = ASpace()
	    # single repository
	    repo = aspace.repositories(repo_id)

	    # read list of terms to report on from a file & compile regular expression
	    aggrandizement_re = term_list(terms_path)

	    results = []
	    # iterate over all collections in a repository
	    for resource in repo.resources():
	        for note in resource.notes:
	            if note.json().get("type") != "bioghist":
	                continue
	            for subnote in note.subnotes:
	                # subnotes without a "content" key (or with empty content) are skipped
	                content = subnote.json().get("content")
	                if not content:
	                    continue

	                # finditer + group(0) always yields the full matched text.
	                # findall returns plain strings when the pattern has a single
	                # group, so indexing each result with [0] would keep only the
	                # first character. Empty matches are dropped so they can never
	                # cause a collection to be reported.
	                matching_terms = [
	                    found.group(0)
	                    for found in aggrandizement_re.finditer(content)
	                    if found.group(0)
	                ]
	                if not matching_terms:
	                    continue

	                # append brief collection info to "results" list for report
	                results.append({
	                    "collectioncode": unitid(resource),
	                    "title": resource.title,
	                    "origination": origination(resource),
	                    "matching_terms": matching_terms,
	                    "bioghist": content
	                })

	    # write json report to file
	    report_filename = f"{repo.slug}_bioghist_report.json"
	    with open(report_filename, "w", encoding="utf-8") as outfile:
	        json.dump(results, outfile, indent=4)


	def term_list(filepath):
	    """ takes a path to a text file containing a list of regular expressions (one per line)
	        returns one big, compiled, case-insensitive regex matching any of those expressions/terms

	        terms are joined with a plain "|" so the regex treats them as
	        alternatives (an escaped pipe would only match a literal "|" character).
	        surrounding whitespace and blank lines are ignored, so a trailing newline
	        does not add an empty alternative. if the file holds no terms at all,
	        the returned pattern never matches anything.
	    """

	    with open(filepath, "r", encoding="utf-8") as infile:
	        stripped_lines = [line.strip() for line in infile.read().splitlines()]
	    terms = [term for term in stripped_lines if term]

	    if not terms:
	        # "(?!)" is an empty negative lookahead: it fails at every position
	        return re.compile(r"(?!)")

	    alternation = "|".join(terms)
	    return re.compile(fr"\b({alternation})\b", re.IGNORECASE)


	def unitid(resource):
	    """ make "unitid" value the same way ASpace does by default in EAD export
	        (backend/app/exporters/serializers/ead.rb#L193)

	        joins the resource's id_0, id_1, id_2 and id_3 values with ".", leaving
	        out any that are missing or empty.

	        takes ArchivesSnake resource object
	        returns unitid string
	    """

	    id_parts = (resource.json().get(f"id_{position}") for position in range(4))
	    return ".".join(part for part in id_parts if part)


	def origination(resource):
	    """ approximates contents of EAD <origination> field for JSON output

	        produces one dictionary per (linked agent, role) pair where the role is
	        "creator" or "source"; an agent carrying both roles appears twice.
	        each dictionary has the following keys:
	            - name (string, the display name's sort_name)
	            - source (string, the display name's source)
	            - role (string, equal to "creator" or "source")
	            - authority_id (string or None, may represent a URI)

	        takes an ArchivesSnake resource object
	        returns a list of dicts
	    """

	    creators_and_sources = []

	    for agent in resource.linked_agents:
	        for role in agent.linked_agent_roles:
	            if role in ("creator", "source"):
	                creators_and_sources.append({
	                    "name": agent.display_name.sort_name,
	                    "source": agent.display_name.source,
	                    "role": role,
	                    "authority_id": agent.display_name.json().get("authority_id")
	                })

	    return creators_and_sources


	if __name__ == '__main__':
	    # build the report only when executed as a script, not when imported
	    main()
No results found