Skip to content

Instantly share code, notes, and snippets.

@ngeraci
Last active March 4, 2021 01:02
Show Gist options
  • Select an option

  • Save ngeraci/8595b2bb81a4dbef5069be5b510a0730 to your computer and use it in GitHub Desktop.

Select an option

Save ngeraci/8595b2bb81a4dbef5069be5b510a0730 to your computer and use it in GitHub Desktop.
Python/ArchivesSpace adaptation of Kelly Bolding's Terms of Aggrandizement xquery script to report on language in archival finding aid "Biography or History" (bioghist) notes.
""" Adaptation of Kelly Bolding's Terms of Aggrandizement xquery script to report on aggrandizing
language in archival finding aid "Biography or History" (bioghist) notes.
The original script uses xquery on a directory of EAD XML files, and produces an XML report.
This version uses Python to query the ArchivesSpace REST API, and produces a JSON report.
"""
import re
import json
from asnake.aspace import ASpace
def main(repo_id=5, terms_path="terms_of_aggrandizement.txt"):
    """Report bioghist notes that contain terms of aggrandizement.

    Walks every resource in one repository, tests the content of each "bioghist"
    subnote against the compiled term list, and writes one summary per matching
    subnote to "<repo slug>_bioghist_report.json" in the current working directory.

    Args:
        repo_id: id of the repository to report on (defaults to 5, the value that
            was previously hard-coded).
        terms_path: path to the term file handed to term_list().
    """
    aspace = ASpace()
    repo = aspace.repositories(repo_id)
    aggrandizement_re = term_list(terms_path)

    results = []
    for resource in repo.resources():
        for note in resource.notes:
            if note.json().get("type") != "bioghist":
                continue
            for subnote in note.subnotes:
                content = subnote.json().get("content")
                if not content:
                    continue
                # finditer() + group(0) always yields the full matched text.
                # findall() returns plain strings when the pattern has a single group
                # and tuples only when the term file adds nested groups, so indexing
                # its items with [0] can return just the first character of a match.
                # Empty matches carry no term and are dropped.
                matching_terms = [
                    found.group(0)
                    for found in aggrandizement_re.finditer(content)
                    if found.group(0)
                ]
                if not matching_terms:
                    continue
                # brief collection info for the report
                results.append({
                    "collectioncode": unitid(resource),
                    "title": resource.title,
                    "origination": origination(resource),
                    "matching_terms": matching_terms,
                    "bioghist": content,
                })

    # write json report to file
    report_filename = f"{repo.slug}_bioghist_report.json"
    with open(report_filename, "w", encoding="utf-8") as outfile:
        json.dump(results, outfile, indent=4)
def term_list(filepath):
    """Compile the terms listed in a text file into a single regular expression.

    Each non-blank line of the file is treated as a regular expression. The lines
    are joined as alternatives inside one capturing group wrapped in word
    boundaries, and the result is compiled case-insensitively.

    Blank lines (including the one implied by a trailing newline) are skipped:
    joined as-is they would add an empty alternative, making the pattern match
    the empty string at every word boundary.

    Args:
        filepath: path to the term file, one regular expression per line.

    Returns:
        A compiled pattern matching any of the listed terms. If the file lists
        no terms at all, the returned pattern never matches.
    """
    with open(filepath, "r", encoding="utf-8") as infile:
        terms = [line for line in infile.read().split("\n") if line.strip()]
    if not terms:
        # An empty negative lookahead can never succeed, so nothing is reported.
        return re.compile(r"(?!)")
    return re.compile(r"\b(" + "|".join(terms) + r")\b", re.IGNORECASE)
def unitid(resource):
    """Build the dotted "unitid" string from a resource's id_0 .. id_3 fields.

    Mirrors the default ASpace EAD export
    (backend/app/exporters/serializers/ead.rb#L193): the four identifier parts
    are read in order, missing or empty parts are skipped, and the remainder is
    joined with ".".

    takes ArchivesSnake resource object
    returns unitid string
    """
    record = resource.json()
    parts = (record.get(f"id_{index}") for index in range(4))
    return ".".join(part for part in parts if part)
def origination(resource):
    """ approximates the EAD <origination> field for the JSON report

    Every linked agent of the resource is inspected; for each of its roles that
    is "creator" or "source", one dictionary is emitted with the keys:
    - name (string, the display name's sort_name)
    - source (string, the display name's source)
    - role (string, equal to "creator" or "source")
    - authority_id (string, may represent a URI; None when absent)

    takes an ArchivesSnake resource object
    returns a list of dicts, in linked-agent order then role order
    """
    # NOTE(review): roles are read from agent.linked_agent_roles; confirm this
    # reflects the agent's role on this resource, not every role the agent holds.
    wanted_roles = ("creator", "source")
    found = []
    for agent in resource.linked_agents:
        for role in agent.linked_agent_roles:
            if role not in wanted_roles:
                continue
            display = agent.display_name
            entry = {
                "name": display.sort_name,
                "source": display.source,
                "role": role,
                "authority_id": display.json().get("authority_id"),
            }
            found.append(entry)
    return found
# Run the report only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
@ngeraci
Copy link
Author

ngeraci commented Mar 3, 2021

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment