Last active
March 4, 2021 01:02
-
-
Save ngeraci/8595b2bb81a4dbef5069be5b510a0730 to your computer and use it in GitHub Desktop.
Python/ArchivesSpace adaptation of Kelly Bolding's Terms of Aggrandizement xquery script to report on language in archival finding aid "Biography or History" (bioghist) notes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ Adaptation of Kelly Bolding's Terms of Aggrandizement xquery script to report on aggrandizing | |
| language in archival finding aid "Biography or History" (bioghist) notes. | |
| The original script uses xquery on a directory of EAD XML files, and produces an XML report. | |
| This version uses Python to query the ArchivesSpace REST API, and produces a JSON report. | |
| """ | |
| import re | |
| import json | |
| from asnake.aspace import ASpace | |
def main(repo_id=5, terms_path="terms_of_aggrandizement.txt"):
    """Write a JSON report of bioghist notes that contain listed terms.

    Walks every resource in one ArchivesSpace repository, searches the
    content of each "bioghist" subnote with the regex built by ``term_list``,
    and writes one summary entry per matching subnote to
    ``<repository slug>_bioghist_report.json`` in the current directory.

    Args:
        repo_id: id of the repository to report on (defaults to 5).
        terms_path: path to the text file of terms, one regex per line.
    """
    # ASpace() is called without arguments, so the client uses its own
    # default configuration.
    aspace = ASpace()

    # single repository
    repo = aspace.repositories(repo_id)

    # read list of terms to report on from a file & compile regular expression
    aggrandizement_re = term_list(terms_path)

    results = []

    # iterate over all collections in a repository
    for resource in repo.resources():
        for note in resource.notes:
            if note.json().get("type") != "bioghist":
                continue
            for subnote in note.subnotes:
                content = subnote.json().get("content")
                if not content:
                    continue

                # Use the full matched text (group 0) rather than indexing
                # findall() results: findall() returns plain strings when the
                # pattern has a single group and tuples when the terms add
                # groups of their own, so indexing is only right in one case.
                # Zero-length matches carry no term and are dropped.
                matching_terms = [
                    found.group(0)
                    for found in aggrandizement_re.finditer(content)
                    if found.group(0)
                ]
                if not matching_terms:
                    continue

                # append brief collection info to "results" list for report
                results.append({
                    "collectioncode": unitid(resource),
                    "title": resource.title,
                    "origination": origination(resource),
                    "matching_terms": matching_terms,
                    "bioghist": content,
                })

    # write json report to file
    report_filename = f"{repo.slug}_bioghist_report.json"
    with open(report_filename, "w", encoding="utf-8") as outfile:
        json.dump(results, outfile, indent=4)
def term_list(filepath):
    """Build one case-insensitive regex from a file of terms.

    The file holds one regular expression per line. Blank or
    whitespace-only lines are skipped, so a trailing newline does not add an
    empty alternative (which would match at every word boundary and flag
    every note).

    Args:
        filepath: path to the text file of terms, read as UTF-8.

    Returns:
        A compiled, case-insensitive pattern that matches any of the terms
        between word boundaries, with the matched term in group 1. If the
        file contains no terms, the returned pattern never matches.
    """
    with open(filepath, "r", encoding="utf-8") as infile:
        lines = infile.read().split("\n")

    # Keep each term exactly as written; only drop lines with no content.
    terms = [line for line in lines if line.strip()]

    if not terms:
        # An empty negative lookahead always fails, so nothing is reported.
        return re.compile(r"(?!)")

    return re.compile(r"\b(" + "|".join(terms) + r")\b", re.IGNORECASE)
def unitid(resource):
    """ build the "unitid" string the way ASpace does by default in EAD export
    (backend/app/exporters/serializers/ead.rb#L193)
    takes ArchivesSnake resource object
    returns the non-empty id_0 .. id_3 values joined with "."
    """
    record = resource.json()
    segments = []
    for position in range(4):
        segment = record.get(f"id_{position}")
        # missing or empty identifier parts are left out of the result
        if segment:
            segments.append(segment)
    return ".".join(segments)
def origination(resource):
    """ approximates contents of EAD <origination> field for JSON output
    for every role of a linked agent that is "creator" or "source",
    one dictionary is produced with the following keys:
    - name (string, the display name's sort_name)
    - source (string, the display name's source)
    - role (string, equal to "creator" or "source")
    - authority_id (string or None, may represent a URI)
    takes an ArchivesSnake resource object
    returns a list of dicts
    """
    wanted_roles = ("creator", "source")
    originators = []
    for linked in resource.linked_agents:
        for agent_role in linked.linked_agent_roles:
            # other roles are not part of <origination>
            if agent_role not in wanted_roles:
                continue
            name_record = linked.display_name
            entry = {
                "name": name_record.sort_name,
                "source": name_record.source,
                "role": agent_role,
                "authority_id": name_record.json().get("authority_id"),
            }
            originators.append(entry)
    return originators
# Run the report only when this file is executed as a script.
if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Requires an `.archivessnake.yml` file in the home directory, containing `baseurl`, `username`, and `password`.