Created
August 19, 2021 18:18
-
-
Save ebenenglish/0ea5ce0b7728dcbd9f6da1e939903ed1 to your computer and use it in GitHub Desktop.
A set of scripts to query the Preservica REST API and create an OAI-PMH response XML file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a static OAI XML file containing all records from the given sets,
# plus a companion text file listing the objects for which no MODS was found.
# Sets up the instance state (@oai_doc, @units_to_ignore,
# @preservica_rest_api_base, @credentials, @missing_mods) that the other
# methods in this file read.
# @param sets [Array] collection ids, e.g. ['c1ee8010-fb87-40d6-ac23-be547344c4f2', '2000d071-a976-4536-b173-8772a11c3588', ...]
# @param units_to_ignore [Array] ids of objects to ignore
# @param file_path [String] location where static XML file should be written
# @param credentials [Hash] Preservica REST API credentials: {un: 'foo', pw: 'bar'}
# @return [Integer] value returned by the final File.write call
def generate_static_oai_xml(sets, units_to_ignore, file_path, credentials)
  @oai_doc = initialize_oai_xml
  @units_to_ignore = units_to_ignore
  @preservica_rest_api_base = 'https://us.preservica.com/api/entity'
  @credentials = credentials
  @missing_mods = []

  sets.each do |col_id|
    col_resp_xml = preservica_rest_to_xml(col_id, 'structural-objects')
    process_resp_xml(col_resp_xml, col_id)
  end

  # Take the timestamp once so the harvest file and its missing-MODS report
  # always share the same suffix, even if the clock ticks between the writes.
  stamp = Time.now.to_i
  File.write("#{file_path}/cba_preservica_mods-harvest_#{stamp}.xml", @oai_doc.to_s)
  # The id list is written using Array#to_s, e.g. ["id-1", "id-2"].
  File.write("#{file_path}/cba_preservica_missing-mods_#{stamp}.txt", @missing_mods.to_s)
end
# Issue a GET request against the Preservica REST API and parse the response
# body as XML.
# NOTE(review): the HTTP status of the response is not inspected here; whatever
# body comes back is handed to Nokogiri as-is.
# @param id [String] identifier of the object
# @param obj_type [String] the type of object ('structural-object', etc)
# @param include_children [Boolean] use to add '/children' to the URL
# @return [Nokogiri::XML::Document]
def preservica_rest_to_xml(id, obj_type, include_children = true)
  children_suffix = include_children ? '/children' : ''
  request_url = "#{@preservica_rest_api_base}/#{obj_type}/#{id}#{children_suffix}"
  basic_auth = "#{@credentials[:un]}:#{@credentials[:pw]}"
  response = Typhoeus::Request.get(request_url, userpwd: basic_auth)
  Nokogiri::XML(response.body)
end
# Walk the child entries listed in a Preservica REST API response.
# 'information-objects' entries are handed to process_info_object;
# 'structural-objects' entries are fetched and processed recursively under the
# same collection id.
# NOTE(review): only the children present in xml_doc are visited; no paging
# logic is visible here -- confirm the API returns all children in one response.
# @param xml_doc [Nokogiri::XML::Document]
# @param col_id [String] collection id
# @raise [StandardError] when an entry has any other type
def process_resp_xml(xml_doc, col_id)
  get_preservica_children(xml_doc).each do |child|
    child_type = child[:type]
    if child_type == 'information-objects'
      process_info_object(child[:id], col_id)
    elsif child_type == 'structural-objects'
      nested_xml = preservica_rest_to_xml(child[:id], 'structural-objects')
      process_resp_xml(nested_xml, col_id)
    else
      raise StandardError,
            "Unprocessable entry type '#{child_type}' found in: \n#{xml_doc}"
    end
  end
end
# Fetch one information-object from the Preservica REST API and either add its
# MODS record to @oai_doc or note its id in @missing_mods when no MODS is found.
# Ids listed in @units_to_ignore are skipped without any API call.
# @param info_obj_id [String] information-object id
# @param col_id [String] collection id
def process_info_object(info_obj_id, col_id)
  return if @units_to_ignore.include?(info_obj_id)

  io_xml = preservica_rest_to_xml(info_obj_id, 'information-objects', false)
  mods = get_info_obj_mods(io_xml, info_obj_id)
  return @missing_mods << info_obj_id unless mods.present?

  insert_record(@oai_doc, info_obj_id, col_id, mods)
end
# Extract the child entries from a REST API response.
# Each Children/Child element becomes a hash with :id (the 'ref' attribute),
# :title (the 'title' attribute) and :type (see parse_for_child_type);
# :id and :title are nil when the attribute is absent.
# @param xml_doc [Nokogiri::XML::Document]
# @return [Array<Hash>]
def get_preservica_children(xml_doc)
  xml_doc.xpath('//xmlns:Children/xmlns:Child').map do |child_node|
    attrs = child_node.attributes
    {
      id: attrs['ref']&.value,
      title: attrs['title']&.value,
      type: parse_for_child_type(child_node)
    }
  end
end
# Determine the object type of a child entry from its text.
# The text is trimmed, the API base URL (@preservica_rest_api_base plus '/')
# is removed from the front, and the leading run of letters/hyphens that
# remains is returned, e.g. 'information-objects'.
# Trimming matters: with leading whitespace the pattern would otherwise match
# an empty string at position 0 and the caller would reject the entry.
# @param node [Nokogiri::XML::Element] any object responding to #text works
# @return [String] the type segment; '' when the remaining text does not begin
#   with a letter or hyphen
def parse_for_child_type(node)
  node.text.strip.delete_prefix("#{@preservica_rest_api_base}/")[/[a-zA-Z-]*/]
end
# Locate and fetch the MODS record for an information-object.
# Looks through the AdditionalInformation/Metadata/Fragment elements of the
# object's XML for the one whose 'schema' attribute is the MODS v3 namespace,
# then requests that metadata document and returns its mods:mods nodes.
# @param xml_doc [Nokogiri::XML::Document]
# @param info_obj_id [String]
# @return [Nokogiri::XML::NodeSet, nil] nil when there are no fragments or none
#   of them declares the MODS schema
def get_info_obj_mods(xml_doc, info_obj_id)
  mods_namespace = 'http://www.loc.gov/mods/v3'
  fragments = xml_doc.xpath('//xmlns:AdditionalInformation/xmlns:Metadata/xmlns:Fragment')
  return if fragments.blank?

  mods_fragment = fragments.find do |fragment|
    fragment.attributes['schema']&.value == mods_namespace
  end
  return unless mods_fragment

  # The trailing run of [0-9a-z-] characters in the fragment text is used as
  # the metadata id.
  metadata_id = mods_fragment.text[/[0-9a-z-]*\z/]
  mods_xml = preservica_rest_to_xml("#{info_obj_id}/metadata/#{metadata_id}",
                                    'information-objects', false)
  return unless mods_xml

  mods_xml.xpath('//mods:mods', 'mods' => mods_namespace)
end
# Create an empty OAI ListRecords XML document.
# The document holds the OAI-PMH envelope (responseDate, request) and an empty
# ListRecords element for records to be added to.
# @return [Nokogiri::XML::Document]
def initialize_oai_xml
  # OAI-PMH requires responseDate to be expressed in UTC, so convert before
  # formatting rather than emitting a local-offset timestamp.
  response_date = Time.now.utc.iso8601
  # The document is closed explicitly instead of relying on the parser to
  # recover from unterminated elements.
  oai_starter = <<~XML
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
      <responseDate>#{response_date}</responseDate>
      <request verb="ListRecords" metadataPrefix="mods">http://static.digitalcommonwealth.org/cba/oai.xml</request>
      <ListRecords></ListRecords>
    </OAI-PMH>
  XML
  Nokogiri::XML(oai_starter)
end
# Insert a <record> into <ListRecords> in the OAI XML document.
# The record gets a header (identifier, datestamp, setSpec) and a metadata
# element that receives the supplied MODS nodes.
# @param xml_doc [Nokogiri::XML::Document]
# @param file_id [String] object id
# @param col_id [String] collection id
# @param mods_record [Nokogiri::XML::NodeSet]
def insert_record(xml_doc, file_id, col_id, mods_record)
  records_list = xml_doc.at_xpath('//xmlns:ListRecords')
  new_record = Nokogiri::XML::DocumentFragment.parse('')
  Nokogiri::XML::Builder.with(new_record) do |xml|
    xml.record do
      xml.header do
        xml.identifier(file_id)
        # OAI-PMH datestamps are UTCdatetime values; convert to UTC so the
        # value ends in "Z" rather than a local offset.
        xml.datestamp(Time.now.utc.iso8601)
        xml.setSpec(col_id)
      end
      xml.metadata do
        # xml.parent is the <metadata> element currently being built.
        xml.parent << mods_record
      end
    end
  end
  records_list.add_child(new_record)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment