Skip to content

Instantly share code, notes, and snippets.

@ross-spencer
Last active July 22, 2020 15:51
Show Gist options
  • Save ross-spencer/8e8ba4a67920c31e39e57ac316b2fa0d to your computer and use it in GitHub Desktop.
Save ross-spencer/8e8ba4a67920c31e39e57ac316b2fa0d to your computer and use it in GitHub Desktop.
METS/PREMIS as a table... (output plus code)
+----+-----------+----------------------------------------------------------------------------------------------------+
| | file_id | path |
|----+-----------+----------------------------------------------------------------------------------------------------|
| 0 | 81dad539 | objects/799px-Euroleague-LE_Roma_vs_Toulouse_IC-27.bmp |
| 1 | 8741d271 | objects/BBhelmet.ai |
| 2 | 98542eaa | objects/G31DS.TIF |
| 3 | c96e5ed0 | objects/Nemastylis_geminiflora_Flower.PNG |
| 4 | 9b48de44 | objects/Vector.NET-Free-Vector-Art-Pack-28-Freedom-Flight.eps |
| 5 | 74cf3a87 | objects/WFPC01.GIF |
| 6 | b1a98d54 | objects/lion.svg |
| 7 | 272de308 | objects/oakland03.jp2 |
| 8 | 81fcc6a2 | objects/pictures/Landing_zone.jpg |
| 9 | b9d9689c | objects/pictures/MARBLES.TGA |
| 10 | 6c358b94 | objects/submissionDocumentation/transfer-Images2-a7291be4-dd33-4642-ba87-c6e2867e61db/METS.xml |
| 11 | e612628e | objects/799px-Euroleague-LE_Roma_vs_Toulouse_IC-27-e612628e-81e0-444b-8024-6c3c9ca6543f.tif |
| 12 | 1267e234 | objects/G31DS-1267e234-3ef3-4565-a106-98f066af6d94.tif |
| 13 | 154480d3 | objects/Vector.NET-Free-Vector-Art-Pack-28-Freedom-Flight-154480d3-e096-4826-990e-5d31ccd5675a.svg |
| 14 | c55e8d8b | objects/lion-c55e8d8b-c193-4e05-9601-4111416c40f1.svg |
| 15 | 1cdfe23c | objects/oakland03-1cdfe23c-e024-4a2c-a285-81c08de5a728.tif |
| 16 | b2bf6cca | objects/pictures/MARBLES-b2bf6cca-075f-4d5f-ae3b-b586a1290d02.tif |
+----+-----------+----------------------------------------------------------------------------------------------------+
+----+----------------------------+--------------+--------------------------+--------------+----------------------------------------------------------------------------------------------------+
| | event_type | outcome | detail | file_count | linked_uuids |
|----+----------------------------+--------------+--------------------------+--------------+----------------------------------------------------------------------------------------------------|
| 0 | ingestion | | | 10 | 81dad539, 8741d271, 98542eaa, c96e5ed0, 9b48de44, 74cf3a87, b1a98d54, 272de308, 81fcc6a2, b9d9689c |
| 1 | message digest calculation | | program="python" | 10 | 81dad539, 8741d271, 98542eaa, c96e5ed0, 9b48de44, 74cf3a87, b1a98d54, 272de308, 81fcc6a2, b9d9689c |
| 2 | message digest calculation | | program="python" | 6 | e612628e, 1267e234, 154480d3, c55e8d8b, 1cdfe23c, b2bf6cca |
| 3 | message digest calculation | | program="python" | 1 | 6c358b94 |
| 4 | virus check | Pass | program="ClamAV (clamd)" | 10 | 81dad539, 8741d271, 98542eaa, c96e5ed0, 9b48de44, 74cf3a87, b1a98d54, 272de308, 81fcc6a2, b9d9689c |
| 5 | format identification | Positive | program="Siegfried" | 8 | 98542eaa, c96e5ed0, 9b48de44, 74cf3a87, b1a98d54, 272de308, 81fcc6a2, b9d9689c |
| 6 | format identification | Tentative | program="Siegfried" | 2 | 81dad539, 8741d271 |
| 7 | validation | pass | program="JHOVE" | 0 | |
| 8 | name cleanup | | | 0 | |
| 9 | normalization | | program="convert" | 2 | 81dad539, e612628e |
| 10 | validation | pass | program="JHOVE" | 0 | |
| 11 | validation | pass | program="JHOVE" | 0 | |
| 12 | normalization | | program="convert" | 2 | 98542eaa, 1267e234 |
| 13 | normalization | | program="inkscape" | 2 | 9b48de44, 154480d3 |
| 14 | validation | pass | program="JHOVE" | 0 | |
| 15 | normalization | | program="inkscape" | 2 | 8a102bfe, c55e8d8b |
| 16 | validation | partial pass | program="JHOVE" | 0 | |
| 17 | validation | pass | program="JHOVE" | 0 | |
| 18 | normalization | | | 2 | 655232ed, 1cdfe23c |
| 19 | name cleanup | | | 0 | |
| 20 | validation | pass | program="JHOVE" | 0 | |
| 21 | validation | pass | program="JHOVE" | 0 | |
| 22 | normalization | | program="convert" | 2 | b2bf6cca, b2bf6cca |
| 23 | creation | | | 0 | |
+----+----------------------------+--------------+--------------------------+--------------+----------------------------------------------------------------------------------------------------+
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import print_function
import argparse
import logging
import sys
import lxml
import metsrw
import pandas as pd
from tabulate import tabulate
import ns
# Record layout handed to logging.basicConfig(format=...) in main().
LOGFORMAT = "%(asctime)-15s %(levelname)s: %(message)s"
# Timestamp layout handed to logging.basicConfig(datefmt=...) in main().
DATEFORMAT = "%m/%d/%Y %H:%M:%S"
# Substring looked for in an lxml XMLSchemaParseError message by validate_mets();
# when present, the failure is treated as "probably offline" and only logged at
# INFO level instead of being reported as an error.
OFFLINE_ERR = "The QName value '{http://www.w3.org/1999/xlink}simpleLink' does not resolve to a(n) attribute group definition."
def validate_mets(mets):
    """Validate a metsrw document against the METS XSD and report the result.

    The document is serialized and passed to ``metsrw.xsd_validate``. The
    outcome is printed to stdout. A schema parse error whose message contains
    ``OFFLINE_ERR`` is logged at INFO level (validation is skipped); any other
    schema parse error is logged at ERROR level. Nothing is returned.
    """
    try:
        is_valid, report = metsrw.xsd_validate(mets.serialize())
    except lxml.etree.XMLSchemaParseError as e:
        if OFFLINE_ERR not in str(e):
            logging.error(e)
        else:
            logging.info("We're likely offline, so ignoring validation")
    else:
        if is_valid is True:
            print("Schema validation via XSD is valid.")
        else:
            print("Validation result. {}".format(is_valid))
            print("Reason: {}".format(report))
def load_mets(filename):
    """Read *filename* with metsrw and return the resulting METS document.

    If the file is not well-formed XML, or cannot be read, the problem is
    logged at ERROR level and the process exits with status 1.
    """
    try:
        return metsrw.METSDocument.fromfile(filename)  # Reads a file
    except lxml.etree.XMLSyntaxError as e:
        logging.error("METS %s", e)
    except IOError as e:
        logging.error("File does not exist %s", e)
    # Only reached when one of the handlers above ran.
    sys.exit(1)
def _child_text(element, path):
    """Return the text of the first element matching *path*, or None if absent."""
    found = element.find(path, ns.METS_NSMAP)
    return None if found is None else found.text


def _program_detail(detail):
    """Return the last ``program...`` clause of a ';'-separated eventDetail string."""
    program = ""
    for clause in (detail or "").split(";"):
        clause = clause.strip()
        if clause.startswith("program"):
            program = clause
    return program


def _linked_ids(event):
    """Return the first 8 characters of each linked object identifier of *event*."""
    values = (
        _child_text(link, "premis:linkingObjectIdentifierValue")
        for link in event.findall("premis:linkingObjectIdentifier", ns.METS_NSMAP)
    )
    # Identifiers without a value element (or with empty text) are skipped.
    return [value[:8] for value in values if value]


def _events_frame(tree):
    """Build a DataFrame with one row per PREMIS event found in *tree*."""
    rows = []
    for event in tree.findall(".//premis:event", ns.METS_NSMAP):
        ids = _linked_ids(event)
        rows.append(
            {
                "event_type": (_child_text(event, "premis:eventType") or "").strip(),
                "outcome": (
                    _child_text(
                        event, "premis:eventOutcomeInformation/premis:eventOutcome"
                    )
                    or ""
                ).strip(),
                "detail": _program_detail(
                    _child_text(
                        event, "premis:eventDetailInformation/premis:eventDetail"
                    )
                ),
                "file_count": len(ids),
                "linked_uuids": ", ".join(ids),
            }
        )
    return pd.DataFrame(
        rows,
        columns=["event_type", "outcome", "detail", "file_count", "linked_uuids"],
    )


def _files_frame(tree):
    """Build a DataFrame mapping shortened file IDs to their FLocat paths."""
    href_attr = "{%s}href" % ns.METS_NSMAP["xlink"]
    id_col = []
    path_col = []
    for file_el in tree.findall(
        ".//mets:fileSec/mets:fileGrp/mets:file", ns.METS_NSMAP
    ):
        # Look the location up on the file element itself so a file with no
        # (or more than one) FLocat cannot shift the pairing of IDs and paths.
        locat = file_el.find("mets:FLocat", ns.METS_NSMAP)
        id_col.append(file_el.get("ID", "").replace("file-", "")[:8])
        path_col.append("" if locat is None else locat.get(href_attr, ""))
    return pd.DataFrame({"file_id": id_col, "path": path_col})


def load_via_lxml(filename):
    """Parse a METS file directly with lxml and print summary tables.

    Logs the number of PREMIS agents, objects and events and the number of
    fileSec entries, then prints two psql-style tables to stdout: one row per
    PREMIS event (type, outcome, program detail, linked object count and
    shortened linked identifiers) followed by an index of shortened file IDs
    and their paths. Events are listed individually; nothing is aggregated.

    Missing optional elements (event type, detail, outcome, linked identifier
    value, FLocat, ID attribute) are rendered as empty values.

    :param filename: path of the METS XML file to read.
    :returns: None.

    Errors raised by ``open`` or by the lxml parser are not caught here.
    """
    # Binary mode lets the XML parser apply the encoding declared in the file.
    with open(filename, "rb") as mets:
        tree = lxml.etree.parse(mets)

    agents = tree.findall(".//premis:agent", ns.METS_NSMAP)
    logging.info("Count (PREMIS Agents): %s", len(agents))
    objects = tree.findall(".//premis:object", ns.METS_NSMAP)
    logging.info("Count (PREMIS Objects): %s", len(objects))
    events = tree.findall(".//premis:event", ns.METS_NSMAP)
    logging.info("Count (PREMIS Events): %s", len(events))

    df_events = _events_frame(tree)
    df_index = _files_frame(tree)
    logging.info("Number of files: %s", len(df_index))

    print(tabulate(df_events, headers="keys", tablefmt="psql"))
    print(tabulate(df_index, headers="keys", tablefmt="psql"))
def use_mets(filename):
    """Load and validate *filename* with metsrw.

    Returns True when loading and validation ran to completion. Returns False
    on an IndexError, or on a metsrw ParseError (which is also logged at ERROR
    level), so the caller can choose another way of reading the file.
    """
    try:
        validate_mets(load_mets(filename))
    except IndexError:
        return False
    except metsrw.exceptions.ParseError as err:
        logging.error("Problem parsing the METS with METSRW: %s", err)
        return False
    return True
def main():
    """Command-line entry point.

    Parses the METS path and optional ``--logging`` level, configures logging
    (anything other than INFO, DEBUG, WARNING or ERROR falls back to DEBUG),
    then tries metsrw first and falls back to the plain lxml reader when
    ``use_mets`` reports failure.
    """
    parser = argparse.ArgumentParser(description="metsrw client to test validation")
    parser.add_argument(
        "mets",
        metavar="M",
        type=str,
        nargs=1,
        help="a mets file to parse",
    )
    parser.add_argument(
        "--logging",
        type=str,
        nargs="?",
        default="DEBUG",
        help="logging level, INFO, DEBUG, WARNING, ERROR",
    )
    args = parser.parse_args()

    known_levels = ("INFO", "DEBUG", "WARNING", "ERROR")
    level = args.logging if args.logging in known_levels else "DEBUG"
    logging.basicConfig(format=LOGFORMAT, datefmt=DATEFORMAT, level=level)

    mets_path = args.mets[0]
    if use_mets(mets_path):
        return
    logging.info("Using LXML instead...")
    load_via_lxml(mets_path)


if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
# Namespace map for parsing METS XML.
# Maps the prefixes used in the script's find()/findall() path expressions
# (e.g. "premis:event", "mets:file") to their namespace URIs.
METS_NSMAP = {
"mets": "http://www.loc.gov/METS/",
"premis": "http://www.loc.gov/premis/v3",
"dc": "http://purl.org/dc/elements/1.1/",
"dcterms": "http://purl.org/dc/terms/",
"xlink": "http://www.w3.org/1999/xlink",
}
# Short codes mapped to event type names.
# NOTE(review): not referenced by the script code visible alongside this
# module; presumably intended for grouping events by type - confirm before
# relying on it.
EVENTS = {
"CR": "creation",
"FI": "format identification",
"IN": "ingestion",
"MDC": "message digest calculation",
"NC": "name cleanup",
"NO": "normalization",
"VA": "validation",
"VC": "virus check",
}
06/30/2020 13:23:11 ERROR: Problem parsing the METS with METSRW: file-9663caa2-d84e-43a4-af30-54668bf6729e exists in structMap but not fileSec
06/30/2020 13:23:11 INFO: Using LXML instead...
06/30/2020 13:23:11 INFO: Count (PREMIS Agents): 3
06/30/2020 13:23:11 INFO: Count (PREMIS Objects): 19
06/30/2020 13:23:11 INFO: Count (PREMIS Events): 24
06/30/2020 13:23:11 INFO: Number of files: 17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment