Last active
July 22, 2020 15:51
-
-
Save ross-spencer/8e8ba4a67920c31e39e57ac316b2fa0d to your computer and use it in GitHub Desktop.
METS/PREMIS as a table... (output plus code)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+----+-----------+----------------------------------------------------------------------------------------------------+ | |
| | file_id | path | | |
|----+-----------+----------------------------------------------------------------------------------------------------| | |
| 0 | 81dad539 | objects/799px-Euroleague-LE_Roma_vs_Toulouse_IC-27.bmp | | |
| 1 | 8741d271 | objects/BBhelmet.ai | | |
| 2 | 98542eaa | objects/G31DS.TIF | | |
| 3 | c96e5ed0 | objects/Nemastylis_geminiflora_Flower.PNG | | |
| 4 | 9b48de44 | objects/Vector.NET-Free-Vector-Art-Pack-28-Freedom-Flight.eps | | |
| 5 | 74cf3a87 | objects/WFPC01.GIF | | |
| 6 | b1a98d54 | objects/lion.svg | | |
| 7 | 272de308 | objects/oakland03.jp2 | | |
| 8 | 81fcc6a2 | objects/pictures/Landing_zone.jpg | | |
| 9 | b9d9689c | objects/pictures/MARBLES.TGA | | |
| 10 | 6c358b94 | objects/submissionDocumentation/transfer-Images2-a7291be4-dd33-4642-ba87-c6e2867e61db/METS.xml | | |
| 11 | e612628e | objects/799px-Euroleague-LE_Roma_vs_Toulouse_IC-27-e612628e-81e0-444b-8024-6c3c9ca6543f.tif | | |
| 12 | 1267e234 | objects/G31DS-1267e234-3ef3-4565-a106-98f066af6d94.tif | | |
| 13 | 154480d3 | objects/Vector.NET-Free-Vector-Art-Pack-28-Freedom-Flight-154480d3-e096-4826-990e-5d31ccd5675a.svg | | |
| 14 | c55e8d8b | objects/lion-c55e8d8b-c193-4e05-9601-4111416c40f1.svg | | |
| 15 | 1cdfe23c | objects/oakland03-1cdfe23c-e024-4a2c-a285-81c08de5a728.tif | | |
| 16 | b2bf6cca | objects/pictures/MARBLES-b2bf6cca-075f-4d5f-ae3b-b586a1290d02.tif | | |
+----+-----------+----------------------------------------------------------------------------------------------------+ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+----+----------------------------+--------------+--------------------------+--------------+----------------------------------------------------------------------------------------------------+ | |
| | event_type | outcome | detail | file_count | linked_uuids | | |
|----+----------------------------+--------------+--------------------------+--------------+----------------------------------------------------------------------------------------------------| | |
| 0 | ingestion | | | 10 | 81dad539, 8741d271, 98542eaa, c96e5ed0, 9b48de44, 74cf3a87, b1a98d54, 272de308, 81fcc6a2, b9d9689c | | |
| 1 | message digest calculation | | program="python" | 10 | 81dad539, 8741d271, 98542eaa, c96e5ed0, 9b48de44, 74cf3a87, b1a98d54, 272de308, 81fcc6a2, b9d9689c | | |
| 2 | message digest calculation | | program="python" | 6 | e612628e, 1267e234, 154480d3, c55e8d8b, 1cdfe23c, b2bf6cca | | |
| 3 | message digest calculation | | program="python" | 1 | 6c358b94 | | |
| 4 | virus check | Pass | program="ClamAV (clamd)" | 10 | 81dad539, 8741d271, 98542eaa, c96e5ed0, 9b48de44, 74cf3a87, b1a98d54, 272de308, 81fcc6a2, b9d9689c | | |
| 5 | format identification | Positive | program="Siegfried" | 8 | 98542eaa, c96e5ed0, 9b48de44, 74cf3a87, b1a98d54, 272de308, 81fcc6a2, b9d9689c | | |
| 6 | format identification | Tentative | program="Siegfried" | 2 | 81dad539, 8741d271 | | |
| 7 | validation | pass | program="JHOVE" | 0 | | | |
| 8 | name cleanup | | | 0 | | | |
| 9 | normalization | | program="convert" | 2 | 81dad539, e612628e | | |
| 10 | validation | pass | program="JHOVE" | 0 | | | |
| 11 | validation | pass | program="JHOVE" | 0 | | | |
| 12 | normalization | | program="convert" | 2 | 98542eaa, 1267e234 | | |
| 13 | normalization | | program="inkscape" | 2 | 9b48de44, 154480d3 | | |
| 14 | validation | pass | program="JHOVE" | 0 | | | |
| 15 | normalization | | program="inkscape" | 2 | 8a102bfe, c55e8d8b | | |
| 16 | validation | partial pass | program="JHOVE" | 0 | | | |
| 17 | validation | pass | program="JHOVE" | 0 | | | |
| 18 | normalization | | | 2 | 655232ed, 1cdfe23c | | |
| 19 | name cleanup | | | 0 | | | |
| 20 | validation | pass | program="JHOVE" | 0 | | | |
| 21 | validation | pass | program="JHOVE" | 0 | | | |
| 22 | normalization | | program="convert" | 2 | b2bf6cca, b2bf6cca | | |
| 23 | creation | | | 0 | | | |
+----+----------------------------+--------------+--------------------------+--------------+----------------------------------------------------------------------------------------------------+ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
from __future__ import print_function | |
import argparse | |
import logging | |
import sys | |
import lxml | |
import metsrw | |
import pandas as pd | |
from tabulate import tabulate | |
import ns | |
# Layout of each log record and of its timestamp, as passed to
# logging.basicConfig() in main().
LOGFORMAT = "%(asctime)-15s %(levelname)s: %(message)s"
DATEFORMAT = "%m/%d/%Y %H:%M:%S"

# Fragment of the lxml schema-parse error that validate_mets() treats as
# "probably offline" rather than as a genuine validation problem.
OFFLINE_ERR = (
    "The QName value '{http://www.w3.org/1999/xlink}simpleLink' "
    "does not resolve to a(n) attribute group definition."
)
def validate_mets(mets):
    """Run XSD validation on a METS document and report the verdict.

    The document is serialized and passed to ``metsrw.xsd_validate``. The
    outcome (and, on failure, the validation report) is printed to stdout.

    A schema parse error whose message contains ``OFFLINE_ERR`` is logged at
    INFO level and ignored; any other schema parse error is logged at ERROR
    level. Nothing is returned.

    :param mets: object exposing ``serialize()``; ``use_mets`` passes the
        value returned by ``load_mets``.
    """
    try:
        valid, report = metsrw.xsd_validate(mets.serialize())
        if valid is True:
            print("Schema validation via XSD is valid.")
        else:
            print("Validation result. {}".format(valid))
            print("Reason: {}".format(report))
    except lxml.etree.XMLSchemaParseError as err:
        if OFFLINE_ERR not in str(err):
            logging.error(err)
            return
        logging.info("We're likely offline, so ignoring validation")
def load_mets(filename):
    """Parse ``filename`` with metsrw and return the resulting document.

    If metsrw raises an XML syntax error or an ``IOError`` the problem is
    logged at ERROR level and the process exits with status 1.

    :param filename: path of the METS file to read.
    :returns: the value of ``metsrw.METSDocument.fromfile(filename)``.
    """
    try:
        return metsrw.METSDocument.fromfile(filename)
    except lxml.etree.XMLSyntaxError as err:
        # Keep a reference: the ``as`` name is unbound when the clause ends.
        template, problem = "METS %s", err
    except IOError as err:
        template, problem = "File does not exist %s", err
    logging.error(template, problem)
    sys.exit(1)
def _text_of(parent, path):
    """Return the stripped text of the first ``path`` match under ``parent``.

    Gives "" when the element is missing or has no text, so callers never
    dereference ``None``.
    """
    node = parent.find(path, ns.METS_NSMAP)
    if node is None or node.text is None:
        return ""
    return node.text.strip()


def _program_detail(detail):
    """Return the last ``;``-separated segment of ``detail`` starting with "program"."""
    segments = (part.strip() for part in detail.split(";"))
    programs = [part for part in segments if part.startswith("program")]
    return programs[-1] if programs else ""


def _linked_ids(event):
    """Return the first eight characters of each object identifier linked to ``event``."""
    ids = []
    for link in event.findall("premis:linkingObjectIdentifier", ns.METS_NSMAP):
        value = _text_of(link, "premis:linkingObjectIdentifierValue")
        if value:
            ids.append(value[:8])
    return ids


def _events_frame(events):
    """Build the per-event summary table from a list of ``premis:event`` elements."""
    rows = []
    for event in events:
        linked = _linked_ids(event)
        detail = _text_of(
            event, "premis:eventDetailInformation/premis:eventDetail"
        )
        rows.append(
            (
                _text_of(event, "premis:eventType"),
                _text_of(
                    event, "premis:eventOutcomeInformation/premis:eventOutcome"
                ),
                _program_detail(detail),
                len(linked),
                ", ".join(linked),
            )
        )
    return pd.DataFrame(
        rows,
        columns=["event_type", "outcome", "detail", "file_count", "linked_uuids"],
    )


def _files_frame(files):
    """Build the file index table (short id and path) from ``mets:file`` elements."""
    rows = []
    for file_el in files:
        # Look the location up under its own file element so that a file
        # without an FLocat cannot shift the paths of the files after it.
        flocat = file_el.find("mets:FLocat", ns.METS_NSMAP)
        href = None
        if flocat is not None:
            href = flocat.get("{http://www.w3.org/1999/xlink}href")
        rows.append((file_el.get("ID", "").replace("file-", "")[:8], href))
    return pd.DataFrame(rows, columns=["file_id", "path"])


def load_via_lxml(filename):
    """Summarise the PREMIS events and the file index of a METS file.

    The file is parsed directly with lxml. Counts of PREMIS agents, objects
    and events, and the number of files, are logged at INFO level. Two tables
    are then printed to stdout in tabulate's "psql" format:

    * one row per PREMIS event: type, outcome, the "program..." part of the
      event detail, the number of linked objects and their shortened ids;
    * one row per ``mets:file`` in the fileSec: shortened id and xlink:href.

    Optional elements (event type, detail, outcome, linked identifier value,
    FLocat) that are absent produce empty cells instead of raising.

    :param filename: path of the METS XML file.
    :returns: None.
    """
    # Import the submodule explicitly; a bare ``import lxml`` does not
    # guarantee that ``lxml.etree`` has been loaded.
    from lxml import etree

    # Binary mode lets the parser honour the document's own encoding
    # declaration.
    with open(filename, "rb") as mets:
        tree = etree.parse(mets)

    agents = tree.findall(".//premis:agent", ns.METS_NSMAP)
    logging.info("Count (PREMIS Agents): %s", len(agents))
    objects = tree.findall(".//premis:object", ns.METS_NSMAP)
    logging.info("Count (PREMIS Objects): %s", len(objects))
    events = tree.findall(".//premis:event", ns.METS_NSMAP)
    logging.info("Count (PREMIS Events): %s", len(events))

    df_events = _events_frame(events)

    files = tree.findall(".//mets:fileSec/mets:fileGrp/mets:file", ns.METS_NSMAP)
    df_index = _files_frame(files)
    logging.info("Number of files: %s", len(df_index))

    print(tabulate(df_events, headers="keys", tablefmt="psql"))
    print(tabulate(df_index, headers="keys", tablefmt="psql"))
def use_mets(filename):
    """Load and validate ``filename`` through metsrw.

    :param filename: path of the METS file.
    :returns: True when ``load_mets`` and ``validate_mets`` both complete;
        False when either raises ``IndexError`` (silently) or a metsrw
        ``ParseError`` (logged at ERROR level).
    """
    try:
        validate_mets(load_mets(filename))
    except IndexError:
        loaded = False
    except metsrw.exceptions.ParseError as err:
        logging.error("Problem parsing the METS with METSRW: %s", err)
        loaded = False
    else:
        loaded = True
    return loaded
def main():
    """Command-line entry point.

    Takes one positional METS path and an optional ``--logging`` level.
    Any level other than INFO, DEBUG, WARNING or ERROR falls back to DEBUG.
    The file is first processed with ``use_mets``; only when that reports
    failure is it summarised with ``load_via_lxml``.
    """
    parser = argparse.ArgumentParser(description="metsrw client to test validation")
    parser.add_argument(
        "mets", metavar="M", type=str, nargs=1, help="a mets file to parse"
    )
    parser.add_argument(
        "--logging",
        type=str,
        nargs="?",
        default="DEBUG",
        help="logging level, INFO, DEBUG, WARNING, ERROR",
    )
    args = parser.parse_args()

    known_levels = ("INFO", "DEBUG", "WARNING", "ERROR")
    level = args.logging if args.logging in known_levels else "DEBUG"
    logging.basicConfig(format=LOGFORMAT, datefmt=DATEFORMAT, level=level)

    # nargs=1 yields a one-element list.
    mets_path = args.mets[0]
    if use_mets(mets_path):
        return
    logging.info("Using LXML instead...")
    load_via_lxml(mets_path)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Prefix -> namespace URI map used when parsing METS XML.
METS_NSMAP = dict(
    (
        ("mets", "http://www.loc.gov/METS/"),
        ("premis", "http://www.loc.gov/premis/v3"),
        ("dc", "http://purl.org/dc/elements/1.1/"),
        ("dcterms", "http://purl.org/dc/terms/"),
        ("xlink", "http://www.w3.org/1999/xlink"),
    )
)
# Short code -> event type string.
EVENTS = dict(
    (
        ("CR", "creation"),
        ("FI", "format identification"),
        ("IN", "ingestion"),
        ("MDC", "message digest calculation"),
        ("NC", "name cleanup"),
        ("NO", "normalization"),
        ("VA", "validation"),
        ("VC", "virus check"),
    )
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
06/30/2020 13:23:11 ERROR: Problem parsing the METS with METSRW: file-9663caa2-d84e-43a4-af30-54668bf6729e exists in structMap but not fileSec | |
06/30/2020 13:23:11 INFO: Using LXML instead... | |
06/30/2020 13:23:11 INFO: Count (PREMIS Agents): 3 | |
06/30/2020 13:23:11 INFO: Count (PREMIS Objects): 19 | |
06/30/2020 13:23:11 INFO: Count (PREMIS Events): 24 | |
06/30/2020 13:23:11 INFO: Number of files: 17 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment