Last active
August 19, 2024 18:15
-
-
Save birkin/79febf4b870df2f9bd1cfe889dc71df3 to your computer and use it in GitHub Desktop.
check xml catalog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Validates against internal mods xml-schema, indicating whether xmlcatalog is used. | |
""" | |
import os, unittest | |
from lxml import etree | |
import requests | |
class StrictCatalogResolver(etree.Resolver):
    """Resolver intended to restrict external lookups to the XML catalog.

    NOTE(review): ``resolve_filename`` only hands the given location to the
    parser as an input document; nothing in this class queries the catalog
    itself. Confirm that libxml2 applies ``XML_CATALOG_FILES`` to locations
    returned this way. If it does not, returning ``None`` from ``resolve``
    (which defers to the parser's default resolution) may be the better
    choice.
    """

    def resolve(self, system_url, public_id, context):
        """Return an input document for ``system_url``.

        Args:
            system_url: System identifier (URL or path) the parser wants to load.
            public_id: Public identifier of the resource; not used here.
            context: Opaque resolver context supplied by lxml; passed through.

        Returns:
            The input-document handle produced by ``resolve_filename``.

        Raises:
            ValueError: If no ``system_url`` was supplied, so there is nothing
                to resolve.
        """
        if not system_url:
            raise ValueError(f"Failed to resolve {system_url} using the catalog")
        # ``resolve_filename`` already returns the input-document handle the
        # parser expects; feeding that handle back into ``resolve_filename``
        # (as the previous version did) passes a non-string where a filename
        # is required.
        return self.resolve_filename(system_url, context)
class TestXMLCatalogProcessing(unittest.TestCase):
    """Checks that a minimal MODS document validates against its declared schema."""

    def setUp(self):
        """Point ``XML_CATALOG_FILES`` at the catalog for the duration of a test."""
        # Set up the environment variable to point to the XML catalog if needed;
        # remember the previous value so the change does not leak into other tests.
        previous = os.environ.get('XML_CATALOG_FILES')
        os.environ['XML_CATALOG_FILES'] = '/foo/catalog.xml'
        self.addCleanup(self._restore_catalog_env, previous)

    @staticmethod
    def _restore_catalog_env(previous):
        """Put ``XML_CATALOG_FILES`` back to ``previous`` (``None`` means unset)."""
        if previous is None:
            os.environ.pop('XML_CATALOG_FILES', None)
        else:
            os.environ['XML_CATALOG_FILES'] = previous

    def test_schema_validation_from_root(self):
        """Validate the sample MODS against the schema named in its root element.

        The schema URL is read from ``xsi:schemaLocation``, the schema is parsed
        with a ``no_network=True`` parser that has ``StrictCatalogResolver``
        registered, and the document is then validated against it.
        """
        ## Super-minimal MODS with schema location in the root element
        ## bad MODS; <blah> is not a valid element
        # xml_content = '''
        # <mods xmlns="http://www.loc.gov/mods/v3"
        #       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        #       xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-7.xsd">
        #     <titleInfo>
        #         <blah>Example Title</blah>
        #     </titleInfo>
        # </mods>
        # '''
        ## good MODS
        xml_content = '''
        <mods xmlns="http://www.loc.gov/mods/v3"
              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
              xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-7.xsd">
            <titleInfo>
                <!-- <blah>Example Title</blah> -->
                <title>Example Title</title>
            </titleInfo>
        </mods>
        '''

        ## Parse the XML --------------------------------------------
        parser = etree.XMLParser(no_network=True)  # no_network=False allows the parser to fetch the schema from the network
        parser.resolvers.add(StrictCatalogResolver())
        xml_doc = etree.fromstring(xml_content.encode('utf-8'), parser)

        ## Get the schema url from the root element -----------------
        # the url in the brackets represents the xsi namespace
        schema_locations = xml_doc.attrib['{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'].split()
        schema_url = schema_locations[1]  # the second item is the actual schema URL

        ## load the schema ------------------------------------------
        try:
            # Attempt to parse the schema using the strict resolver
            schema_doc = etree.parse(schema_url, parser)
        except (ValueError, OSError) as e:
            print(f'Error: {e}')
            print('Schema could not be resolved from the catalog, and no fallback was allowed.')
            # Without a schema document there is nothing to validate against,
            # so stop here with an explicit failure instead of a NameError below.
            self.fail(f'Schema could not be loaded from ``{schema_url}``: {e}')
        print(etree.tostring(schema_doc, pretty_print=True).decode('utf-8'))
        print('Schema loaded from the catalog')

        ## validate -------------------------------------------------
        # Build the validator from the parsed schema document (this object was
        # previously referenced without ever being created).
        xmlschema = etree.XMLSchema(schema_doc)
        validity = xmlschema.validate(xml_doc)
        if validity:
            print('XML passed validation against the schema')
        else:
            print(f'XML failed validation against the schema; errors, ``{xmlschema.error_log}``')
        self.assertTrue(validity)
if __name__ == '__main__':
    # Executed as a script: hand control to unittest's command-line runner.
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment