Skip to content

Instantly share code, notes, and snippets.

@birkin
Last active August 19, 2024 18:15
Show Gist options
  • Save birkin/79febf4b870df2f9bd1cfe889dc71df3 to your computer and use it in GitHub Desktop.
Save birkin/79febf4b870df2f9bd1cfe889dc71df3 to your computer and use it in GitHub Desktop.
check xml catalog
"""
Validates a minimal MODS document against the XML schema named in its own xsi:schemaLocation attribute, reporting whether that schema could be loaded via the XML catalog.
"""
import os, unittest
from lxml import etree
import requests
class StrictCatalogResolver(etree.Resolver):
    """Resolver that never supplies a document itself.

    The previous implementation treated the return value of
    ``resolve_filename()`` as a URL string, tested it for truthiness and
    passed it back into ``resolve_filename()``.  That return value is an
    opaque input-document object, so the truthiness test always succeeded
    (the ``ValueError`` branch was unreachable) and the second call was
    handed a non-string argument.

    NOTE(review): per the lxml resolver documentation, returning ``None``
    from ``resolve()`` passes the lookup on to the next resolver and finally
    to libxml2's default loader, which is where ``XML_CATALOG_FILES`` is
    consulted; returning a ``resolve_filename()`` result would bypass that
    catalog lookup.  Confirm against the lxml/libxml2 versions in use.
    """

    def resolve(self, system_url, public_id, context):
        """Decline to resolve, leaving the lookup to the default loader.

        Args:
            system_url: System identifier (URL) of the requested resource.
            public_id: Public identifier of the requested resource, if any.
            context: Opaque lxml resolver context (unused here).

        Returns:
            None, so that this resolver does not supply a document.

        Raises:
            ValueError: If neither a system URL nor a public id was given,
                i.e. there is nothing that could be looked up at all.
        """
        if not system_url and not public_id:
            raise ValueError(f"Failed to resolve {system_url} using the catalog")
        # Returning None (rather than a resolve_*() result) means this
        # resolver opens nothing on its own.
        return None
class TestXMLCatalogProcessing(unittest.TestCase):
    """Validate a minimal MODS record against the schema it points to.

    The schema URL is read from the record's ``xsi:schemaLocation`` and the
    schema is parsed with a no-network parser that has
    ``StrictCatalogResolver`` attached; the test prints whether that load
    succeeded and fails if it did not.
    """

    def setUp(self):
        """Point ``XML_CATALOG_FILES`` at the catalog for the test's duration."""
        previous = os.environ.get('XML_CATALOG_FILES')
        os.environ['XML_CATALOG_FILES'] = '/foo/catalog.xml'
        # Restore the caller's environment so other tests are unaffected.
        self.addCleanup(self._restore_catalog_env, previous)
        # NOTE(review): libxml2 may read XML_CATALOG_FILES only when its
        # catalogs are first initialised in the process -- confirm that
        # setting it here is early enough.

    @staticmethod
    def _restore_catalog_env(previous):
        """Put ``XML_CATALOG_FILES`` back to ``previous`` (unset if None)."""
        if previous is None:
            os.environ.pop('XML_CATALOG_FILES', None)
        else:
            os.environ['XML_CATALOG_FILES'] = previous

    def test_schema_validation_from_root(self):
        """Load the schema named in the root element and validate against it."""
        ## Super-minimal MODS with schema location in the root element.
        ## This is the "good" record; to exercise a failure, replace <title>
        ## with an element MODS does not define (e.g. <blah>).
        xml_content = '''
        <mods xmlns="http://www.loc.gov/mods/v3"
              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
              xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-7.xsd">
            <titleInfo>
                <!-- <blah>Example Title</blah> -->
                <title>Example Title</title>
            </titleInfo>
        </mods>
        '''

        ## Parse the XML --------------------------------------------
        # no_network=False would allow the parser to fetch the schema from the network
        parser = etree.XMLParser(no_network=True)
        parser.resolvers.add(StrictCatalogResolver())
        xml_doc = etree.fromstring(xml_content.encode('utf-8'), parser)

        ## Get the schema url from the root element -----------------
        # The URL in braces is the xsi namespace; the attribute value is a
        # "namespace schema-url" pair, so the second token is the schema URL.
        schema_locations = xml_doc.attrib[
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'
        ].split()
        schema_url = schema_locations[1]

        ## Load the schema ------------------------------------------
        try:
            schema_doc = etree.parse(schema_url, parser)
        except (ValueError, OSError, etree.XMLSyntaxError) as e:
            print(f'Error: {e}')
            print('Schema could not be resolved from the catalog, and no fallback was allowed.')
            # Without a schema document there is nothing to validate against,
            # so fail here instead of hitting an unbound name further down.
            self.fail(f'Schema ``{schema_url}`` could not be loaded: {e}')
        print(etree.tostring(schema_doc, pretty_print=True).decode('utf-8'))
        print('Schema loaded from the catalog')

        ## Validate -------------------------------------------------
        # Build the validator from the parsed schema document (the original
        # code used ``xmlschema`` without ever creating it).
        xmlschema = etree.XMLSchema(schema_doc)
        validity = xmlschema.validate(xml_doc)
        if validity:
            print('XML passed validation against the schema')
        else:
            print(f'XML failed validation against the schema; errors, ``{xmlschema.error_log}``')
        self.assertTrue(validity, f'validation errors: {xmlschema.error_log}')
# Run the test case(s) in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment