Last active
May 19, 2023 13:59
-
-
Save stanio/62b3869518c5a060d006166b305068da to your computer and use it in GitHub Desktop.
Fast XML document type detection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * This module, both source code and documentation,
 * is in the Public Domain, and comes with NO WARRANTY.
 */
//package ; | |
import static javax.xml.XMLConstants.XMLNS_ATTRIBUTE; // requires java.xml | |
// requires java.base | |
import java.io.IOException; | |
import java.io.StringReader; | |
// requires java.xml | |
import javax.xml.XMLConstants; | |
import javax.xml.parsers.FactoryConfigurationError; | |
import javax.xml.parsers.ParserConfigurationException; | |
import javax.xml.parsers.SAXParserFactory; | |
import org.xml.sax.Attributes; | |
import org.xml.sax.InputSource; | |
import org.xml.sax.Locator; | |
import org.xml.sax.SAXException; | |
import org.xml.sax.SAXNotRecognizedException; | |
import org.xml.sax.SAXNotSupportedException; | |
import org.xml.sax.XMLReader; | |
import org.xml.sax.ext.DefaultHandler2; | |
import org.xml.sax.ext.Locator2; | |
import org.xml.sax.helpers.AttributesImpl; | |
/** | |
* Encapsulates the prolog ({@code xml} and {@code DOCTYPE} declarations, if | |
* any) and root element information of an XML document used to identify its | |
* content type. | |
* | |
* @see <a href="https://www.w3.org/TR/xml/">Extensible Markup Language</a> | |
*/ | |
public class XMLDoctype { | |
private static ThreadLocal<XMLPrologHandler> | |
localHandler = new ThreadLocal<XMLPrologHandler>() { | |
@Override protected XMLPrologHandler initialValue() { | |
return new XMLPrologHandler(); | |
} | |
}; | |
String xmlVersion; | |
String encoding; | |
String name; | |
String publicId; | |
String systemId; | |
String rootQName; | |
Attributes rootAttributes; | |
public XMLDoctype() { | |
// empty | |
} | |
/** | |
* The returned {@code XMLDoctype} is guaranteed to have root element info | |
* available. If parsing up to incl. the start tag of the root element is | |
* not successful a {@code SAXException} (or possibly {@code IOException}) | |
* will be thrown. | |
* <p> | |
* The successful return of an {@code XMLDoctype} doesn't guarantee the | |
* full document is <a href="https://www.w3.org/TR/xml/#sec-well-formed" | |
* >well-formed</a>.</p> | |
* | |
* REVISIT: Note that external entities are not processed, and maybe allow | |
* {@code DOCTYPE} without root element to "successfully" handle: | |
* <pre> | |
* {@code <!DOCTYPE foo SYSTEM "..."> | |
* &bar;}</pre> | |
* <p> | |
* where {@code &bar;} is declared in the DTD external subset.</p> | |
* | |
* @param source the input source to parse | |
* @return a {@code XMLDoctype} with at least root element info available | |
* @throws IOException if I/O error occurs | |
* @throws SAXException if XML parsing error occurs | |
*/ | |
public static XMLDoctype of(InputSource source) throws IOException, SAXException { | |
// REVISIT: Less likely scenario but limit the source data, so we don't | |
// end up reading megabytes of misc. content (comments and PIs) before, | |
// if ever, reaching root document element. | |
return localHandler.get().parse(source); | |
} | |
public static XMLDoctype of(String url) throws IOException, SAXException { | |
return of(new InputSource(url)); | |
} | |
public String getXmlVersion() { | |
return xmlVersion; | |
} | |
public String getEncoding() { | |
return encoding; | |
} | |
public String getName() { | |
return name; | |
} | |
public String getPublicId() { | |
return publicId; | |
} | |
public String getSystemId() { | |
return systemId; | |
} | |
public String getRootNamespace() { | |
int colonIndex = rootQName.indexOf(':'); | |
if (colonIndex < 0) { | |
return rootAttributes.getValue(XMLNS_ATTRIBUTE); | |
} | |
return rootAttributes.getValue(XMLNS_ATTRIBUTE | |
+ ':' + rootQName.substring(0, colonIndex)); | |
} | |
public String getRootLocalName() { | |
int colonIndex = rootQName.lastIndexOf(':'); | |
return colonIndex < 0 ? rootQName | |
: rootQName.substring(colonIndex + 1); | |
} | |
public String getRootQName() { | |
return rootQName; | |
} | |
public Attributes getRootAttributes() { | |
return rootAttributes; | |
} | |
@Override | |
public String toString() { | |
return "Doctype(xmlVersion=" + xmlVersion | |
+ ", encoding=" + encoding | |
+ ", name=" + name | |
+ ", publicId=" + publicId | |
+ ", systemId=" + systemId | |
+ ", rootQName=" + rootQName | |
+ ", rootAttributes=" + rootAttributes + ")"; | |
} | |
} // class XMLDoctype | |
class XMLPrologHandler extends DefaultHandler2 { | |
private static SAXParserFactory saxParserFactory; | |
private XMLDoctype doctype; | |
private Locator locator; | |
private XMLReader xmlReader; | |
public XMLPrologHandler() { | |
try { | |
synchronized (XMLPrologHandler.class) { | |
xmlReader = saxParserFactory().newSAXParser().getXMLReader(); | |
} | |
} catch (SAXException | ParserConfigurationException e) { | |
throw new IllegalStateException(e); | |
} | |
xmlReader.setContentHandler(this); | |
xmlReader.setErrorHandler(this); | |
xmlReader.setEntityResolver(this); | |
try { | |
xmlReader.setProperty("http://xml.org/sax/properties/" | |
+ "lexical-handler", this); | |
} catch (SAXNotRecognizedException | SAXNotSupportedException e) { | |
// Optional | |
} | |
} | |
private static SAXParserFactory saxParserFactory() { | |
if (saxParserFactory == null) { | |
try { | |
SAXParserFactory spf = SAXParserFactory.newInstance(); | |
spf.setNamespaceAware(false); | |
spf.setValidating(false); | |
spf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); | |
saxParserFactory = spf; | |
} catch (SAXException | ParserConfigurationException e) { | |
throw new FactoryConfigurationError(e); | |
} | |
} | |
return saxParserFactory; | |
} | |
public XMLDoctype parse(InputSource source) throws IOException, SAXException { | |
XMLDoctype doctype = this.doctype = new XMLDoctype(); | |
try { | |
xmlReader.parse(source); | |
} catch (StopParseException e) { | |
// Found root element | |
} finally { | |
this.locator = null; | |
this.doctype = null; | |
} | |
return doctype; | |
} | |
@Override | |
public void setDocumentLocator(Locator locator) { | |
this.locator = locator; | |
} | |
@Override | |
public void startDTD(String name, String publicId, String systemId) | |
throws SAXException { | |
doctype.name = name; | |
doctype.publicId = publicId; | |
doctype.systemId = systemId; | |
} | |
@Override | |
public void startElement(String uri, | |
String localName, | |
String qName, | |
Attributes attributes) | |
throws SAXException { | |
doctype.rootQName = qName; | |
doctype.rootAttributes = new AttributesImpl(attributes); | |
if (locator instanceof Locator2) { | |
Locator2 locator2 = (Locator2) locator; | |
doctype.xmlVersion = locator2.getXMLVersion(); | |
doctype.encoding = locator2.getEncoding(); | |
} | |
throw StopParseException.INSTANCE; | |
} | |
@Override | |
public InputSource resolveEntity(String name, | |
String publicId, | |
String baseURI, | |
String systemId) { | |
// Don't resolve any external entities – just replace with empty | |
// content. A more general accessExternalDTD="" setup. | |
return new InputSource(new StringReader("")); | |
} | |
/** | |
* Thrown from content handlers to stop parsing eagerly. Doesn't signify | |
* an exception per se. Should be handled explicitly around parse: | |
* <pre> | |
* XMLReader parser; | |
* ... | |
* try { | |
* parser.parse(...); | |
* } catch (StopParseException e) { | |
* // Not an error. Content analysis has completed | |
* // before reaching the end of the document. | |
* }</pre> | |
*/ | |
@SuppressWarnings("serial") | |
private static class StopParseException extends SAXException { | |
/** | |
* Shared instance to avoid the overhead of object creation for each | |
* stop signal. Doesn't have and doesn't fill in stack traces. | |
*/ | |
static final StopParseException INSTANCE = new StopParseException(); | |
private StopParseException() { | |
super("Parsing stopped from content handler"); | |
super.initCause(null); // Prevent overwriting the cause | |
} | |
@Override | |
public Throwable fillInStackTrace() { | |
return this; // Don't fill in stack trace | |
} | |
} // class StopParseException | |
} // class XMLPrologHandler |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment