Last active
January 19, 2023 00:48
-
-
Save pkoppstein/addaedbb10b6fc97ff2b6b00123700ad to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Module metadata directive for the "xml" module.
module {
  "name": "xml",
  "description": "PEG parser for XML",
  "version": "0.0.1",
  "homepage": "https://gist.github.com/pkoppstein/addaedbb10b6fc97ff2b6b00123700ad",
  "license": "MIT",
  "author": "pkoppstein at gmail dot com"
};
# This is a standalone jq module that has been tested with jq, gojq, and fq. | |
# See the end of this file for example invocations. | |
# The main goal of this XML parser is to translate valid XML documents | |
# into valid JSON losslessly, not to check for validity. Thus the | |
# <?xml ... ?> header is optional, and "white space" is preserved when | |
# significant in accordance with the XML specification. However, a | |
# filter, `jsonify`, is provided for removing strings of the form | |
# '\n *$' in the "text" portions of the XML document. This filter also | |
# converts hex character codes of the form `&#x....;' to the | |
# corresponding character, e.g. "&#x00C9;mily" -> "Émily".
# Since "duplicate attribute names within a tag are not permitted with XML", | |
# we can group the attributes within a tag as a JSON object, as jq respects key ordering. | |
# Also, since XML tags cannot begin with `@`, PROLOG is rendered as a | |
# JSON object with key "@PROLOG" and likewise for COMMENT, DTD and CDATA. | |
# Consecutive attribute-value pairs are grouped together under "@attributes". | |
# The grammar is primarily adapted from: | |
# (1) https://peerj.com/preprints/1503/ | |
# (2) https://cs.lmu.edu/~ray/notes/xmlgrammar/ | |
# with the notable exception that (1) forgets to allow comments. | |
# Caveats | |
# 1) It has not been determined whether there are valid XML documents | |
# which this parser would not recognize. | |
# 2) It has not been determined whether XML comments will always be recognized as such. | |
# Note that XML disallows comments: | |
# . before the XML declaration and within comments | |
# . within attribute values | |
# Note also that in the XML grammar, `Name` cannot begin with "@" or "." per (2), | |
# which defines Name as follows: | |
# NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender | |
# Name ::= (Letter | '_' | ':') (NameChar)* | |
######################################################### | |
# PEG-to-jq transcription is based on these equivalences:
#   PEG operator      PEG syntax    jq equivalent
#   Sequence:         e1 e2         e1 | e2
#   Ordered choice:   e1 / e2       e1 // e2
#   Zero-or-more:     e*            star(E)
#   One-or-more:      e+            plus(E)
#   Optional:         e?            optional(E)
#   And-predicate:    &e            amp(E)    # no input is consumed
#   Not-predicate:    !e            neg(E)    # no input is consumed
# The idea is to pass a JSON object {remainder:_, result:_ } through a | |
# pipeline, consuming the text in .remainder and building up .result. | |
# PEG zero-or-more (e*): keep applying E to its own output; once E yields
# nothing, emit the state reached so far (the input itself if E never matched).
def star(E):
  def again: (E | again) // .;
  again;
# PEG one-or-more (e+): E must succeed once; after that it is applied
# repeatedly until it yields nothing, and the last state reached is emitted.
def plus(E):
  def more: (E | more) // .;
  E | more;
# PEG optional (e?): emit what E produces, or the unchanged input when E
# produces nothing.
def optional(E):
  E // .;
# PEG and-predicate (&e): succeed, without consuming anything, iff E succeeds.
# The original input is emitted exactly once: `first(E)` stops after E's first
# output, so an E that produces several outputs no longer duplicates the input.
def amp(E):
  . as $in
  | first(E)
  | $in;
# PEG not-predicate (!e): emit the unchanged input iff E produces no output
# at all; otherwise emit nothing.
def neg(E):
  select([E] | length == 0);
### Helper functions: | |
# Consume the regular expression $re anchored at the start of .remainder.
# On success: drop the matched prefix from .remainder and store the matched
# text in .match; .result is NOT touched.
# On failure: `match` yields nothing, so this filter emits empty.
# $re is wrapped in a non-capturing group so that the "^" anchor applies to
# the whole expression, including every branch of a top-level alternation
# such as "a|b".
def consume($re):
  (.remainder | match("^(?:" + $re + ")")) as $m
  | .remainder |= .[$m.length :]
  | .match = $m.string;
# Consume $re at the start of .remainder and append the matched text to
# .result; emits empty when $re does not match there.
def parse($re):
  consume($re)
  | .match as $token
  | .result += [$token];
# Like parse/1, but the matched text is converted with `tonumber` before it
# is appended to .result.
def parseNumber($re):
  consume($re)
  | (.match | tonumber) as $n
  | .result += [$n];
# Consume the literal string $s: if .remainder starts with $s, strip it from
# .remainder (leaving .result alone); otherwise emit empty.
def q($s):
  if (.remainder | startswith($s))
  then ($s | length) as $n | .remainder |= .[$n :]
  else empty
  end;
# Consume the literal string $s (see q/1) and append it to .result.
def literal($s):
  q($s)
  | .result = .result + [$s];
# Pass the input through only while some unparsed text is left in .remainder.
def nonempty:
  if (.remainder | length) > 0 then . else empty end;
# End-of-input check: pass the input through only when .remainder has
# length 0.
def eos:
  if (.remainder | length) == 0 then . else empty end;
# Required white space: one or more of space, newline, carriage return, tab.
# Emits empty when .remainder does not start with white space.
def _:
  consume("[ \n\r\t]+");
# Optional white space: zero or more of space, newline, carriage return, tab.
# The pattern can match the empty string, so this succeeds even when there is
# no leading white space.
def ws:
  consume("[ \n\r\t]*");
# Tagging
# box(E): run E against a copy of the state whose .result has been reset to
# null, then append whatever E accumulated as ONE element of the outer
# .result (the magic sauce). .remainder is taken from E's output.
def box(E):
  (.result = null | E) as {remainder: $rest, result: $inner}
  | .remainder = $rest
  | .result = .result + [$inner];
# box(name; E): like box/1, but what E accumulated is wrapped in an object
# keyed by `name` (the magic sauce). The accumulated pieces are concatenated
# with join(""); if join raises an error, the pieces are stored unjoined.
def box(name; E):
  (.result = null | E) as $inner
  | $inner.result as $parts
  | .remainder = $inner.remainder
  | (try ($parts | join("")) catch $parts) as $value
  | .result = .result + [{(name): $value}];
# A string that does NOT contain $regex: consume everything in .remainder up
# to (but not including) the first place where $regex matches, or all of
# .remainder when $regex never matches, and append the consumed text to
# .result as a single string.
#
# The text is located with one unanchored `match` and then sliced off, rather
# than being consumed one character at a time with the pattern ".": by
# default "." does not match a newline, so the character-wise approach
# stopped at the first line break and multi-line comments, CDATA sections and
# prologs could not be parsed.
#
# When nothing is consumed, null (not "") is appended, as before; box/2 turns
# that into "" via join("").
def string_except($regex):
  ((.remainder | match($regex) | .offset) // (.remainder | length)) as $n
  | .result += [ if $n == 0 then null else .remainder[:$n] end ]
  | .remainder |= .[$n :];
# Box E, then turn the boxed array [key, rest...] into the single-key object
# {key: [rest...]}.
def objectify(E):
  box(E)
  | .result[-1] |= (.[0] as $key | {($key): .[1:]});
# Box E, then turn the boxed array [key, value, ...] into the single-key
# object {key: value}; only the second element is kept as the value.
def keyvalue(E):
  box(E)
  | .result[-1] |= (.[0] as $key | {($key): .[1]});
######################################################### | |
# XML: parse the input string as an XML document and emit the accumulated
# .result array: optional {"@PROLOG": ...}, optional {"@DTD": ...}, any
# {"@COMMENT": ...} objects, and one object for the root element.
# Emits empty when the input cannot be parsed.
def XML:
  # Attribute value: double- or single-quoted; the quotes are dropped.
  def String    : ((consume("\"") | parse("[^\"]*") | consume("\"")) //
                   (consume("'")  | parse("[^']*")  | consume("'")));

  def CDataSec  : box("@CDATA"; q("<![CDATA[") | string_except("]]>") | q("]]>") ) | ws;

  def PROLOG    : box("@PROLOG"; q("<?xml") | string_except("\\?>") | q("?>"));

  # <!DOCTYPE ...> and similar declarations.
  # . neg(q("--")) keeps a comment ("<!--") from being taken for a DTD.
  # . The body is everything up to the closing ">", optionally including one
  #   bracketed internal subset "[ ... ]" (which may itself contain ">").
  #   An internal subset that contains "]" is not supported.
  def DTD       : box("@DTD";
                      q("<!")
                      | neg(q("--"))
                      | parse("[^>\\[]*(?:\\[[^\\]]*\\])?[^>]*")
                      | q(">"));

  # The XML spec specifically disallows double-hyphen within comments
  def COMMENT   : box("@COMMENT"; q("<!--") | string_except("--") | q("-->"));

  # `<` and '&' are disallowed per W3C but entity references require '&'
  def CharData  : parse("[^<]+");

  # This is more permissive than required:
  def Name      : parse("[A-Za-z:_][^/=<>\n\r\t ]*");

  # name = "value", rendered as {name: value}
  def Attribute : keyvalue(Name | ws | q("=") | ws | String | ws);

  # One or more attributes, merged into {"@attributes": {...}}
  def Attributes: box( plus(Attribute) ) | .result[-1] |= {"@attributes": add} ;

  # <foo> must be matched with </foo>; <foo/> is also accepted.
  # Rendered as {foo: [attributes-and-content...]}.
  def Element :
    def Content : star(Element // CDataSec // CharData // COMMENT);
    objectify( q("<")
               | Name
               | .result[-1] as $name      # remembered to match the closing tag
               | ws
               | (Attributes // ws)
               | ( (q("/>")
                    // (q(">") | Content | q("</") | q($name) | ws | q(">")))
                   | ws) ) ;

  {remainder: . }
  | ws
  | optional(PROLOG) | ws
  | star(COMMENT | ws)          # comments are allowed between the prolog and the DTD
  | optional(DTD) | ws
  | star(COMMENT | ws)
  | Element | ws                # for HTML, one would use star(Element) here
  | star(COMMENT | ws)
  | .result;
# Convert a string of hexadecimal digits (either case) to a number.
# The digits are processed from least to most significant, carrying the
# current power of 16 along with the running total. The input is not
# validated; the empty string yields 0.
def hex2i:
  # code point of "0".."9" or "a".."f"  ->  0..15
  def digit: if . >= 87 then . - 87 else . - 48 end;
  reduce (ascii_downcase | explode | reverse | .[] | digit) as $d
    ({weight: 1, total: 0};
     .total += $d * .weight
     | .weight *= 16)
  | .total;
# Replace hexadecimal character references of the form "&#x<hex digits>;"
# in the input string with the character they denote,
# e.g. "&#x00C9;mily" and "&#xC9;mily" both become "Émily".
# . Any number of hex digits is accepted (not just exactly four), and only
#   hex digits are accepted between "&#x" and ";".
# . A reference whose value is above 0x10FFFF or in the surrogate range
#   0xD800-0xDFFF is left in the text unchanged rather than handed to
#   `implode`.
def hexcode2json:
  gsub("&#x(?<x>[0-9A-Fa-f]+);";
       .x as $hex
       | ($hex | hex2i) as $cp
       | if $cp <= 1114111 and ($cp < 55296 or $cp > 57343)
         then [$cp] | implode
         else "&#x\($hex);"
         end);
# Tidy up the output of XML:
# . in every array, drop string elements that consist of a newline followed
#   only by spaces (typically indentation between tags);
# . in every string, convert hex character references via hexcode2json.
def jsonify:
  def blank_text: type == "string" and test("^\n *$");
  walk(
    if type == "string" then hexcode2json
    elif type == "array" then map(select(blank_text | not))
    else .
    end);
# Usage: | |
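# The following examples assume that this module is named xml.jq.
# With -R alone, jq reads each input line as a separate string; if the XML
# document spans more than one line, use -Rs so that the whole document is
# read as a single string.
# jq -R 'include "xml" {search: "."}; XML'
# jq -R 'include "xml"; XML | jsonify[]'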
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment