Last active
November 27, 2024 01:35
-
-
Save aperezdc/2222960 to your computer and use it in GitHub Desktop.
Fairly complete LPeg grammar for parsing XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- | |
-- LPeg-based XML parser. | |
-- | |
-- * Grammar term names are the same as in the XML 1.1 | |
-- specification: http://www.w3.org/TR/xml11/ | |
-- * Action functions are missing. | |
-- | |
-- Copyright (C) 2012 Adrian Perez <[email protected]> | |
-- Distribute under terms of the MIT license. | |
-- | |
local lpeg = require "lpeg" | |
local V, R, S, P = lpeg.V, lpeg.R, lpeg.S, lpeg.P | |
local grammar = { "document" } | |
do local _ENV = grammar | |
-- S ::= (#x20 | #x9 | #xD | #xA)+ | |
SS = (S " \t\r\n")^1 | |
SSopt = SS ^ -1 | |
-- NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | |
NameStartChar = S ":_" + R ("az", "AZ") | |
-- NameChar ::= NameStartChar | "-" | "." | [0-9] | |
NameChar = NameStartChar + S "-." + R "09" | |
-- Name ::= NameStartChar (NameChar)* | |
Name = NameStartChar * NameChar^0 | |
-- Names ::= Name (#x20 Name)* | |
Names = Name * (" " * Name)^0 | |
-- Nmtoken ::= (NameChar)+ | |
Nmtoken = NameChar^1 | |
-- Nmtokens ::= Nmtoken (#x20 Nmtoken)* | |
Nmtokens = Nmtoken * (" " * Nmtoken)^0 | |
-- CharRef ::= '&#' [0-9]+ ';' | |
-- | '&#x' [0-9a-fA-F]+ ';' | |
CharRef = ("&#" * (R "09")^1 * ";") | |
+ ("&#x" * (R ("09", "af", "AF"))^1 * ";") | |
-- EntityRef ::= '&' Name ';' | |
-- PEReference ::= '%' Name ';' | |
-- Reference ::= EntityRef | CharRef | |
-- | |
EntityRef = "&" * Name * ";" | |
PEReference = "%" * Name * ";" | |
Reference = EntityRef + CharRef | |
-- EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | |
-- | "'" ([^%&'] | PEReference | Reference)* "'" | |
EntityValue = ('"' * ((1 - S '%&"') + PEReference + Reference)^0 * '"') | |
+ ("'" * ((1 - S "%&'") + PEReference + Reference)^0 * "'") | |
-- AttValue ::= '"' ([^<&"] | Reference)* '"' | |
-- | "'" ([^<&'] | Reference)* "'" | |
AttValue = ('"' * ((1 - S '<&"') + Reference)^0 * '"') | |
+ ("'" * ((1 - S "<&'") + Reference)^0 * "'") | |
-- SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") | |
SystemLiteral = ('"' * (1 - P '"')^0 * '"') | |
+ ("'" * (1 - P "'")^0 * "'") | |
-- PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] | |
PubidChar = S " \r\n-'()+,./:=?;!*#@$_%" + R ("az", "AZ", "09") | |
-- PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" | |
PubidLiteral = ('"' * PubidChar^0 * '"') | |
+ ("'" * (PubidChar - "'")^0 * "'") | |
-- CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) | |
CharData = (1 - (S "<&" + "]]>"))^0 | |
-- Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' | |
Comment = "<!--" | |
* ((1 - S "-") + ("-" * (1 - S "-")))^0 | |
* "-->" | |
-- PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
-- Only the exact (case-insensitive) name "xml" is reserved. In LPeg
-- `a - b` fails as soon as `b` matches a *prefix*, so the exclusion is
-- terminated with `-NameChar`; otherwise valid targets that merely begin
-- with those letters (e.g. "xml-stylesheet") would be rejected too.
PITarget = Name - (S "xX" * S "mM" * S "lL" * -NameChar)
-- PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' | |
PI = "<?" | |
* PITarget | |
* (SS * (1 - P "?>")^1)^0 | |
* "?>" | |
-- CDSect ::= CDStart CData CDEnd | |
-- CDStart ::= '<![CDATA[' | |
-- CData ::= (Char* - (Char* ']]>' Char*)) | |
-- CDEnd ::= ']]>' | |
CData = (1 - P "]]>")^0 | |
CDSect = "<![CDATA[" * CData * "]]>" | |
-- prolog ::= XMLDecl Misc* (doctypedecl Misc*)? | |
prolog = (V "XMLDecl") ^ -1 | |
* (V "Misc")^0 | |
* (V "doctypedecl" * (V "Misc")^0) ^ -1 | |
-- Eq ::= S? '=' S? | |
Eq = SSopt * "=" * SSopt | |
-- SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) | |
SDDecl = SS | |
* "standalone" | |
* Eq | |
* ( ("'" * (P "yes" + "no") * "'") | |
+ ('"' * (P "yes" + "no") * '"') | |
) | |
-- XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' | |
XMLDecl = "<?xml" | |
* V "VersionInfo" | |
* (V "EncodingDecl") ^ -1 | |
* SDDecl ^ -1 | |
* SSopt | |
* "?>" | |
-- VersionNum ::= '1.0' | '1.1' | |
VersionNum = P "1.0" | |
+ P "1.1" | |
-- VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') | |
VersionInfo = SS | |
* "version" | |
* Eq | |
* ( ("'" * VersionNum * "'") | |
+ ('"' * VersionNum * '"') | |
) | |
-- Misc ::= Comment | PI | S | |
Misc = Comment + PI + SS | |
-- doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' | |
doctypedecl = "<!DOCTYPE" | |
* SS | |
* Name | |
* (SS * V "ExternalID") ^ -1 | |
* SSopt | |
* ( "[" | |
* V "intSubset" | |
* "]" | |
* SSopt | |
) ^ -1 | |
* ">" | |
-- document ::= ( prolog element Misc* ) | |
document = prolog * V "element" * Misc^0 | |
-- DeclSep ::= PEReference | S | |
DeclSep = PEReference + SS | |
-- choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' | |
-- seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' | |
choice = "(" * SSopt * V "cp" * (SSopt * "|" * SSopt * V "cp")^1 * SSopt * ")" | |
seq = "(" * SSopt * V "cp" * (SSopt * "," * SSopt * V "cp")^0 * SSopt * ")" | |
-- cp ::= (Name | choice | seq) ('?' | '*' | '+')? | |
cp = (Name + choice + seq) * (S "?*+") ^ -1 | |
-- children ::= (choice | seq) ('?' | '*' | '+')? | |
children = (choice + seq) * (S "?*+") ^ -1 | |
-- Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')' | |
Mixed = "(" * SSopt * "#PCDATA" * (SSopt * "|" * SSopt * Name)^0 * SSopt * ")*" | |
+ "(" * SSopt * "#PCDATA" * SSopt * ")" | |
-- contentspec ::= 'EMPTY' | 'ANY' | Mixed | children | |
contentspec = P "EMPTY" + P "ANY" + Mixed + children | |
-- elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' | |
elementdecl = "<!ELEMENT" * SS * Name * SS * contentspec * SSopt * ">" | |
-- EnumeratedType ::= NotationType | Enumeration | |
-- NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' | |
-- Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' | |
-- AttType ::= StringType | TokenizedType | EnumeratedType | |
-- StringType ::= 'CDATA' | |
-- TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES'
-- | 'NMTOKEN' | 'NMTOKENS' | |
-- | |
NotationType = "NOTATION" * SS * "(" * SSopt * Name * (SSopt * "|" * SSopt * Name)^0 * SSopt * ")" | |
Enumeration = "(" * SSopt * Nmtoken * (SSopt * "|" * SSopt * Nmtoken)^0 * SSopt * ")" | |
-- LPeg's choice operator is ordered and does not backtrack into an
-- alternative that already succeeded, so a keyword that is a prefix of
-- another one ("ID" / "IDREF" / "IDREFS", "NMTOKEN" / "NMTOKENS") has to
-- be listed after the longer keyword, otherwise the longer one can never
-- be recognised.
AttType = P "CDATA"
        + P "IDREFS"
        + P "IDREF"
        + P "ID"
        + P "ENTITY"
        + P "ENTITIES"
        + P "NMTOKENS"
        + P "NMTOKEN"
        + NotationType
        + Enumeration
-- DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) | |
DefaultDecl = P "#REQUIRED" | |
+ P "#IMPLIED" | |
+ (((P "#FIXED" * SS) ^ -1) * AttValue) | |
-- AttDef ::= S Name S AttType S DefaultDecl | |
AttDef = SS * Name * SS * AttType * SS * DefaultDecl | |
-- AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' | |
AttlistDecl = "<!ATTLIST" * SS * Name * AttDef^0 * SSopt * ">" | |
-- ExternalID ::= 'SYSTEM' S SystemLiteral | |
-- | 'PUBLIC' S PubidLiteral S SystemLiteral | |
-- | |
ExternalID = "SYSTEM" * SS * SystemLiteral | |
+ "PUBLIC" * SS * PubidLiteral * SS * SystemLiteral | |
-- NDataDecl ::= S 'NDATA' S Name | |
NDataDecl = SS * "NDATA" * SS * Name | |
-- EntityDecl ::= GEDecl | PEDecl | |
-- GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' | |
-- PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' | |
-- EntityDef ::= EntityValue | (ExternalID NDataDecl?) | |
-- PEDef ::= EntityValue | ExternalID | |
-- | |
PEDef = EntityValue + ExternalID | |
EntityDef = EntityValue + (ExternalID * NDataDecl ^ -1) | |
GEDecl = "<!ENTITY" * SS * Name * SS * EntityDef * SSopt * ">" | |
PEDecl = "<!ENTITY" * SS * "%" * SS * Name * SS * PEDef * SSopt * ">" | |
EntityDecl = GEDecl + PEDecl | |
-- PublicID ::= 'PUBLIC' S PubidLiteral | |
PublicID = "PUBLIC" * SS * PubidLiteral | |
-- NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' | |
NotationDecl = "<!NOTATION" * SS * Name * SS * (ExternalID + PublicID) * SSopt * ">" | |
-- markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment | |
markupdecl = elementdecl | |
+ AttlistDecl | |
+ EntityDecl | |
+ NotationDecl | |
+ PI | |
+ Comment | |
-- conditionalSect ::= includeSect | ignoreSect | |
-- includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>' | |
-- ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>' | |
-- ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)* | |
-- Ignore ::= Char* - (Char* ('<![' | ']]>') Char*) | |
-- | |
-- Ignore is a (possibly empty) run of characters that contains neither
-- "<![" nor "]]>", as the production above requires (Char*).
Ignore = (1 - (P "<![" + P "]]>"))^0
-- Nested sections are closed by the full "]]>" delimiter.
ignoreSectContents = Ignore * ("<![" * V "ignoreSectContents" * "]]>" * Ignore)^0
-- ignoreSectContents may match the empty string and already consumes
-- everything up to the first unbalanced "]]>", so it is applied once:
-- repeating a nullable pattern with ^0 is rejected by LPeg and would not
-- match anything more.
conditionalSect = ("<![" * SSopt * "INCLUDE" * SSopt * "[" * V "extSubsetDecl" * "]]>")
                + ("<![" * SSopt * "IGNORE" * SSopt * "[" * ignoreSectContents * "]]>")
-- intSubset ::= (markupdecl | DeclSep)* | |
intSubset = (markupdecl + DeclSep)^0 | |
-- extSubsetDecl ::= (markupdecl | conditionalSect | DeclSep)* | |
extSubsetDecl = (markupdecl + conditionalSect + DeclSep)^0 | |
-- EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* | |
EncName = R ("AZ", "az") * ((R ("AZ", "az", "09") + S "._") + "-")^0 | |
-- EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) | |
EncodingDecl = SS | |
* "encoding" | |
* Eq | |
* ( '"' * EncName * '"' | |
+ "'" * EncName * "'" | |
) | |
-- TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' | |
TextDecl = "<?xml" | |
* VersionInfo ^ -1 | |
* EncodingDecl | |
* SSopt | |
* "?>" | |
-- extSubset ::= TextDecl? extSubsetDecl | |
extSubset = TextDecl ^ -1 | |
* extSubsetDecl | |
-- Attribute ::= Name Eq AttValue | |
Attribute = Name * Eq * AttValue | |
-- STag ::= '<' Name (S Attribute)* S? '>' | |
STag = "<" * Name * (SS * Attribute)^0 * SSopt * ">" | |
-- ETag ::= '</' Name S? '>' | |
ETag = "</" * Name * SSopt * ">" | |
-- EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' | |
EmptyElemTag = "<" * Name * (SS * Attribute)^0 * SSopt * "/>" | |
-- element ::= EmptyElemTag | STag content ETag
element = EmptyElemTag | |
+ (STag * V "content" * ETag) | |
-- content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* | |
content = CharData ^ -1 | |
* ( ( element | |
+ Reference | |
+ CDSect | |
+ PI | |
+ Comment | |
) | |
* CharData ^ -1 | |
) ^ 0 | |
end -- _ENV = grammar | |
-- Compile the grammar. The grammar defines no captures, so on success the
-- function capture hands the whole matched text to io.write, which echoes
-- it to standard output.
local document = P (grammar) / io.write

-- match() returns nil when the input does not start with a well-formed
-- document; report that instead of exiting silently with success.
if not document:match (io.read "*a") then
  io.stderr:write ("lpegxml: input is not a well-formed XML document\n")
  os.exit (1)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
-- XML parsing and DOM tree representation. | |
-- | |
-- @copyright 2012 Adrian Perez <[email protected]> | |
-- @license Distributed under terms of the MIT license. | |
-- | |
local setmetatable, getmetatable = setmetatable, getmetatable | |
local tpack, tremove, tconcat = table.pack, table.remove, table.concat | |
local print, ipairs, pairs, assert, type = print, ipairs, pairs, assert, type | |
local openfile, tostring = io.open, tostring | |
local sprintf = string.format | |
local lpeg = require "lpeg" | |
local V, R, S, P, C = lpeg.V, lpeg.R, lpeg.S, lpeg.P, lpeg.C | |
local Cg, Ct = lpeg.Cg, lpeg.Ct | |
local _M = { | |
xml_grammar = { "document" } | |
} | |
local Object = {}

--- Creates a new object whose prototype is the receiver.
--
-- Fields that are missing in the new object are looked up in the object
-- on which `clone()` was invoked. When a table is given, its key/value
-- pairs are copied (shallowly) into the new object.
--
-- @param t Table with additional attributes (optional).
-- @return New cloned object.
--
function Object:clone (t)
  local copy = setmetatable ({}, { __index = self })
  if type (t) ~= "table" then
    return copy
  end
  for key, value in pairs (t) do
    copy[key] = value
  end
  return copy
end

--- Alias for @{Object:clone}
--
Object.extend = Object.clone

--- Gets the prototype of an object.
--
-- The prototype is the object from which the receiver was cloned, that
-- is, the `__index` entry of its metatable.
-- @return A table (the prototype) or `nil` (for the base object).
--
function Object:prototype ()
  local mt = getmetatable (self)
  if not mt then
    return mt
  end
  return mt.__index
end

--- Checks whether an object is derived from some other object.
--
-- **Note** that the whole prototype chain may have to be walked, so this
-- can be slow for deep hierarchies.
--
-- @param obj Reference object.
-- @return Whether the object derives from the reference object.
--
function Object:derives (obj)
  local mt = getmetatable (self)
  -- A missing metatable or a missing __index marks the end of the chain.
  while mt and mt.__index do
    local proto = mt.__index
    if proto == obj then
      return true
    end
    mt = getmetatable (proto)
  end
  return false
end
_M.Object = Object | |
local flatten_attributes | |
local outline | |
--- Prints an indented outline of a node and, recursively, its children.
--
-- Each line shows the node type, the flattened attributes and either the
-- tag name or the node value.
--
-- @param e Node to print.
-- @param d Current nesting depth (optional, defaults to 0).
--
function outline (e, d)
  local depth = d or 0
  local prefix = (" "):rep (depth)
  local attr = flatten_attributes (e.attributes)
  local label = e.tagName or e.nodeValue
  print (("%s%s [%s] <%s>"):format (prefix, e.nodeType, attr, label))
  if not e:hasChildNodes () then
    return
  end
  for _, child in ipairs (e.childNodes) do
    outline (child, depth + 1)
  end
end
--- | |
-- XML DOM node class. | |
-- | |
-- This loosely follows the [HTML DOM element API] | |
-- (https://developer.mozilla.org/en/DOM/element), but deviates from | |
-- it on purpose, to make more convenient to use and more Lua-like. | |
-- | |
-- @type Node | |
-- | |
local Node = Object:clone
{
  -- Values used for the "nodeType" field.
  ELEMENT_NODE                = "element";
  TEXT_NODE                   = "text";
  CDATA_SECTION_NODE          = "cdata";
  COMMENT_NODE                = "comment";
  PROCESSING_INSTRUCTION_NODE = "processinginstruction";
  DOCTYPE_NODE                = "doctype";
  XML_DECLARATION_NODE        = "xmldeclaration";
  DOCUMENT_NODE               = "document";

  -- Fields common to every node. They are listed for documentation only:
  -- assigning nil in a table constructor stores nothing.
  nodeType   = nil;
  nodeValue  = nil;
  tagName    = nil;
  parentNode = nil;
  attributes = nil;
  childNodes = nil;

  -- nodeType == DOCTYPE_NODE
  doctypePublicIdentifier = nil;
  doctypeSystemIdentifier = nil;

  -- nodeType == XML_DECLARATION_NODE
  xmlVersion    = nil;
  xmlEncoding   = nil;
  xmlStandalone = nil;
}

---
-- Returns the number of children of a node.
--
-- NOTE(review): despite the name, every entry of `childNodes` is counted,
-- not only element nodes -- confirm this is intended.
--
function Node:childElementCount ()
  local kids = self.childNodes
  if kids then
    return #kids
  end
  return 0
end

---
-- Appends a child to a node and makes the node the child's parent.
--
-- @param child Node to be appended.
--
function Node:appendChild (child)
  local kids = self.childNodes
  if kids == nil then
    -- The list of children is created lazily on first use.
    kids = {}
    self.childNodes = kids
  end
  kids[#kids + 1] = child
  child.parentNode = self
end

---
-- Removes a child from a node.
--
-- @param child Node to be removed. If it is not a child, nothing is done.
--
function Node:removeChild (child)
  local kids = self.childNodes
  if kids == nil or child.parentNode ~= self then
    return
  end
  for index, candidate in ipairs (kids) do
    if candidate == child then
      tremove (kids, index)
      child.parentNode = nil
      return
    end
  end
end

---
-- Checks whether a node has children.
--
function Node:hasChildNodes ()
  local kids = self.childNodes
  return kids ~= nil and #kids > 0
end
-- Characters (other than "&") that must not appear verbatim in the output.
local escape_replacements = {
  ["<"] = "&lt;";
  [">"] = "&gt;";
  ['"'] = "&quot;";
}

-- Receives an ampersand together with the name-like characters following
-- it. Well-formed entity and character references are kept untouched so
-- already-escaped text is not escaped twice; anything else is a bare
-- ampersand and gets replaced by "&amp;".
local function escape_match (capture)
  if capture:find ("^&[%a_:][%w_:%.%-]*;$")
     or capture:find ("^&#%d+;$")
     or capture:find ("^&#x%x+;$")
  then
    return capture
  end
  return "&amp;" .. capture:sub (2)
end

-- Escapes "&", "<", ">" and '"' so that the text can be emitted as XML
-- character data or inside a double-quoted attribute value.
--
-- @param text Text to be escaped.
-- @return The escaped text (a single value; the substitution counts
--   returned by gsub are not leaked to the caller).
local function escape (text)
  local escaped = text:gsub ("&[#%w_:%.%-]*;?", escape_match)
  -- The parentheses truncate gsub's results to the string alone.
  return (escaped:gsub ('[<>"]', escape_replacements))
end
local flatten_node | |
-- Serializes every child of a document node, in order.
--
-- @param elt    Document node.
-- @param output Array of string fragments to append to.
local function flatten_document (elt, output)
  -- A document built by hand may not have any children yet, in which case
  -- "childNodes" is nil and there is nothing to emit.
  for _, child in ipairs (elt.childNodes or {}) do
    flatten_node (child, output)
  end
end
-- Serializes a CDATA section node; the content is emitted verbatim.
--
-- @param elt    CDATA node (its text is read from "nodeValue").
-- @param output Array of string fragments to append to.
local function flatten_cdata (elt, output)
  local function emit (fragment)
    output[#output + 1] = fragment
  end
  emit "<![CDATA["
  emit (elt.nodeValue)
  emit "]]>\n"
end
-- Serializes a comment node. The text is padded with one space on each
-- side of the comment delimiters.
--
-- @param elt    Comment node (its text is read from "nodeValue").
-- @param output Array of string fragments to append to.
local function flatten_comment (elt, output)
  local markup = "<!-- " .. elt.nodeValue .. " -->"
  output[#output + 1] = markup
end
-- Serializes a processing instruction: target, a space, the content (if
-- any) and the closing delimiter.
--
-- @param elt    Processing instruction node ("tagName" holds the target,
--               "nodeValue" the content).
-- @param output Array of string fragments to append to.
local function flatten_proc_ins (elt, output)
  local function emit (fragment)
    output[#output + 1] = fragment
  end
  emit ("<?" .. elt.tagName .. " ")
  emit (elt.nodeValue)
  emit "?>\n"
end
-- Serializes a text node, passing its value through escape().
--
-- @param elt    Text node (its text is read from "nodeValue").
-- @param output Array of string fragments to append to.
local function flatten_text (elt, output)
  local slot = #output + 1
  output[slot] = escape (elt.nodeValue)
end
-- Serializes an XML declaration node.
--
-- The "standalone" pseudo-attribute is written only when the node's
-- "xmlStandalone" field is not nil; a true value yields "yes" and a false
-- value yields "no".
--
-- @param elt    XML declaration node.
-- @param output Array of string fragments to append to.
local function flatten_xml_decl (elt, output)
  local function emit (fragment)
    output[#output + 1] = fragment
  end
  emit ("<?xml version=\"" .. elt.xmlVersion)
  emit ("\" encoding=\"" .. elt.xmlEncoding)
  local standalone = elt.xmlStandalone
  if standalone ~= nil then
    emit "\" standalone=\""
    if standalone then
      emit "yes"
    else
      emit "no"
    end
  end
  emit "\"?>\n"
end
-- Serializes a DOCTYPE node.
--
-- Only the name and the external identifier (SYSTEM or PUBLIC) are
-- written. Each part is separated by a single space, and every literal is
-- wrapped in double quotes unless it contains one, in which case single
-- quotes are used so that the output stays well-formed.
--
-- @param elt    DOCTYPE node.
-- @param output Array of string fragments to append to.
local function flatten_doctype (elt, output)
  local function quoted (literal)
    local quote = literal:find ('"', 1, true) and "'" or '"'
    return quote .. literal .. quote
  end

  output[#output+1] = "<!DOCTYPE " .. elt.tagName
  local kind = elt.doctypeKind
  if kind ~= nil then
    -- The separating space was missing, gluing the name to the keyword.
    output[#output+1] = " " .. kind
    if kind ~= "SYSTEM" then
      output[#output+1] = " " .. quoted (elt.doctypePublicIdentifier)
    end
    output[#output+1] = " " .. quoted (elt.doctypeSystemIdentifier)
  end
  output[#output+1] = ">\n"
end
-- Serializes a table of attributes as `name="value"` pairs separated by
-- single spaces.
--
-- Attributes are written in sorted name order: pairs() gives no ordering
-- guarantee, so sorting makes the output reproducible between runs.
--
-- @param attributes Table mapping attribute names to values, or nil.
-- @return The serialized attributes; an empty string when there are none.
function flatten_attributes (attributes)
  if attributes == nil then
    return ""
  end

  local names = {}
  for name in pairs (attributes) do
    names[#names+1] = name
  end
  table.sort (names)

  -- Values are emitted inside double quotes, so besides what escape()
  -- handles, '"' and "<" must never appear verbatim in them.
  local in_value = { ["<"] = "&lt;", ['"'] = "&quot;" }
  local attr = {}
  for i, name in ipairs (names) do
    local value = escape (attributes[name]):gsub ('[<"]', in_value)
    attr[i] = name .. "=\"" .. value .. "\""
  end
  return tconcat (attr, " ")
end
-- Serializes an element node, its attributes and (recursively) its
-- children. Elements without children use the empty-element form.
--
-- @param elt    Element node.
-- @param output Array of string fragments to append to.
local function flatten_element (elt, output)
  local function emit (fragment)
    output[#output + 1] = fragment
  end

  emit ("<" .. elt.tagName)
  local attr = flatten_attributes (elt.attributes)
  if #attr > 0 then
    emit (" " .. attr)
  end

  if not elt:hasChildNodes () then
    emit "/>"
    return
  end

  emit ">"
  for _, child in ipairs (elt.childNodes) do
    flatten_node (child, output)
  end
  emit ("</" .. elt.tagName .. ">")
end
local flatteners = | |
{ | |
[Node.DOCUMENT_NODE] = flatten_document; | |
[Node.CDATA_SECTION_NODE] = flatten_cdata; | |
[Node.COMMENT_NODE] = flatten_comment; | |
[Node.PROCESSING_INSTRUCTION_NODE] = flatten_proc_ins; | |
[Node.TEXT_NODE] = flatten_text; | |
[Node.XML_DECLARATION_NODE] = flatten_xml_decl; | |
[Node.DOCTYPE_NODE] = flatten_doctype; | |
[Node.ELEMENT_NODE] = flatten_element; | |
} | |
-- Serializes any node by dispatching on its "nodeType" through the
-- "flatteners" table.
--
-- @param treeish Node to serialize.
-- @param output  Array of string fragments to append to.
-- Raises an error when no serializer is registered for the node type.
function flatten_node (treeish, output)
  local kind = treeish.nodeType
  local message = "Unknown node type: " .. tostring (kind)
  local handler = flatteners[kind]
  if not handler then
    -- Level 0 keeps the message free of position information, exactly as
    -- assert() reports it.
    error (message, 0)
  end
  handler (treeish, output)
end
--- Serializes a DOM tree (or subtree) as XML text.
--
-- @param treeish DOM tree or subtree to serialize.
-- @param output  Where to write the result: `nil` returns the text as a
--   string; a string is taken as the path of a file that is created (or
--   truncated) and written in binary mode; any other value is used as an
--   already opened file-like object whose `write` method is called.
-- @return The serialized text when `output` is `nil`, nothing otherwise.
-- Raises an error when the file cannot be opened or written.
function _M.dump (treeish, output)
  local result = {}
  flatten_node (treeish, result)
  result[#result+1] = "\n"
  local text = tconcat (result)

  if output == nil then
    return text
  end

  if type (output) == "string" then
    local fd = assert (openfile (output, "wb"))
    -- The serialized fragments are written, not the "output" argument
    -- (which was iterated by mistake, so nothing reached the file).
    local ok, err = fd:write (text)
    -- Close the file before reporting a failed write so the handle is
    -- never leaked.
    fd:close ()
    assert (ok, err)
  else
    -- The caller owns the handle: write to it but leave it open.
    assert (output:write (text))
  end
end
-- | |
-- The "action" table contains parsing actions. Those functions pick the | |
-- relevant captures defined in the XML grammar and build up a DOM tree | |
-- composed out of "Node" instances. | |
-- | |
local action = {} | |
-- Builds a comment node holding the comment text.
function action.comment (text)
  local fields = { nodeType = Node.COMMENT_NODE, nodeValue = text }
  return Node:clone (fields)
end

-- Builds a processing instruction node: "tagName" is the target and
-- "nodeValue" the content.
function action.processing_instruction (name, content)
  local fields = {
    nodeType  = Node.PROCESSING_INSTRUCTION_NODE,
    tagName   = name,
    nodeValue = content,
  }
  return Node:clone (fields)
end

-- Builds a CDATA section node holding the section content.
function action.cdata (content)
  local fields = { nodeType = Node.CDATA_SECTION_NODE, nodeValue = content }
  return Node:clone (fields)
end

-- Builds a text node holding the given content.
function action.text (content)
  local fields = { nodeType = Node.TEXT_NODE, nodeValue = content }
  return Node:clone (fields)
end
-- The "data" table is optional: | |
-- | |
-- data[1] contains the type of reference: SYSTEM / PUBLIC | |
-- data[2] and data[3] contain the rest of information | |
-- | |
-- The rest of contents of DOCTYPE declarations are completely | |
-- ignored -- yet they are parsed and validated by LPeg. | |
-- | |
-- Builds a DOCTYPE node.
--
-- @param tag  Name given in the DOCTYPE declaration.
-- @param data Optional table describing the external identifier:
--   data[1] is its kind ("SYSTEM" or "PUBLIC"); for SYSTEM data[2] is the
--   system identifier, for PUBLIC data[2] is the public identifier and
--   data[3] the system identifier.
-- @return The new node.
function action.doctype (tag, data)
  local node = Node:clone { nodeType = Node.DOCTYPE_NODE, tagName = tag }
  if not data then
    return node
  end

  assert (type (data) == "table",
          "argument #2 to action.doctype is not a table")

  local kind = data[1]
  node.doctypeKind = kind
  if kind == "SYSTEM" then
    node.doctypeSystemIdentifier = data[2]
    return node
  end

  assert (kind == "PUBLIC",
          "DOCTYPE kind has to be either PUBLIC or SYSTEM")
  node.doctypePublicIdentifier = data[2]
  node.doctypeSystemIdentifier = data[3]
  return node
end
-- Builds an XML declaration node.
--
-- The grammar hands over an empty string for each optional part that is
-- absent from the declaration.
--
-- @param version    Version number.
-- @param encoding   Encoding name, or "" when not declared (in which case
--                   "utf-8" is stored).
-- @param standalone Boolean value of the standalone declaration, or ""
--                   when not declared (in which case nothing is stored).
-- @return The new node.
function action.xml_declaration (version, encoding, standalone)
  -- Explicit tests instead of the "a and b or c" idiom: with that idiom a
  -- false value (standalone="no") was indistinguishable from a missing
  -- one and got dropped.
  if standalone == "" then
    standalone = nil
  end
  if encoding == nil or encoding == "" then
    encoding = "utf-8"
  end
  return Node:clone {
    nodeType      = Node.XML_DECLARATION_NODE;
    xmlStandalone = standalone;
    xmlEncoding   = encoding;
    xmlVersion    = version;
  }
end
-- Builds the document node and attaches, in order, the prolog nodes, the
-- root element and the nodes found after the root element.
--
-- @param prolog   Array of nodes preceding the root element.
-- @param rootnode Root element node.
-- @param epilog   Array of nodes following the root element.
-- @return The document node.
function action.document (prolog, rootnode, epilog)
  local doc = Node:clone { nodeType = Node.DOCUMENT_NODE }

  local function adopt (nodes)
    for _, node in ipairs (nodes) do
      doc:appendChild (node)
    end
  end

  adopt (prolog)
  doc:appendChild (rootnode)
  adopt (epilog)
  return doc
end
-- Builds an element node for an empty-element tag.
--
-- @param tag Tag name.
-- @param ... Flat list of attribute names and values (name1, value1,
--            name2, value2, ...).
-- @return The new element node.
function action.empty_element_tag (tag, ...)
  local flat = tpack (...)
  local attributes = {}
  local index = 1
  while index <= flat.n do
    attributes[flat[index]] = flat[index + 1]
    index = index + 2
  end
  local fields = {
    nodeType   = Node.ELEMENT_NODE,
    tagName    = tag,
    attributes = attributes,
  }
  return Node:clone (fields)
end
-- Builds an element node from a start tag, its content and its end tag.
--
-- @param stag Name found in the start tag.
-- @param arg  Flat array of attribute names and values (name1, value1,
--             name2, value2, ...).
-- @param ...  Captures produced by the element content, followed by the
--             name found in the end tag.
-- @return The new element node, with its child nodes appended.
-- Raises an error when the start and end tag names differ.
function action.element_tag (stag, arg, ...)
  local child = tpack (...)
  local etag = child[child.n]
  if stag ~= etag then
    -- The message is built only on failure (not once per element), and
    -- level 0 keeps it free of position information, as with assert().
    error ("Start tag does not match end tag ("
           .. tostring (stag) .. ", " .. tostring (etag) .. ")", 0)
  end

  local att = {}
  for i = 1, arg.n or #arg, 2 do
    att[arg[i]] = arg[i+1]
  end
  local elt = Node:clone {
    nodeType   = Node.ELEMENT_NODE;
    tagName    = stag;
    attributes = att;
  }

  -- The last capture is the end tag name, not a child. When the content
  -- produces no capture of its own (an element such as <a></a>), LPeg's
  -- group capture yields the matched text -- an empty string -- instead
  -- of a node, so anything that is not a node table is skipped.
  for i = 1, child.n - 1 do
    local node = child[i]
    if type (node) == "table" then
      elt:appendChild (node)
    end
  end
  return elt
end
-- Replacement text of the five entities predefined by XML, indexed by
-- entity name (without the "&" and ";" delimiters).
local entity_refs = {
  lt   = "<";
  gt   = ">";
  amp  = "&";
  quot = '"';
  apos = "'";
}

-- Resolves an entity reference.
--
-- @param name Either the bare entity name ("lt") or the complete
--   reference as matched by the grammar ("&lt;").
-- @return The replacement text for the predefined entities; references to
--   any other entity are returned unchanged.
function action.entity_reference (name)
  -- The grammar passes the whole match, delimiters included, so they are
  -- stripped before the lookup.
  local bare = name:match ("^&?(.-);?$")
  return entity_refs[bare] or name
end
-- Debugging aid: prints every capture it receives, one per line, and
-- returns all of them packed in a table so it can be attached to any
-- grammar rule.
--
-- @param ... Captures to print.
-- @return An array holding the captures.
function action.captures (...)
  local args = tpack (...)
  -- ANSI "bold" / "reset" sequences around the value. The escape byte is
  -- spelled \27 so that it cannot get lost when the file is copied.
  local FMT = "[%i] \27[1;1m>\27[0;0m%s\27[1;1m<\27[0;0m"
  local FN  = "<%s[%s]>"

  -- "lib.ml.tstring" is an optional table pretty-printer that this file
  -- does not define or load; fall back to tostring() when it is missing.
  local tstring = tostring
  if type (lib) == "table" and type (lib.ml) == "table" and lib.ml.tstring then
    tstring = lib.ml.tstring
  end

  -- Iterate by count so that a nil capture does not end the listing.
  for i = 1, args.n do
    local v = args[i]
    if type (v) == "table" then
      if v.nodeType then
        v = FN:format (v.nodeType, v.tagName)
      else
        v = tstring (v)
      end
    end
    print (FMT:format (i, v))
  end
  return { ... }
end
-- | |
-- XML grammar using LPeg. It uses the functions in the "action" table to | |
-- build a DOM tree. | |
-- | |
do local _ENV = _M.xml_grammar | |
-- S ::= (#x20 | #x9 | #xD | #xA)+ | |
SS = (S " \t\r\n")^1 | |
SSopt = SS ^ -1 | |
-- NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | |
NameStartChar = S ":_" + R ("az", "AZ") | |
-- NameChar ::= NameStartChar | "-" | "." | [0-9] | |
NameChar = NameStartChar + S "-." + R "09" | |
-- Name ::= NameStartChar (NameChar)* | |
Name = NameStartChar * NameChar^0 | |
-- Names ::= Name (#x20 Name)* | |
Names = Name * (" " * Name)^0 | |
-- Nmtoken ::= (NameChar)+ | |
Nmtoken = NameChar^1 | |
-- Nmtokens ::= Nmtoken (#x20 Nmtoken)* | |
Nmtokens = Nmtoken * (" " * Nmtoken)^0 | |
-- CharRef ::= '&#' [0-9]+ ';' | |
-- | '&#x' [0-9a-fA-F]+ ';' | |
CharRef = ("&#" * (R "09")^1 * ";") | |
+ ("&#x" * (R ("09", "af", "AF"))^1 * ";") | |
-- EntityRef ::= '&' Name ';' | |
--EntityRef = "&" * C( Name ) * ";" | |
-- / action.entity_reference | |
EntityRef = "&" * Name * ";" | |
/ action.entity_reference | |
-- PEReference ::= '%' Name ';' | |
PEReference = "%" * Name * ";" | |
-- Reference ::= EntityRef | CharRef | |
Reference = EntityRef + CharRef | |
-- EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | |
-- | "'" ([^%&'] | PEReference | Reference)* "'" | |
EntityValue = ('"' * ((1 - S '%&"') + PEReference + Reference)^0 * '"') | |
+ ("'" * ((1 - S "%&'") + PEReference + Reference)^0 * "'") | |
-- AttValue ::= '"' ([^<&"] | Reference)* '"'
--            | "'" ([^<&'] | Reference)* "'"
-- The value is the concatenation of all the pieces. CharRef has no
-- capture of its own, so it is captured here (as written in the source);
-- otherwise character references would silently vanish from the value.
-- Runs of plain characters are captured as a whole rather than one
-- character at a time.
AttValue = ( ('"' * Ct( (C( (1 - S '<&"')^1 ) + EntityRef + C( CharRef ))^0 ) * '"')
           + ("'" * Ct( (C( (1 - S "<&'")^1 ) + EntityRef + C( CharRef ))^0 ) * "'")
           ) / tconcat
-- SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") | |
SystemLiteral = ('"' * C( (1 - P '"')^0 ) * '"') | |
+ ("'" * C( (1 - P "'")^0 ) * "'") | |
-- PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] | |
PubidChar = S " \r\n-'()+,./:=?;!*#@$_%" + R ("az", "AZ", "09") | |
-- PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" | |
PubidLiteral = ('"' * C( PubidChar^0 ) * '"') | |
+ ("'" * C( (PubidChar - "'")^0 ) * "'") | |
-- CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) | |
CharData = C( (1 - (S "<&" + "]]>"))^1 ) | |
-- Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' | |
Comment = "<!--" | |
* C( ((1 - S "-") + ("-" * (1 - S "-")))^0 ) | |
* "-->" | |
/ action.comment | |
-- PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
-- Only the exact (case-insensitive) name "xml" is reserved. In LPeg
-- `a - b` fails as soon as `b` matches a *prefix*, so the exclusion is
-- terminated with `-NameChar`; otherwise valid targets that merely begin
-- with those letters (e.g. "xml-stylesheet") would be rejected too.
PITarget = Name - (S "xX" * S "mM" * S "lL" * -NameChar)
-- PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' | |
PI = "<?" | |
* C( PITarget ) | |
* (SS * C( (1 - P "?>")^1 )^0 ) | |
* "?>" | |
/ action.processing_instruction | |
-- CDSect ::= CDStart CData CDEnd | |
-- CDStart ::= '<![CDATA[' | |
-- CData ::= (Char* - (Char* ']]>' Char*)) | |
-- CDEnd ::= ']]>' | |
CData = (1 - P "]]>")^0 | |
CDSect = "<![CDATA[" | |
* C( CData ) | |
* "]]>" | |
/ action.cdata | |
-- prolog ::= XMLDecl Misc* (doctypedecl Misc*)? | |
prolog = (V "XMLDecl") ^ -1 | |
* (V "Misc")^0 | |
* (V "doctypedecl" * (V "Misc")^0) ^ -1 | |
-- Eq ::= S? '=' S? | |
Eq = SSopt * "=" * SSopt | |
-- SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) | |
SDDecl = SS | |
* "standalone" | |
* Eq | |
* ( ("'" * C(P "yes" + "no") * "'") | |
+ ('"' * C(P "yes" + "no") * '"') | |
) | |
/ function (v) return v == "yes" end | |
-- VersionNum ::= '1.0' | '1.1' | |
VersionNum = P "1.0" | |
+ P "1.1" | |
-- VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') | |
VersionInfo = SS | |
* "version" | |
* Eq | |
* ( ("'" * C( VersionNum ) * "'") | |
+ ('"' * C( VersionNum ) * '"') | |
) | |
-- XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' | |
XMLDecl = "<?xml" | |
* Cg( VersionInfo ) | |
* Cg( (V "EncodingDecl") ^ -1 ) | |
* Cg( SDDecl ^ -1 ) | |
* SSopt | |
* "?>" | |
/ action.xml_declaration | |
-- Misc ::= Comment | PI | S | |
Misc = Comment + PI + SS | |
-- doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
-- The action receives the name and, optionally, the external identifier.
-- Declarations inside the internal subset are validated but produce
-- nothing: `/ 0` discards every capture made while matching them, so
-- they cannot end up as extra arguments of action.doctype.
doctypedecl = "<!DOCTYPE"
            * SS
            * C( Name )
            * (SS * V "ExternalID") ^ -1
            * SSopt
            * ( "["
              * (V "intSubset" / 0)
              * "]"
              * SSopt
              ) ^ -1
            * ">"
            / action.doctype
-- document ::= ( prolog element Misc* ) | |
document = Ct( prolog ) | |
* Cg( V "element" ) | |
* Ct( Misc^0 ) | |
/ action.document | |
-- DeclSep ::= PEReference | S | |
DeclSep = PEReference + SS | |
-- choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' | |
-- seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' | |
choice = "(" * SSopt * V "cp" * (SSopt * "|" * SSopt * V "cp")^1 * SSopt * ")" | |
seq = "(" * SSopt * V "cp" * (SSopt * "," * SSopt * V "cp")^0 * SSopt * ")" | |
-- cp ::= (Name | choice | seq) ('?' | '*' | '+')? | |
cp = (Name + choice + seq) * (S "?*+") ^ -1 | |
-- children ::= (choice | seq) ('?' | '*' | '+')? | |
children = (choice + seq) * (S "?*+") ^ -1 | |
-- Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | '(' S? '#PCDATA' S? ')' | |
Mixed = "(" * SSopt * "#PCDATA" * (SSopt * "|" * SSopt * Name)^0 * SSopt * ")*" | |
+ "(" * SSopt * "#PCDATA" * SSopt * ")" | |
-- contentspec ::= 'EMPTY' | 'ANY' | Mixed | children | |
contentspec = P "EMPTY" + P "ANY" + Mixed + children | |
-- elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' | |
elementdecl = "<!ELEMENT" * SS * Name * SS * contentspec * SSopt * ">" | |
-- EnumeratedType ::= NotationType | Enumeration | |
-- NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' | |
-- Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' | |
-- AttType ::= StringType | TokenizedType | EnumeratedType | |
-- StringType ::= 'CDATA' | |
-- TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES'
-- | 'NMTOKEN' | 'NMTOKENS' | |
-- | |
NotationType = "NOTATION" * SS * "(" * SSopt * Name * (SSopt * "|" * SSopt * Name)^0 * SSopt * ")" | |
Enumeration = "(" * SSopt * Nmtoken * (SSopt * "|" * SSopt * Nmtoken)^0 * SSopt * ")" | |
-- LPeg's choice operator is ordered and does not backtrack into an
-- alternative that already succeeded, so a keyword that is a prefix of
-- another one ("ID" / "IDREF" / "IDREFS", "NMTOKEN" / "NMTOKENS") has to
-- be listed after the longer keyword, otherwise the longer one can never
-- be recognised.
AttType = P "CDATA"
        + P "IDREFS"
        + P "IDREF"
        + P "ID"
        + P "ENTITY"
        + P "ENTITIES"
        + P "NMTOKENS"
        + P "NMTOKEN"
        + NotationType
        + Enumeration
-- DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) | |
DefaultDecl = P "#REQUIRED" | |
+ P "#IMPLIED" | |
+ (((P "#FIXED" * SS) ^ -1) * AttValue) | |
-- AttDef ::= S Name S AttType S DefaultDecl | |
AttDef = SS * Name * SS * AttType * SS * DefaultDecl | |
-- AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' | |
AttlistDecl = "<!ATTLIST" * SS * Name * AttDef^0 * SSopt * ">" | |
-- ExternalID ::= 'SYSTEM' S SystemLiteral
--              | 'PUBLIC' S PubidLiteral S SystemLiteral
--
-- Both alternatives are packed into one table: { kind, literal... }.
-- The parentheses are required because "/" binds tighter than "+";
-- without them only the PUBLIC alternative was packed and a SYSTEM
-- identifier reached action.doctype as loose strings.
ExternalID = ( C( P "SYSTEM" ) * SS * SystemLiteral
             + C( P "PUBLIC" ) * SS * PubidLiteral * SS * SystemLiteral
             ) / tpack
-- NDataDecl ::= S 'NDATA' S Name | |
NDataDecl = SS * "NDATA" * SS * Name | |
-- EntityDecl ::= GEDecl | PEDecl | |
-- GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' | |
-- PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' | |
-- EntityDef ::= EntityValue | (ExternalID NDataDecl?) | |
-- PEDef ::= EntityValue | ExternalID | |
-- | |
PEDef = EntityValue + ExternalID | |
EntityDef = EntityValue + (ExternalID * NDataDecl ^ -1) | |
GEDecl = "<!ENTITY" * SS * Name * SS * EntityDef * SSopt * ">" | |
PEDecl = "<!ENTITY" * SS * "%" * SS * Name * SS * PEDef * SSopt * ">" | |
EntityDecl = GEDecl + PEDecl | |
-- PublicID ::= 'PUBLIC' S PubidLiteral | |
PublicID = "PUBLIC" * SS * PubidLiteral | |
-- NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' | |
NotationDecl = "<!NOTATION" * SS * Name * SS * (ExternalID + PublicID) * SSopt * ">" | |
-- markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment | |
markupdecl = elementdecl | |
+ AttlistDecl | |
+ EntityDecl | |
+ NotationDecl | |
+ PI | |
+ Comment | |
-- conditionalSect ::= includeSect | ignoreSect | |
-- includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>' | |
-- ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>' | |
-- ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)* | |
-- Ignore ::= Char* - (Char* ('<![' | ']]>') Char*) | |
-- | |
-- Ignore is a (possibly empty) run of characters that contains neither
-- "<![" nor "]]>", as the production above requires (Char*).
Ignore = (1 - (P "<![" + P "]]>"))^0
-- Nested sections are closed by the full "]]>" delimiter.
ignoreSectContents = Ignore * ("<![" * V "ignoreSectContents" * "]]>" * Ignore)^0
-- ignoreSectContents may match the empty string and already consumes
-- everything up to the first unbalanced "]]>", so it is applied once:
-- repeating a nullable pattern with ^0 is rejected by LPeg and would not
-- match anything more.
conditionalSect = ("<![" * SSopt * "INCLUDE" * SSopt * "[" * V "extSubsetDecl" * "]]>")
                + ("<![" * SSopt * "IGNORE" * SSopt * "[" * ignoreSectContents * "]]>")
-- intSubset ::= (markupdecl | DeclSep)* | |
intSubset = (markupdecl + DeclSep)^0 | |
-- extSubsetDecl ::= (markupdecl | conditionalSect | DeclSep)* | |
extSubsetDecl = (markupdecl + conditionalSect + DeclSep)^0 | |
-- EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* | |
EncName = R ("AZ", "az") * ((R ("AZ", "az", "09") + S "._") + "-")^0 | |
-- EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) | |
EncodingDecl = SS | |
* "encoding" | |
* Eq | |
* ( '"' * C( EncName ) * '"' | |
+ "'" * C( EncName ) * "'" | |
) | |
-- TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' | |
TextDecl = "<?xml" | |
* VersionInfo ^ -1 | |
* EncodingDecl | |
* SSopt | |
* "?>" | |
-- extSubset ::= TextDecl? extSubsetDecl | |
extSubset = TextDecl ^ -1 | |
* extSubsetDecl | |
-- Attribute ::= Name Eq AttValue | |
Attribute = C( Name ) | |
* Eq | |
* Cg( AttValue ) | |
-- STag ::= '<' Name (S Attribute)* S? '>' | |
STag = "<" | |
* C( Name ) | |
* Ct( (SS * Attribute)^0 ) | |
* SSopt | |
* ">" | |
-- ETag ::= '</' Name S? '>' | |
ETag = "</" | |
* C( Name ) | |
* SSopt | |
* ">" | |
-- EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' | |
EmptyElemTag = "<" | |
* C( Name ) | |
* Cg( (SS * Attribute)^0 ) | |
* SSopt | |
* "/>" | |
/ action.empty_element_tag | |
ElemTag = STag | |
* Cg( V "content" ) | |
* ETag | |
/ action.element_tag | |
-- element ::= EmptyElemTag | STag content ETag
element = ElemTag + EmptyElemTag | |
-- A run of character data and references, concatenated into one text
-- node. CharRef has no capture of its own, so it is captured here (as
-- written in the source); otherwise character references would silently
-- vanish from the text.
textContent = (Ct( (EntityRef + C( CharRef ) + CharData)^1 ) / tconcat)
            / action.text
-- content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* | |
content = textContent ^ -1 | |
* ( ( element | |
+ CDSect | |
+ PI | |
+ Comment | |
) | |
* textContent ^ -1 | |
) ^ 0 | |
end -- _ENV = _M.grammar | |
--- | |
-- Parsing and dumping. | |
-- @section | |
-- | |
--- | |
-- Generates an outline out of a DOM tree. The outline is written | |
-- to the standard output stream | |
-- | |
-- @param treeish DOM tree or subtree (a @{Node}). | |
-- @function outline | |
-- | |
_M.outline = outline | |
_M.xml_peg = P (_M.xml_grammar) | |
_M.Node = Node | |
--- | |
-- Escapes certain characters using entity references. | |
-- | |
-- @param text Text to be escaped. | |
-- @function escape | |
-- | |
_M.escape = escape | |
--- | |
-- Parses XML input into a DOM tree. | |
-- | |
-- @param input Input XML string. | |
-- @return A DOM tree @{Node}. | |
-- | |
function _M.parse (input) | |
return _M.xml_peg:match (input) | |
end | |
function _M.P (prodname) | |
local old = _M.xml_grammar[1] | |
_M.xml_grammar[1] = prodname | |
local peg = P (_M.xml_grammar) | |
_M.xml_grammar[1] = old | |
return peg | |
end | |
return _M |
Added the xmldom.lua file, which is a small DOM parser made using the grammar from lpegxml.lua.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I have run some quick (and not very formal) tests, and it looks like this LPeg-based parser is reasonably fast. I have run xmllint (uses libxml2), xmlwf (uses expat) and this parser on my GitHub Atom feed. This is running Lua 5.2 built with -O3 on a Core i5 64-bit GNU/Linux system.
For xmllint, we get:
For xmlwf, I was expecting it to be faster than xmllint, but they behave similarly:
And finally, for the LPeg-based parser:
Given that the LPeg-based parser is an almost-direct (and unoptimized) translation of the grammar from the W3C XML specification, the result is fairly good 🤘