Created
January 11, 2023 01:13
-
-
Save thatrandomperson5/482fb4562b72e46900cf736f72759643 to your computer and use it in GitHub Desktop.
A pure-Nim, hand-written XML parser that uses only the standard library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import parser | |
# Sample document: nested tags, two attributes and text interleaved with tags.
const xml = """
<MyTag>
Hello
<OtherTag attr1="hello" attr2="nono"></OtherTag>
<Container>
I'm shallow
<Deep>I'm down deep!</Deep>
I'm shallow
</Container>
</MyTag>
"""

# Parse the sample, then print it twice: once through `$` and once through
# `repr`.
let res = parse(makeParser(xml))
echo $res
echo repr(res)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import std/[sequtils, algorithm, strutils, strformat, strtabs] | |
type
  Parser* = object
    ## Parsing state. Exported so that callers of `makeParser`/`parse` can
    ## name the type in their own signatures.
    stack: seq[char] ## Input characters, stored reversed so `pop` reads forward.

  XmlTag* = ref object
    ## One node of the parsed tree: either an element or a run of text.
    ## `isText` is exported so importing modules can tell the two apart
    ## before touching branch-specific fields.
    case isText*: bool
    of false:
      name*: string            ## Tag name.
      children*: seq[XmlTag]   ## Child nodes in document order.
      attrs*: StringTableRef   ## Attribute name -> value.
    of true:
      text*: string            ## Raw character data.

  ParseResult = object
    ## Outcome of one attempt to read a start or end tag.
    case didWork: bool
    of false:
      processed: seq[char] ## Characters recorded during the failed attempt.
    of true:
      s: string            ## Text of the tag that was read.

  UnPairedKind {.pure.} = enum
    Start, End, Text

  UnPaired = object
    ## A flat token produced before start and end tags are paired into a tree.
    kind: UnPairedKind
    s: string   ## Tag text or character data, depending on `kind`.
    depth: int  ## Nesting depth recorded when the token was read.
proc makeParser*(xml: string): Parser =
  ## Builds a parser for `xml`. Surrounding whitespace is stripped and the
  ## remaining characters are stored reversed, so the first character of the
  ## document is the last element of the stack.
  let chars = toSeq(xml.strip())
  Parser(stack: reversed(chars))
proc parseHead(head: string): (string, StringTableRef) =
  ## Splits the inside of a start tag (the text between `<` and `>`) into the
  ## tag name and its attributes.
  ##
  ## Expected shape: `name key="value" key2="value2"`. The name is everything
  ## before the first space; each attribute is `key="value"` and consecutive
  ## attributes are separated by exactly one space.
  ##
  ## The scan is index based and bounds checked, so a one-character name
  ## (`p`), an empty head and a trailing space after the name are all handled
  ## instead of popping past the end of the input.
  ##
  ## Returns `(name, attributes)`. Malformed attribute syntax fails a
  ## `doAssert`.
  let attrs = newStringTable()

  # Tag name: everything up to the first space, or the whole head.
  var pos = 0
  while pos < head.len and head[pos] != ' ':
    inc pos
  let tag = head[0 ..< pos]
  inc pos # step over the separating space (harmless when already at the end)

  var key = ""
  while pos < head.len:
    let ch = head[pos]
    inc pos
    if ch != '=':
      key.add ch
      continue
    doAssert pos < head.len and head[pos] == '"',
      "attribute value must start with a double quote"
    inc pos
    let closing = head.find('"', pos)
    doAssert closing >= 0, "unterminated attribute value"
    attrs[key] = head[pos ..< closing]
    key = ""
    pos = closing + 1
    if pos < head.len:
      doAssert head[pos] == ' ', "attributes must be separated by a space"
      inc pos
  result = (tag, attrs)
proc parseStartTag(s: var seq[char]): ParseResult =
  ## Tries to read `<...>` from the top of the reversed input stack `s`.
  ##
  ## On success the result carries the text between `<` and `>`. On failure it
  ## carries the characters recorded during the attempt, in the order they
  ## were read. The attempt fails when the first character is not `<`, or when
  ## the input runs out while the tag is being read.
  var current = s.pop
  if current != '<':
    return ParseResult(didWork: false, processed: @[current])

  var consumed: seq[char]
  var body = ""
  var complete = true
  while current != '>' and s.len != 0:
    consumed.add current
    body.add current
    current = s.pop
    if s.len == 0:
      # Nothing is left after this character: treated as a failed attempt.
      complete = false
      break

  if complete:
    # `body` still starts with the opening '<'; drop it.
    result = ParseResult(didWork: true, s: body[1..^1])
  else:
    result = ParseResult(didWork: false, processed: consumed)
proc parseEndTag(s: var seq[char]): ParseResult =
  ## Tries to read `</...>` from the top of the reversed input stack `s`.
  ##
  ## On success the result carries the text between `</` and `>`. On failure
  ## it carries the characters recorded during the attempt. The attempt fails
  ## when the input does not start with `</`, or when the input ends before a
  ## closing `>` is read.
  var current = s.pop
  if s.len == 0 or current != '<' or s[^1] != '/':
    # Not an end tag: only the single character read is reported.
    return ParseResult(didWork: false, processed: @[current])

  var consumed = @[s.pop] # the '/' that follows '<'
  var body = ""
  var closed = true
  while current != '>' and s.len != 0:
    consumed.add current
    body.add current
    current = s.pop
    if s.len == 0 and current != '>':
      closed = false
      break

  if closed:
    # `body` still starts with the opening '<'; drop it.
    result = ParseResult(didWork: true, s: body[1..^1])
  else:
    result = ParseResult(didWork: false, processed: consumed)
proc parseText(s: var seq[char]): string =
  ## Consumes character data from the top of the reversed input stack `s` and
  ## returns it.
  ##
  ## Reading stops at the next `<`, which is left on the stack, or at the end
  ## of the input. An empty stack, or a stack whose top is already `<`, yields
  ## an empty string, so text that is not followed by a tag no longer pops
  ## past the end of the input.
  while s.len > 0 and s[^1] != '<':
    result.add s.pop
proc parseInternal(stack: var seq[char], depth: var int): seq[UnPaired] =
  ## Tokenises the whole reversed input `stack` into a flat list of start
  ## tags, end tags and text runs, each tagged with its nesting depth.
  ##
  ## At every position an end tag is tried first, then a start tag, then
  ## text. A start tag raises `depth` before its token is recorded; an end tag
  ## lowers `depth` after its token is recorded, so both tags of one element
  ## carry the same depth.
  ##
  ## This is a loop rather than one recursive call per token, so large
  ## documents neither grow the call stack nor re-copy the tail of the token
  ## list at every step. At least one token is always attempted, as before.
  while true:
    let et = stack.parseEndTag()
    if et.didWork:
      result.add UnPaired(s: et.s, kind: End, depth: depth)
      depth -= 1
    else:
      # Put back what the failed end-tag attempt reported, then retry.
      stack.add et.processed
      let st = stack.parseStartTag()
      if st.didWork:
        depth += 1
        result.add UnPaired(s: st.s, kind: Start, depth: depth)
      else:
        stack.add st.processed
        result.add UnPaired(s: stack.parseText(), kind: Text, depth: depth)
    if stack.len == 0:
      break
proc resolve(pup: seq[UnPaired]): XmlTag =
  ## Builds the element described by `pup`, a token run whose first entry is
  ## the element's start tag and whose last entry is dropped as its end.
  ##
  ## Text tokens at the element's own depth become text children. A start/end
  ## pair one level deeper delimits a child element, which is resolved
  ## recursively from the tokens between them (inclusive).
  let opener = pup[0]
  let inner = pup[1..^2]
  let (tagName, tagAttrs) = parseHead(opener.s)
  result = XmlTag(isText: false, name: tagName, attrs: tagAttrs)

  let childDepth = opener.depth + 1
  var childStart = 0 # index in `inner` of the latest direct-child start tag
  for idx, tok in inner:
    if tok.kind == UnPairedKind.Text:
      if tok.depth == opener.depth:
        result.children.add XmlTag(isText: true, text: tok.s)
    elif tok.depth == childDepth:
      if tok.kind == UnPairedKind.Start:
        childStart = idx
      else:
        result.children.add resolve(inner[childStart..idx])
proc parse*(p: Parser): XmlTag =
  ## Parses the document held by `p` and returns its root element.
  ##
  ## Works on a copy of the parser's stack. Asserts that the whole input was
  ## consumed and that the nesting depth is back to zero before the tree is
  ## built.
  var
    s = p.stack
    depth = 0
  let tokens = parseInternal(s, depth)
  doAssert s.len == 0
  doAssert depth == 0
  result = resolve(tokens)
proc `$`*(xml: XmlTag): string =
  ## Renders `xml` back to markup: text nodes as their raw text, elements as
  ## `<name attrs>children</name>` with every attribute written as
  ## ` key="value"`.
  if xml.isText:
    return xml.text

  var attrText = ""
  for key, value in xml.attrs:
    attrText.add " " & key & "=\"" & value & "\""

  result = "<" & xml.name & attrText & ">"
  for child in xml.children:
    result.add $child
  result.add "</" & xml.name & ">"
proc reprCore(xml: XmlTag, depth: var int, i: int): string =
  ## Renders `xml` and its descendants as an indented outline, one node per
  ## line. `depth` is raised on entry and restored on exit; each line is
  ## indented by `depth * i` spaces.
  depth += 1
  let pad = repeat(' ', depth * i)
  if xml.isText:
    # Show line breaks as visible escapes so each node stays on one line.
    var shown = xml.text.replace("\n", "\\n")
    shown = shown.replace("\r", "\\r")
    result = pad & "Text: \"" & shown & "\"\n"
  else:
    result = pad & "Tag: " & xml.name & " " & $(xml.attrs) & "\n"
    for child in xml.children:
      result.add reprCore(child, depth, i)
  depth -= 1
proc repr*(xml: XmlTag, indent = 2): string =
  ## Returns an indented outline of `xml`, using `indent` spaces per nesting
  ## level. The root is printed without indentation.
  var level = -1 # reprCore raises this to 0 before printing the root
  result = reprCore(xml, level, indent)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment