Created
May 1, 2020 10:29
-
-
Save Snektron/b5ed260962bf9b203cd04aa4175430bf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
const mem = std.mem; | |
const testing = std.testing; | |
const Allocator = mem.Allocator; | |
const ArenaAllocator = std.heap.ArenaAllocator; | |
const SegmentedList = std.SegmentedList; | |
pub const Attribute = struct { | |
name: []const u8, | |
value: []const u8 | |
}; | |
pub const Content = union(enum) { | |
CharData: []const u8, | |
Comment: []const u8, | |
Element: *Element | |
}; | |
pub const Element = struct { | |
const AttributeList = SegmentedList(*Attribute, 0); | |
const ContentList = SegmentedList(Content, 0); | |
tag: []const u8, | |
attributes: AttributeList, | |
children: ContentList, | |
fn init(tag: []const u8, alloc: *Allocator) Element { | |
return .{ | |
.tag = tag, | |
.attributes = AttributeList.init(alloc), | |
.children = ContentList.init(alloc), | |
}; | |
} | |
fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 { | |
var it = self.attributes.iterator(0); | |
while (it.next()) |child| { | |
if (mem.eql(u8, child.*.name, attrib_name)) { | |
return child.*.value; | |
} | |
} | |
return null; | |
} | |
fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 { | |
const child = self.findChildByTag(child_tag) orelse return null; | |
if (child.children.count() != 1) { | |
return null; | |
} | |
return switch (child.children.at(0).*) { | |
.CharData => |char_data| char_data, | |
else => null | |
}; | |
} | |
fn findChildByTag(self: *Element, tag: []const u8) ?*Element { | |
return self.findChildrenByTag(tag).next(); | |
} | |
fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator { | |
return .{ | |
.inner = self.children.iterator(0), | |
.tag = tag | |
}; | |
} | |
pub const FindChildrenByTagIterator = struct { | |
inner: ContentList.Iterator, | |
tag: []const u8, | |
fn next(self: *FindChildrenByTagIterator) ?*Element { | |
while (self.inner.next()) |child| { | |
if (child.* != .Element or !mem.eql(u8, child.*.Element.tag, self.tag)) { | |
continue; | |
} | |
return child.*.Element; | |
} | |
return null; | |
} | |
}; | |
}; | |
pub const XmlDecl = struct { | |
version: []const u8, | |
encoding: ?[]const u8, | |
standalone: ?bool | |
}; | |
pub const Document = struct { | |
arena: ArenaAllocator, | |
xml_decl: ?*XmlDecl, | |
root: *Element, | |
pub fn deinit(self: Document) void { | |
var arena = self.arena; // Copy to stack so self can be taken by value. | |
arena.deinit(); | |
} | |
}; | |
const ParseContext = struct { | |
source: []const u8, | |
offset: usize, | |
line: usize, | |
column: usize, | |
fn init(source: []const u8) ParseContext { | |
return .{ | |
.source = source, | |
.offset = 0, | |
.line = 0, | |
.column = 0 | |
}; | |
} | |
fn peek(self: *ParseContext) ?u8 { | |
return if (self.offset < self.source.len) self.source[self.offset] else null; | |
} | |
fn consume(self: *ParseContext) !u8 { | |
if (self.offset < self.source.len) { | |
return self.consumeNoEof(); | |
} | |
return error.UnexpectedEof; | |
} | |
fn consumeNoEof(self: *ParseContext) u8 { | |
std.debug.assert(self.offset < self.source.len); | |
const c = self.source[self.offset]; | |
self.offset += 1; | |
if (c == '\n') { | |
self.line += 1; | |
self.column = 0; | |
} else { | |
self.column += 1; | |
} | |
return c; | |
} | |
fn eat(self: *ParseContext, char: u8) bool { | |
self.expect(char) catch return false; | |
return true; | |
} | |
fn expect(self: *ParseContext, expected: u8) !void { | |
if (self.peek()) |actual| { | |
if (expected != actual) { | |
return error.UnexpectedCharacter; | |
} | |
_ = self.consumeNoEof(); | |
return; | |
} | |
return error.UnexpectedEof; | |
} | |
fn eatStr(self: *ParseContext, text: []const u8) bool { | |
self.expectStr(text) catch return false; | |
return true; | |
} | |
fn expectStr(self: *ParseContext, text: []const u8) !void { | |
if (self.source.len < self.offset + text.len) { | |
return error.UnexpectedEof; | |
} else if (std.mem.startsWith(u8, self.source[self.offset ..], text)) { | |
var i: usize = 0; | |
while (i < text.len) : (i += 1) { | |
_ = self.consumeNoEof(); | |
} | |
return; | |
} | |
return error.UnexpectedCharacter; | |
} | |
fn eatWs(self: *ParseContext) bool { | |
var ws = false; | |
while (self.peek()) |ch| { | |
switch (ch) { | |
' ', '\t', '\n', '\r' => { | |
ws = true; | |
_ = self.consumeNoEof(); | |
}, | |
else => break | |
} | |
} | |
return ws; | |
} | |
fn expectWs(self: *ParseContext) !void { | |
if (!self.eatWs()) return error.UnexpectedCharacter; | |
} | |
fn currentLine(self: ParseContext) []const u8 { | |
var begin: usize = 0; | |
if (mem.indexOfScalarPos(u8, self.source[0 .. self.offset], '\n')) |prev_nl| { | |
begin = prev_nl + 1; | |
} | |
var end = mem.indexOfScalarPos(u8, self.source, self.offset, '\n') orelse self.source.len; | |
return self.source[begin .. end]; | |
} | |
}; | |
test "ParseContext" { | |
{ | |
var ctx = ParseContext.init("I like pythons"); | |
testing.expectEqual(@as(?u8, 'I'), ctx.peek()); | |
testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof()); | |
testing.expectEqual(@as(?u8, ' '), ctx.peek()); | |
testing.expectEqual(@as(u8, ' '), try ctx.consume()); | |
testing.expect(ctx.eat('l')); | |
testing.expectEqual(@as(?u8, 'i'), ctx.peek()); | |
testing.expectEqual(false, ctx.eat('a')); | |
testing.expectEqual(@as(?u8, 'i'), ctx.peek()); | |
try ctx.expect('i'); | |
testing.expectEqual(@as(?u8, 'k'), ctx.peek()); | |
testing.expectError(error.UnexpectedCharacter, ctx.expect('a')); | |
testing.expectEqual(@as(?u8, 'k'), ctx.peek()); | |
testing.expect(ctx.eatStr("ke")); | |
testing.expectEqual(@as(?u8, ' '), ctx.peek()); | |
testing.expect(ctx.eatWs()); | |
testing.expectEqual(@as(?u8, 'p'), ctx.peek()); | |
testing.expectEqual(false, ctx.eatWs()); | |
testing.expectEqual(@as(?u8, 'p'), ctx.peek()); | |
testing.expectEqual(false, ctx.eatStr("aaaaaaaaa")); | |
testing.expectEqual(@as(?u8, 'p'), ctx.peek()); | |
testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa")); | |
testing.expectEqual(@as(?u8, 'p'), ctx.peek()); | |
testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn")); | |
testing.expectEqual(@as(?u8, 'p'), ctx.peek()); | |
try ctx.expectStr("python"); | |
testing.expectEqual(@as(?u8, 's'), ctx.peek()); | |
} | |
{ | |
var ctx = ParseContext.init(""); | |
testing.expectEqual(ctx.peek(), null); | |
testing.expectError(error.UnexpectedEof, ctx.consume()); | |
testing.expectEqual(ctx.eat('p'), false); | |
testing.expectError(error.UnexpectedEof, ctx.expect('p')); | |
} | |
} | |
pub const ParseError = error { | |
IllegalCharacter, | |
UnexpectedEof, | |
UnexpectedCharacter, | |
UnclosedValue, | |
UnclosedComment, | |
InvalidName, | |
InvalidEntity, | |
InvalidStandaloneValue, | |
NonMatchingClosingTag, | |
InvalidDocument, | |
OutOfMemory | |
}; | |
pub fn parse(backing_allocator: *Allocator, source: []const u8) !Document { | |
var ctx = ParseContext.init(source); | |
return try parseDocument(&ctx, backing_allocator); | |
} | |
fn parseDocument(ctx: *ParseContext, backing_allocator: *Allocator) !Document { | |
var doc = Document{ | |
.arena = ArenaAllocator.init(backing_allocator), | |
.xml_decl = null, | |
.root = undefined | |
}; | |
errdefer doc.deinit(); | |
doc.xml_decl = try tryParseProlog(ctx, &doc.arena.allocator); | |
_ = ctx.eatWs(); | |
doc.root = (try tryParseElement(ctx, &doc.arena.allocator)) orelse return error.InvalidDocument; | |
_ = ctx.eatWs(); | |
if (ctx.peek() != null) return error.InvalidDocument; | |
return doc; | |
} | |
fn parseAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 { | |
const quote = try ctx.consume(); | |
if (quote != '"' and quote != '\'') return error.UnexpectedCharacter; | |
const begin = ctx.offset; | |
while (true) { | |
const c = ctx.consume() catch return error.UnclosedValue; | |
if (c == quote) break; | |
} | |
const end = ctx.offset - 1; | |
return try dupeAndUnescape(alloc, ctx.source[begin .. end]); | |
} | |
fn parseEqAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 { | |
_ = ctx.eatWs(); | |
try ctx.expect('='); | |
_ = ctx.eatWs(); | |
return try parseAttrValue(ctx, alloc); | |
} | |
fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 { | |
// XML's spec on names is very long, so to make this easier | |
// we just take any character that is not special and not whitespace | |
const begin = ctx.offset; | |
while (ctx.peek()) |ch| { | |
switch (ch) { | |
' ', '\t', '\n', '\r' => break, | |
'&', '"', '\'', '<', '>', '?', '=', '/' => break, | |
else => _ = ctx.consumeNoEof() | |
} | |
} | |
const end = ctx.offset; | |
if (begin == end) return error.InvalidName; | |
return ctx.source[begin .. end]; | |
} | |
fn tryParseCharData(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 { | |
const begin = ctx.offset; | |
while (ctx.peek()) |ch| { | |
switch (ch) { | |
'<', '>' => break, | |
else => _ = ctx.consumeNoEof() | |
} | |
} | |
const end = ctx.offset; | |
if (begin == end) return null; | |
return try dupeAndUnescape(alloc, ctx.source[begin .. end]); | |
} | |
fn parseContent(ctx: *ParseContext, alloc: *Allocator) ParseError!Content { | |
if (try tryParseCharData(ctx, alloc)) |cd| { | |
return Content{.CharData = cd}; | |
} else if (try tryParseComment(ctx, alloc)) |comment| { | |
return Content{.Comment = comment}; | |
} else if (try tryParseElement(ctx, alloc)) |elem| { | |
return Content{.Element = elem}; | |
} else { | |
return error.UnexpectedCharacter; | |
} | |
} | |
fn tryParseAttr(ctx: *ParseContext, alloc: *Allocator) !?*Attribute { | |
const name = parseNameNoDupe(ctx) catch return null; | |
_ = ctx.eatWs(); | |
try ctx.expect('='); | |
_ = ctx.eatWs(); | |
const value = try parseAttrValue(ctx, alloc); | |
const attr = try alloc.create(Attribute); | |
attr.name = try mem.dupe(alloc, u8, name); | |
attr.value = value; | |
return attr; | |
} | |
fn tryParseElement(ctx: *ParseContext, alloc: *Allocator) !?*Element { | |
const start = ctx.offset; | |
if (!ctx.eat('<')) return null; | |
const tag = parseNameNoDupe(ctx) catch { | |
ctx.offset = start; | |
return null; | |
}; | |
const element = try alloc.create(Element); | |
element.* = Element.init(try std.mem.dupe(alloc, u8, tag), alloc); | |
while (ctx.eatWs()) { | |
const attr = (try tryParseAttr(ctx, alloc)) orelse break; | |
try element.attributes.push(attr); | |
} | |
if (ctx.eatStr("/>")) { | |
return element; | |
} | |
try ctx.expect('>'); | |
while (true) { | |
if (ctx.peek() == null) { | |
return error.UnexpectedEof; | |
} else if (ctx.eatStr("</")) { | |
break; | |
} | |
const content = try parseContent(ctx, alloc); | |
try element.children.push(content); | |
} | |
const closing_tag = try parseNameNoDupe(ctx); | |
if (!std.mem.eql(u8, tag, closing_tag)) { | |
return error.NonMatchingClosingTag; | |
} | |
_ = ctx.eatWs(); | |
try ctx.expect('>'); | |
return element; | |
} | |
test "tryParseElement" { | |
{ | |
var ctx = ParseContext.init("<= a='b'/>"); | |
testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, std.debug.global_allocator)); | |
testing.expectEqual(@as(?u8, '<'), ctx.peek()); | |
} | |
{ | |
var ctx = ParseContext.init("<python size='15' color = \"green\"/>"); | |
const elem = try tryParseElement(&ctx, std.debug.global_allocator); | |
testing.expectEqualSlices(u8, elem.?.tag, "python"); | |
const size_attr = elem.?.attributes.at(0).*; | |
testing.expectEqualSlices(u8, size_attr.name, "size"); | |
testing.expectEqualSlices(u8, size_attr.value, "15"); | |
const color_attr = elem.?.attributes.at(1).*; | |
testing.expectEqualSlices(u8, color_attr.name, "color"); | |
testing.expectEqualSlices(u8, color_attr.value, "green"); | |
} | |
{ | |
var ctx = ParseContext.init("<python>test</python>"); | |
const elem = try tryParseElement(&ctx, std.debug.global_allocator); | |
testing.expectEqualSlices(u8, elem.?.tag, "python"); | |
testing.expectEqualSlices(u8, elem.?.children.at(0).CharData, "test"); | |
} | |
{ | |
var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>"); | |
const elem = try tryParseElement(&ctx, std.debug.global_allocator); | |
testing.expectEqualSlices(u8, elem.?.tag, "a"); | |
testing.expectEqualSlices(u8, elem.?.children.at(0).CharData, "b"); | |
testing.expectEqualSlices(u8, elem.?.children.at(1).Element.tag, "c"); | |
testing.expectEqualSlices(u8, elem.?.children.at(2).CharData, "d"); | |
testing.expectEqualSlices(u8, elem.?.children.at(3).Element.tag, "e"); | |
testing.expectEqualSlices(u8, elem.?.children.at(4).CharData, "f"); | |
testing.expectEqualSlices(u8, elem.?.children.at(5).Comment, "g"); | |
} | |
} | |
fn tryParseProlog(ctx: *ParseContext, alloc: *Allocator) !?*XmlDecl { | |
const start = ctx.offset; | |
if (!ctx.eatStr("<?") or !mem.eql(u8, try parseNameNoDupe(ctx), "xml")) { | |
ctx.offset = start; | |
return null; | |
} | |
const decl = try alloc.create(XmlDecl); | |
decl.encoding = null; | |
decl.standalone = null; | |
// Version info is mandatory | |
try ctx.expectWs(); | |
try ctx.expectStr("version"); | |
decl.version = try parseEqAttrValue(ctx, alloc); | |
if (ctx.eatWs()) { | |
// Optional encoding and standalone info | |
var require_ws = false; | |
if (ctx.eatStr("encoding")) { | |
decl.encoding = try parseEqAttrValue(ctx, alloc); | |
require_ws = true; | |
} | |
if (require_ws == ctx.eatWs() and ctx.eatStr("standalone")) { | |
const standalone = try parseEqAttrValue(ctx, alloc); | |
if (std.mem.eql(u8, standalone, "yes")) { | |
decl.standalone = true; | |
} else if (std.mem.eql(u8, standalone, "no")) { | |
decl.standalone = false; | |
} else { | |
return error.InvalidStandaloneValue; | |
} | |
} | |
_ = ctx.eatWs(); | |
} | |
try ctx.expectStr("?>"); | |
return decl; | |
} | |
test "tryParseProlog" { | |
{ | |
var ctx = ParseContext.init("<?xmla version='aa'?>"); | |
testing.expectEqual(@as(?*XmlDecl, null), try tryParseProlog(&ctx, std.debug.global_allocator)); | |
testing.expectEqual(@as(?u8, '<'), ctx.peek()); | |
} | |
{ | |
var ctx = ParseContext.init("<?xml version='aa'?>"); | |
const decl = try tryParseProlog(&ctx, std.debug.global_allocator); | |
testing.expectEqualSlices(u8, "aa", decl.?.version); | |
testing.expectEqual(@as(?[]const u8, null), decl.?.encoding); | |
testing.expectEqual(@as(?bool, null), decl.?.standalone); | |
} | |
{ | |
var ctx = ParseContext.init("<?xml version=\"aa\" encoding = 'bbb' standalone \t = 'yes'?>"); | |
const decl = try tryParseProlog(&ctx, std.debug.global_allocator); | |
testing.expectEqualSlices(u8, "aa", decl.?.version); | |
testing.expectEqualSlices(u8, "bbb", decl.?.encoding.?); | |
testing.expectEqual(@as(?bool, true), decl.?.standalone.?); | |
} | |
} | |
fn tryParseComment(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 { | |
if (!ctx.eatStr("<!--")) return null; | |
const begin = ctx.offset; | |
while (!ctx.eatStr("-->")) { | |
_ = ctx.consume() catch return error.UnclosedComment; | |
} | |
const end = ctx.offset - "-->".len; | |
return try mem.dupe(alloc, u8, ctx.source[begin .. end]); | |
} | |
fn unescapeEntity(text: []const u8) !u8 { | |
const EntitySubstition = struct { | |
text: []const u8, | |
replacement: u8 | |
}; | |
const entities = [_]EntitySubstition{ | |
.{.text = "<", .replacement = '<'}, | |
.{.text = ">", .replacement = '>'}, | |
.{.text = "&", .replacement = '&'}, | |
.{.text = "'", .replacement = '\''}, | |
.{.text = """, .replacement = '"'} | |
}; | |
for (entities) |entity| { | |
if (std.mem.eql(u8, text, entity.text)) return entity.replacement; | |
} | |
return error.InvalidEntity; | |
} | |
fn dupeAndUnescape(alloc: *Allocator, text: []const u8) ![]const u8 { | |
const str = try alloc.alloc(u8, text.len); | |
var j: usize = 0; | |
var i: usize = 0; | |
while (i < text.len) : (j += 1) { | |
if (text[i] == '&') { | |
const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity); | |
str[j] = try unescapeEntity(text[i .. entity_end]); | |
i = entity_end; | |
} else { | |
str[j] = text[i]; | |
i += 1; | |
} | |
} | |
return alloc.shrink(str, j); | |
} | |
test "dupeAndUnescape" { | |
testing.expectEqualSlices(u8, "test", try dupeAndUnescape(std.debug.global_allocator, "test")); | |
testing.expectEqualSlices(u8, "a<b&c>d\"e'f<", try dupeAndUnescape(std.debug.global_allocator, "a<b&c>d"e'f<")); | |
testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&")); | |
testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&&")); | |
testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&test;")); | |
testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&boa")); | |
} |
Hi, this gist is actually an old version that i provided here for reference a long time ago. A more proper version of this parser is actually used as part of vulkan-zig here: https://github.com/Snektron/vulkan-zig/blob/master/generator/xml.zig. In the mean time I have also made a bunch of upgrades to make it a little more efficient
Thank you for the info 👍
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello, thank you for providing this XML parser. I recently updated this gist to work with latest version of Zig (0.10.1). Would you be interested to update your gist with those changes so more people can find a solution to their XML parsing needs?