Last active
September 4, 2018 09:59
-
-
Save v9n/bf35ae3ca42958fed39aa63e07b7f302 to your computer and use it in GitHub Desktop.
simple_xml.cr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A toy XML parser to demo Crystal, written without ever reading the XML RFC :-(
# So this will absolutely contain bugs.
# This implementation uses a state machine and may perform badly :)
# Maybe a stack-based one would be faster.
# ZeroXML is a toy, non-validating XML parser. `Parser#lex` turns the input
# into a flat list of tokens and `Parser#parse` folds that list into a tree of
# `Element`s.
module ZeroXML
  VERSION = "0.1.0"

  # States of the lexer's state machine (see `Parser#lex`).
  enum State
    OpenTag        # just consumed '<'
    NameTag        # reading the name of an opening tag
    CloseTag       # reading the name of a closing tag (after "</")
    StartAttbName  # inside a tag, waiting for the next attribute name
    NameAttbName   # reading an attribute name
    EndAttbName    # attribute name finished, '=' not seen yet
    StartAttbValue # after '=', waiting for or reading the quoted value
    EndAttbValue   # just consumed the closing quote of a value
    Comment        # inside `<!-- ... -->`
    Data           # character data between tags (the default state)
    CData          # inside `<![CDATA[ ... ]]>`
  end

  # Kinds of token produced by `Parser#lex`.
  enum TokenType
    Prolog    # content of a `<? ... ?>` block
    OpenTag   # name of an opening tag
    CloseTag  # name of a closing tag (also emitted for `<tag/>`)
    AttbName  # attribute name
    AttbValue # attribute value, without the surrounding quotes
    Text      # character data or CDATA content
  end

  # TODO: Add line, col
  alias Token = NamedTuple(name: TokenType, value: String)

  # A node of the parsed tree: tag name, text content, attributes and child
  # elements in document order.
  class Element
    property tag : String?
    property text : String?
    property children : Array(Element)
    property attbs : Hash(String, String)

    def initialize(@tag : String?, @text : String?)
      @children = [] of Element
      @attbs = {} of String => String
    end
  end

  # An element that can additionally carry prologue attributes. The parser in
  # this file never instantiates it.
  class Doc < Element
    @prologue : Hash(String, String)?
  end

  # Lexer and tree builder for a single XML string.
  class Parser
    # Raised by `#lex` when a character is not allowed at its position.
    class InvalidToken < Exception; end

    # Raised by `#parse` when a closing tag does not match the open element.
    class TagMisMatch < Exception; end

    # Index of the character the lexer is currently looking at.
    getter position : Int64
    # The input, with surrounding whitespace removed.
    getter blob : String

    def initialize(@blob : String)
      @position = 0_i64
      @column = 0
      @line = 0
      @tokens = [] of Token
      # Strip every kind of surrounding whitespace (tabs and "\r" included) so
      # that no stray Text token is produced around the root element.
      @blob = @blob.strip
    end

    LIT_EQUAL         = '='
    LIT_TAG_OPEN      = '<'
    LIT_TAG_CLOSE     = '>'
    LIT_SLASH         = '/'
    LIT_SPACE         = ' '
    LIT_TAB           = '\t'
    LIT_NEWLINE       = '\n'
    LIT_CR            = '\r'
    LIT_QUOTE         = '"'
    LIT_SQUOTE        = '\''
    LIT_EXCLAMATION   = '!'
    LIT_QUESTION      = '?'
    LIT_PROLOG_START  = "<?"
    LIT_PROLOG_END    = "?>"
    LIT_COMMENT_START = "!--"
    LIT_COMMENT_END   = "-->"
    LIT_CDATA_START   = "![CDATA["
    LIT_CDATA_END     = "]]>"

    # Splits `blob` into tokens and returns them.
    #
    # The token list is rebuilt from scratch on every call, so calling `lex`
    # (or `parse`) more than once does not duplicate tokens.
    #
    # Raises `InvalidToken` on a misplaced character or when the input ends
    # inside a tag, comment or CDATA section.
    #
    # TODO: Add line, column
    def lex
      @tokens.clear
      data = ""
      tag_name = ""
      attb_name = ""
      attb_value = ""
      # True once '/' was seen inside an opening tag (`<tag/>`).
      self_closing = false
      # Quote character that opened the attribute value being read, if any.
      quote : Char? = nil
      state = State::Data
      @position = -1_i64

      while has_more
        c = ch

        # After the '/' of an empty-element tag only '>' may follow.
        if self_closing && c != LIT_TAG_CLOSE
          raise InvalidToken.new("Expected '>' after '/' at #{position}")
        end

        # Set by the branches below when '>' ends an opening tag.
        tag_closed = false

        case state
        when State::CData
          if looking_at?(LIT_CDATA_END)
            skip_literal(LIT_CDATA_END)
            state = State::Data
          else
            data += c
          end
        when State::Comment
          if looking_at?(LIT_COMMENT_END)
            skip_literal(LIT_COMMENT_END)
            state = State::Data
          end
        when State::OpenTag
          case c
          when LIT_SPACE, LIT_TAB, LIT_NEWLINE, LIT_CR
            # whitespace right after '<' is tolerated
          when LIT_TAG_CLOSE, LIT_QUOTE, LIT_SQUOTE
            raise InvalidToken.new("Invalid token at #{position}")
          when LIT_SLASH
            state = State::CloseTag
          when LIT_EXCLAMATION
            if looking_at?(LIT_COMMENT_START)
              # Skip the whole "!--" so its dashes cannot be read as "-->".
              skip_literal(LIT_COMMENT_START)
              state = State::Comment
            elsif looking_at?(LIT_CDATA_START)
              skip_literal(LIT_CDATA_START)
              state = State::CData
            else
              raise InvalidToken.new("Unsupported markup declaration at #{position}")
            end
          when LIT_QUESTION
            # `<? ... ?>`: emit everything between the markers as one token.
            start = position.to_i32 + 1
            closing = blob.index(LIT_PROLOG_END, start)
            raise InvalidToken.new("Unterminated prolog at #{position}") unless closing
            emit(TokenType::Prolog, blob[start...closing].strip)
            @position = closing.to_i64 + LIT_PROLOG_END.size - 1
            state = State::Data
          else
            state = State::NameTag
            tag_name += c
          end
        when State::NameTag
          case c
          when LIT_SPACE, LIT_TAB, LIT_NEWLINE, LIT_CR
            # tag_name is kept until the tag ends; `<tag ... />` needs it.
            emit(TokenType::OpenTag, tag_name)
            state = State::StartAttbName
          when LIT_TAG_CLOSE
            emit(TokenType::OpenTag, tag_name)
            tag_closed = true
          when LIT_SLASH
            self_closing = true
          else
            tag_name += c
          end
        when State::CloseTag
          if c == LIT_TAG_CLOSE
            # `</name >` is allowed, so drop surrounding whitespace.
            emit(TokenType::CloseTag, tag_name.strip)
            state = State::Data
          else
            tag_name += c
          end
        when State::StartAttbName
          case c
          when LIT_SPACE, LIT_TAB, LIT_NEWLINE, LIT_CR
            # skip whitespace between attributes
          when LIT_QUOTE, LIT_SQUOTE
            raise InvalidToken.new("Invalid quote for attb name at #{position}")
          when LIT_TAG_CLOSE
            tag_closed = true
          when LIT_SLASH
            self_closing = true
          else
            state = State::NameAttbName
            attb_name += c
          end
        when State::NameAttbName
          case c
          when LIT_QUOTE, LIT_SQUOTE
            raise InvalidToken.new("Attb name has invalid quote at #{position}")
          when LIT_SPACE, LIT_NEWLINE, LIT_TAB, LIT_CR, LIT_EQUAL
            emit(TokenType::AttbName, attb_name)
            attb_name = ""
            state = c == LIT_EQUAL ? State::StartAttbValue : State::EndAttbName
          when LIT_TAG_CLOSE
            # Attribute without a value directly before '>'.
            emit(TokenType::AttbName, attb_name)
            attb_name = ""
            tag_closed = true
          when LIT_SLASH
            # Attribute without a value directly before "/>".
            emit(TokenType::AttbName, attb_name)
            attb_name = ""
            self_closing = true
            state = State::EndAttbName
          else
            attb_name += c
          end
        when State::EndAttbName
          case c
          when LIT_SPACE, LIT_NEWLINE, LIT_TAB, LIT_CR
            # skip whitespace after the attribute name
          when LIT_EQUAL
            # `name = "value"`: whitespace before '=' is allowed.
            state = State::StartAttbValue
          when LIT_QUOTE, LIT_SQUOTE
            raise InvalidToken.new("Invalid quote for attb name at #{position}")
          when LIT_TAG_CLOSE
            tag_closed = true
          when LIT_SLASH
            self_closing = true
          else
            # First character of the next attribute name is consumed here, so
            # continue in NameAttbName (not StartAttbName).
            state = State::NameAttbName
            attb_name += c
          end
        when State::StartAttbValue
          if quote.nil?
            case c
            when LIT_QUOTE, LIT_SQUOTE
              quote = c
            when LIT_TAB, LIT_NEWLINE, LIT_SPACE, LIT_CR
              # whitespace between '=' and the opening quote is allowed
            else
              raise InvalidToken.new("Quote is expected at #{position}")
            end
          elsif c == quote
            # Only the quote character that opened the value closes it.
            emit(TokenType::AttbValue, attb_value)
            attb_value = ""
            quote = nil
            state = State::EndAttbValue
          else
            attb_value += c
          end
        when State::EndAttbValue
          case c
          when LIT_TAB, LIT_NEWLINE, LIT_SPACE, LIT_CR
            # skip whitespace after the attribute value
          when LIT_QUOTE, LIT_SQUOTE
            raise InvalidToken.new("Invalid quote for attb name at #{position}")
          when LIT_SLASH
            self_closing = true
          when LIT_TAG_CLOSE
            tag_closed = true
          else
            state = State::NameAttbName
            attb_name += c
          end
        when State::Data
          case c
          when LIT_TAG_OPEN
            emit(TokenType::Text, data) unless data.empty?
            data = ""
            tag_name = ""
            state = State::OpenTag
          when LIT_TAG_CLOSE
            raise InvalidToken.new("Invalid token at #{position}")
          else
            data += c
          end
        end

        if tag_closed
          # `<tag/>` is reported as an opening tag immediately followed by its
          # closing tag.
          emit(TokenType::CloseTag, tag_name) if self_closing
          self_closing = false
          state = State::Data
        end
      end

      unless state == State::Data
        raise InvalidToken.new("Unexpected end of input at #{position}")
      end
      emit(TokenType::Text, data) unless data.empty?

      @tokens
    end

    # Lexes `blob` and builds the element tree from the resulting tokens.
    #
    # Returns the root `Element` (the last one if the input has several), or
    # `nil` when the input contains no element. Prolog tokens and text outside
    # of any element are ignored. Elements still open at the end of the input
    # are returned as they are.
    #
    # Raises `TagMisMatch` when a closing tag does not match the innermost open
    # element (or there is none), and `InvalidToken` from `lex`.
    def parse
      lex
      stack = [] of Element
      doc : Element? = nil
      last_attb = ""

      @tokens.each do |token|
        value = token[:value]
        case token[:name]
        when TokenType::OpenTag
          node = Element.new(value, "")
          if parent = stack.last?
            parent.children << node
          else
            doc = node
          end
          stack << node
        when TokenType::CloseTag
          open_element = stack.last?
          if open_element.nil? || open_element.tag != value
            raise TagMisMatch.new("Open/Close tags are mismatched")
          end
          stack.pop
        when TokenType::AttbName
          last_attb = value
          # An attribute without a value maps to the empty string.
          stack.last.attbs[last_attb] = ""
        when TokenType::AttbValue
          stack.last.attbs[last_attb] = value
        when TokenType::Text
          # Append instead of overwrite, so text before a child element is not
          # lost to the whitespace that follows the child.
          if owner = stack.last?
            owner.text = "#{owner.text}#{value}"
          end
        end
      end

      doc
    end

    # Pretty-prints *tree* with `pp`.
    def print_tree(tree)
      pp tree
    end

    # Appends a token to the token list.
    private def emit(type : TokenType, value : String)
      @tokens << {name: type, value: value}
    end

    # True when the input at the current position starts with *literal*.
    private def looking_at?(literal : String) : Bool
      blob[position, literal.size] == literal
    end

    # Moves to the last character of *literal*, which starts at the current
    # position; the next `has_more` call then steps past it.
    private def skip_literal(literal : String)
      @position += literal.size - 1
    end

    # Character at the current position.
    private def ch
      @blob[@position]
    end

    # Advances to the next character; false once the input is exhausted.
    private def has_more
      @position += 1
      position < blob.size
    end
  end

  # Parses *blob* and pretty-prints the resulting tree.
  def self.parse(blob : String)
    parser = Parser.new(blob)
    parser.print_tree(parser.parse)
  end
end
# Demo: parse a small sample menu and pretty-print the resulting tree.
sample_menu = <<-XML
<breakfast_menu>
<food>
<!-- this is a comment -->
<name>belgian waffles</name>
<price>$5.95</price>
<description>
two of our famous belgian waffles with plenty of real maple syrup
</description>
<calories>650</calories>
</food>
<food country="us">
<name>berry-berry belgian waffles</name>
<price>$8.95</price>
<description>
<![CDATA[
belgian waffles covered with <strong>assorted fresh berries</strong> and whipped cream
]]>
</description>
<calories>900</calories>
</food>
<food country="french" healthy="good">
<name>homestyle breakfast</name>
<price>$6.95</price>
<description>
two eggs, bacon or sausage, toast, and our ever-popular hash browns
</description>
<calories>950</calories>
</food>
</breakfast_menu>
XML

ZeroXML.parse(sample_menu)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment