# Source: GitHub Gist by @v9n (gist id: v9n/bf35ae3ca42958fed39aa63e07b7f302)
# Last active: September 4, 2018 09:59
# File: simple_xml.cr
# A toy XML parser to demo Crystal, written without ever reading the XML RFC :-(
# So this will absolutely contain bugs.
# This implementation uses a state machine and may perform badly :)
# Maybe a stack-based implementation would be faster.
module ZeroXML
VERSION = "0.1.0"
# States of the character-by-character lexer implemented in `Parser#lex`.
enum State
# A `<` was just consumed; the next character decides between an opening
# tag, a closing tag (`/`), or a comment / CDATA section (`!`).
OpenTag
# Accumulating the name of an opening tag.
NameTag
# Accumulating the name of a closing tag (entered after `</`).
CloseTag
# Inside a tag, after its name; waiting for an attribute name to begin.
StartAttbName
# Accumulating the characters of an attribute name.
NameAttbName
# An attribute name has been terminated by whitespace.
EndAttbName
# After the `=` of an attribute; reading the quoted attribute value.
StartAttbValue
# The closing quote of an attribute value has been consumed.
EndAttbValue
# Inside `<!-- ... -->`; input is skipped until `-->` is found.
Comment
# Character data between tags. This is the lexer's initial state.
Data
# Inside `<![CDATA[ ... ]]>`; input is copied verbatim until `]]>` is found.
CData
end
# Kinds of token stored in a `Token`'s `name` field.
enum TokenType
# Presumably the `<?...?>` document prolog (see the `LIT_PROLOG_*` constants
# in `Parser`); `Parser#parse` has no branch for it.
Prolog
# Name of an opening tag, e.g. `food` for `<food>`.
OpenTag
# Name of a closing tag, e.g. `food` for `</food>`.
CloseTag
# Name of an attribute.
AttbName
# Value of the attribute whose name was emitted just before.
AttbValue
# Character data found between tags (including CDATA content).
Text
end
# A single lexer token: `name` holds the token's `TokenType` and `value` the
# text it carries (tag name, attribute name/value or character data).
# TODO: Add line, col
alias Token = NamedTuple(name: TokenType, value: String)
# One node of the parsed tree.
#
# An element has an optional tag name, optional text content, an ordered list
# of child elements and a map from attribute name to attribute value.
class Element
  # Tag name of this element, if any.
  property tag : String?
  # Text content of this element, if any.
  property text : String?
  # Child elements in document order.
  property children : Array(Element)
  # Attribute values keyed by attribute name.
  property attbs : Hash(String, String)

  # Builds an element with the given *tag* and *text*; it starts out with no
  # children and no attributes.
  def initialize(tag : String?, text : String?)
    @tag = tag
    @text = text
    @children = Array(Element).new
    @attbs = Hash(String, String).new
  end
end
# An `Element` subclass that additionally declares an optional prologue
# attribute map. Nothing in this file instantiates `Doc` or reads/writes
# `@prologue`.
class Doc < Element
@prologue : Hash(String, String)?
end
# Lexes and parses an XML blob into a tree of `Element`s.
#
# `lex` is a character-at-a-time state machine (see `State`) that turns the
# input into a flat list of `Token`s. `parse` then folds those tokens into
# nested `Element`s using a stack of the currently open tags.
class Parser
  # Raised when the lexer or parser meets input that is not allowed where it
  # appears.
  class InvalidToken < Exception; end

  # Raised when a closing tag does not match the innermost open tag.
  class TagMisMatch < Exception; end

  # Character index of the character currently being lexed.
  getter position : Int64
  # The input document with surrounding whitespace removed.
  getter blob : String

  # Stores *blob* with leading/trailing whitespace stripped, so that the
  # document starts at its first `<` and ends at its last `>`.
  def initialize(@blob : String)
    @position = 0_i64
    # Reserved for the "Add line, column" TODO; not updated yet.
    @column = 0
    @line = 0
    @tokens = [] of Token
    @blob = @blob.strip
  end

  LIT_EQUAL         = '='
  LIT_TAG_OPEN      = '<'
  LIT_TAG_CLOSE     = '>'
  LIT_SLASH         = '/'
  LIT_SPACE         = ' '
  LIT_TAB           = '\t'
  LIT_NEWLINE       = '\n'
  LIT_QUOTE         = '"'
  LIT_SQUOTE        = '\''
  LIT_EXCLAMATION   = '!'
  LIT_PROLOG_START  = "<?"
  LIT_PROLOG_END    = "?>"
  LIT_COMMENT_START = "!--"
  LIT_COMMENT_END   = "-->"
  LIT_CDATA_START   = "![CDATA["
  LIT_CDATA_END     = "]]>"

  # Tokenizes `blob` and returns the token list.
  #
  # Every call starts from scratch, so calling `lex` (or `parse`) more than
  # once does not duplicate tokens.
  #
  # A self-closing tag (`<br/>`) yields an `OpenTag` token, its attribute
  # tokens, and then a `CloseTag` token with the same name. A `<?...?>`
  # prolog / processing instruction yields a single `Prolog` token.
  #
  # Raises `InvalidToken` on characters that are not allowed in the current
  # state (e.g. a quote inside a tag name, an unquoted attribute value, or a
  # bare `>` in character data).
  #
  # TODO: Add line, column
  def lex
    @tokens = [] of Token
    data = ""
    tag_name = ""
    # Name of the opening tag whose attributes are being read; needed to emit
    # the matching CloseTag when the tag turns out to be self-closing.
    current_tag = ""
    attb_name = ""
    attb_value = ""
    self_closing = false
    # The quote character that opened the current attribute value, if any.
    open_quote : Char? = nil
    state = State::Data
    # `has_more` advances before testing, so start one before the first char.
    @position = -1_i64

    while has_more
      case state
      when State::CData
        if looking_at?(LIT_CDATA_END)
          skip_rest_of(LIT_CDATA_END)
          state = State::Data
        else
          data += ch
        end
      when State::Comment
        if looking_at?(LIT_COMMENT_END)
          skip_rest_of(LIT_COMMENT_END)
          state = State::Data
        end
      when State::OpenTag
        case ch
        when .ascii_whitespace?
          noop
        when LIT_TAG_CLOSE, LIT_QUOTE, LIT_SQUOTE
          raise InvalidToken.new("Invalid token at #{position}")
        when LIT_SLASH
          state = State::CloseTag
        when LIT_EXCLAMATION
          if looking_at?(LIT_COMMENT_START)
            state = State::Comment
          elsif looking_at?(LIT_CDATA_START)
            state = State::CData
            skip_rest_of(LIT_CDATA_START)
          end
        else
          state = State::NameTag
          tag_name += ch
        end
      when State::NameTag
        case ch
        when .ascii_whitespace?
          emit(TokenType::OpenTag, tag_name)
          current_tag = tag_name
          tag_name = ""
          state = State::StartAttbName
        when LIT_TAG_CLOSE
          emit(TokenType::OpenTag, tag_name)
          emit(TokenType::CloseTag, tag_name) if self_closing
          tag_name = ""
          state = State::Data
        when LIT_SLASH
          # Only `/>` marks a self-closing tag; a stray `/` is ignored.
          self_closing = true if followed_by?(LIT_TAG_CLOSE)
        else
          tag_name += ch
        end
      when State::CloseTag
        if ch == LIT_TAG_CLOSE
          # `</food >` must still match `<food>`.
          emit(TokenType::CloseTag, tag_name.strip)
          tag_name = ""
          state = State::Data
        else
          tag_name += ch
        end
      when State::StartAttbName
        case ch
        when .ascii_whitespace?
          noop
        when LIT_QUOTE, LIT_SQUOTE
          raise InvalidToken.new("Invalid quote for attb name at #{position}")
        when LIT_SLASH
          self_closing = true if followed_by?(LIT_TAG_CLOSE)
        when LIT_TAG_CLOSE
          emit(TokenType::CloseTag, current_tag) if self_closing
          state = State::Data
        else
          state = State::NameAttbName
          attb_name += ch
        end
      when State::NameAttbName
        case ch
        when LIT_QUOTE, LIT_SQUOTE
          raise InvalidToken.new("Attb name has invalid quote at #{position}")
        when LIT_EQUAL
          emit(TokenType::AttbName, attb_name)
          attb_name = ""
          state = State::StartAttbValue
        when .ascii_whitespace?
          emit(TokenType::AttbName, attb_name)
          attb_name = ""
          state = State::EndAttbName
        when LIT_TAG_CLOSE
          # Valueless attribute right before `>`: emit it instead of letting
          # its name leak into the next attribute.
          emit(TokenType::AttbName, attb_name)
          attb_name = ""
          emit(TokenType::CloseTag, current_tag) if self_closing
          state = State::Data
        when LIT_SLASH
          self_closing = true if followed_by?(LIT_TAG_CLOSE)
        else
          attb_name += ch
        end
      when State::EndAttbName
        case ch
        when .ascii_whitespace?
          noop
        when LIT_EQUAL
          # `name = "value"`: whitespace is allowed before the `=`.
          state = State::StartAttbValue
        when LIT_QUOTE, LIT_SQUOTE
          raise InvalidToken.new("Invalid quote for attb name at #{position}")
        when LIT_TAG_CLOSE
          emit(TokenType::CloseTag, current_tag) if self_closing
          state = State::Data
        when LIT_SLASH
          self_closing = true if followed_by?(LIT_TAG_CLOSE)
        else
          # First character of the next attribute name.
          state = State::NameAttbName
          attb_name += ch
        end
      when State::StartAttbValue
        if quote = open_quote
          # Inside the quotes everything is literal (whitespace, `>`, the
          # other quote character) until the same quote appears again.
          if ch == quote
            emit(TokenType::AttbValue, attb_value)
            attb_value = ""
            open_quote = nil
            state = State::EndAttbValue
          else
            attb_value += ch
          end
        else
          case ch
          when LIT_QUOTE, LIT_SQUOTE
            open_quote = ch
          when .ascii_whitespace?
            noop
          else
            raise InvalidToken.new("Quote is expected at #{@position}")
          end
        end
      when State::EndAttbValue
        case ch
        when .ascii_whitespace?
          noop
        when LIT_SLASH
          self_closing = true if followed_by?(LIT_TAG_CLOSE)
        when LIT_TAG_CLOSE
          emit(TokenType::CloseTag, current_tag) if self_closing
          state = State::Data
        when LIT_QUOTE, LIT_SQUOTE
          raise InvalidToken.new("Invalid quote for attb name at #{position}")
        else
          # First character of the next attribute name.
          state = State::NameAttbName
          attb_name += ch
        end
      when State::Data
        case ch
        when LIT_TAG_OPEN
          emit(TokenType::Text, data) unless data.empty?
          data = ""
          if looking_at?(LIT_PROLOG_START)
            # Consumes through `?>`; the lexer stays in the Data state.
            lex_prolog
          else
            state = State::OpenTag
            tag_name = ""
            self_closing = false
          end
        when LIT_TAG_CLOSE
          raise InvalidToken.new("Invalid token at #{@position}")
        else
          data += ch
        end
      end
    end
    @tokens
  end

  # Lexes `blob` and builds the element tree from the resulting tokens.
  #
  # Returns the root `Element`, or `nil` when the input contains no tag. If
  # the input has several top-level elements the last one is returned.
  # Text tokens replace (not append to) the text of the innermost open
  # element. Whitespace-only text outside any element is ignored.
  #
  # Raises `TagMisMatch` when a closing tag does not match the innermost open
  # tag (or there is none), and `InvalidToken` for non-blank text outside of
  # any element.
  def parse
    lex
    stack = [] of Element
    doc : Element? = nil
    last_attb = ""
    @tokens.each do |token|
      value = token[:value]
      case token[:name]
      when TokenType::Prolog
        # Not part of the element tree.
        noop
      when TokenType::OpenTag
        node = Element.new(value, "")
        if parent = stack.last?
          parent.children << node
        else
          doc = node
        end
        stack << node
      when TokenType::CloseTag
        innermost = stack.last?
        unless innermost && innermost.tag == value
          raise TagMisMatch.new("Open/Close tags are mismatched")
        end
        stack.pop
      when TokenType::AttbName
        # The lexer only emits attribute tokens after an OpenTag token, so
        # the stack is never empty here.
        last_attb = value
        stack.last.attbs[last_attb] = ""
      when TokenType::AttbValue
        stack.last.attbs[last_attb] = value
      when TokenType::Text
        if owner = stack.last?
          owner.text = value
        elsif !value.blank?
          raise InvalidToken.new("Text outside of the root element")
        end
      end
    end
    doc
  end

  # Pretty-prints *tree* to standard output using `pp`.
  def print_tree(tree)
    pp tree
  end

  # Explicit "do nothing" marker for states that skip a character.
  private def noop
  end

  # The character at the current position.
  private def ch
    @blob[@position]
  end

  # Advances to the next character and reports whether one exists.
  private def has_more
    @position += 1
    position < blob.size
  end

  # Appends a token of the given *type* carrying *value*.
  private def emit(type : TokenType, value : String)
    @tokens << {name: type, value: value}
  end

  # Whether *literal* occurs in `blob` starting at the current position.
  private def looking_at?(literal : String) : Bool
    @blob[@position...(@position + literal.size)] == literal
  end

  # Moves to the last character of *literal*, which starts at the current
  # position; the next `has_more` then steps past it.
  private def skip_rest_of(literal : String)
    @position += literal.size - 1
  end

  # Whether the character right after the current one is *char*.
  private def followed_by?(char : Char) : Bool
    @blob[@position + 1]? == char
  end

  # Consumes a `<?...?>` section starting at the current `<`, emits its
  # content as a `Prolog` token and leaves the position on the final `>`.
  private def lex_prolog
    start = @position.to_i + LIT_PROLOG_START.size
    stop = @blob.index(LIT_PROLOG_END, start) ||
           raise InvalidToken.new("Unterminated prolog at #{@position}")
    emit(TokenType::Prolog, @blob[start...stop].strip)
    @position = (stop + LIT_PROLOG_END.size - 1).to_i64
  end
end
# Convenience entry point: parses *blob* with a fresh `Parser` and
# pretty-prints the resulting tree via `Parser#print_tree`.
def self.parse(blob : String)
  parser = Parser.new(blob)
  parser.print_tree(parser.parse)
end
end
# Demo input: a small menu document that contains a comment, a CDATA section
# and tags with one and two attributes.
raw_doc = <<-xml
<breakfast_menu>
<food>
<!-- this is a comment -->
<name>belgian waffles</name>
<price>$5.95</price>
<description>
two of our famous belgian waffles with plenty of real maple syrup
</description>
<calories>650</calories>
</food>
<food country="us">
<name>berry-berry belgian waffles</name>
<price>$8.95</price>
<description>
<![CDATA[
belgian waffles covered with <strong>assorted fresh berries</strong> and whipped cream
]]>
</description>
<calories>900</calories>
</food>
<food country="french" healthy="good">
<name>homestyle breakfast</name>
<price>$6.95</price>
<description>
two eggs, bacon or sausage, toast, and our ever-popular hash browns
</description>
<calories>950</calories>
</food>
</breakfast_menu>
xml
# Parse the demo document and pretty-print the resulting tree.
ZeroXML.parse(raw_doc)
# (GitHub page footer) Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.