Created
September 27, 2016 21:12
-
-
Save eddroid/834f99a2c26d4b1109f598404c992040 to your computer and use it in GitHub Desktop.
broken HTML parser FSM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal markup parser implemented as a character-driven finite state machine.
#
# It understands a single flat element such as
#   <root attrib='value' attrib2="value2">text</root>
# and accumulates three strings: the tag name, the raw attribute text and the
# text content. It is not a general HTML/XML parser: text from every opening
# tag in the input is appended to the same three buffers, and comments,
# processing instructions and entities get no special treatment.
class Node
  # Maps a quote character to the state used while inside a value quoted with it.
  QUOTE_STATES = { "'" => :single_quoted, '"' => :double_quoted }.freeze

  # @return [String] tag name collected from opening tags
  # @return [String] raw attribute text, unquoted whitespace collapsed to single spaces
  # @return [String] text found outside of tags
  attr_reader :name, :attributes, :content

  # Parses +xml_str+ immediately and prints the three parsed fields.
  #
  # @param xml_str [String] markup to parse
  def initialize(xml_str)
    # Three distinct mutable buffers; they are appended to in place with <<,
    # so they must not share one String object.
    @name = ''.dup
    @attributes = ''.dup
    @content = ''.dup
    parse(xml_str)
    p @name, @attributes, @content
  end

  # Feeds every character of +xml_str+ through the state machine, appending to
  # the name, attributes and content buffers.
  #
  # States:
  #   :content        - outside any tag; every character except '<' is content
  #   :tag_name       - directly after '<', reading the tag name
  #   :attributes     - after the tag name, reading unquoted attribute text
  #   :single_quoted /
  #   :double_quoted  - inside a quoted attribute value; everything is literal
  #   :tag_rest       - after an unquoted '/' inside a tag (closing or
  #                     self-closing tag); ignored up to the next '>'
  #
  # An unterminated quote keeps the machine in the quoted state until the end
  # of the input.
  #
  # @param xml_str [String] markup to parse
  # @return [String] the +xml_str+ argument
  def parse(xml_str)
    state = :content
    xml_str.each_char do |char|
      state =
        case state
        when :content then read_content(char)
        when :tag_name then read_tag_name(char)
        when :attributes then read_attributes(char)
        when :tag_rest then char == '>' ? :content : :tag_rest
        else read_quoted(state, char)
        end
    end
    # A separator emitted before a self-closing '/' would otherwise dangle.
    @attributes.rstrip!
    xml_str
  end

  private

  # Outside a tag: '<' opens a tag, anything else (including spaces, '/' and
  # '>') is kept as content.
  def read_content(char)
    return :tag_name if char == '<'

    @content << char
    :content
  end

  # Reading the tag name: whitespace switches to attributes, '/' marks a
  # closing or self-closing tag, a stray '<' is ignored.
  def read_tag_name(char)
    case char
    when '>' then :content
    when '/' then :tag_rest
    when '<' then :tag_name
    when /\s/
      separate_attributes
      :attributes
    else
      @name << char
      :tag_name
    end
  end

  # Reading unquoted attribute text: quotes open a literal section, whitespace
  # becomes a single separating space, an unquoted '/' ends attribute reading.
  def read_attributes(char)
    case char
    when '>' then :content
    when '/' then :tag_rest
    when '<' then :attributes
    when /\s/
      separate_attributes
      :attributes
    else
      @attributes << char
      QUOTE_STATES.fetch(char, :attributes)
    end
  end

  # Inside a quoted value every character is literal; only the matching quote
  # character returns to unquoted attribute reading.
  def read_quoted(state, char)
    @attributes << char
    QUOTE_STATES[char] == state ? :attributes : state
  end

  # Appends one separating space between attributes, never a leading or
  # doubled one.
  def separate_attributes
    @attributes << ' ' unless @attributes.empty? || @attributes.end_with?(' ')
  end
end
# Demo: parse a sample element, then dump the resulting Node object.
sample_markup = "<root attrib='value' attrib2=\"value2\">text</root>"
parsed_node = Node.new(sample_markup)
p parsed_node
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment