Skip to content

Instantly share code, notes, and snippets.

@isocroft
Last active November 17, 2025 12:55
Show Gist options
  • Select an option

  • Save isocroft/6f282a93cd699b8bc22ff8dea59d0ffd to your computer and use it in GitHub Desktop.

Select an option

Save isocroft/6f282a93cd699b8bc22ff8dea59d0ffd to your computer and use it in GitHub Desktop.
A partially spec-compliant html parser written in python just for fun while reading the WEB BROWSER ENGINEERING book by Pavel & Chris
import sys
import codecs
import requests
from collections import deque
class Element:
def __init__(self, tag, attributes, parentNode):
# if pre-condition(s) return False, AssertionError is raised:
assert type(tag) == str, "`tag` argument is not a string - invalid argument encountered"
assert type(attributes) == dict, "`attributes` argument is not a dictionary - invalid argument encountered"
assert isinstance(parentNode, Element), "`parentNode` argument is not an isntance of `Element` - invalid argument encountered"
self.tag = tag
self.attributes = attributes
self.children = []
self.parentNode = parentNode
def __repr__(self):
return "<" + self.tag + ">"
class TextNode:
def __init__(self, text, parentNode):
# if pre-condition(s) return False, AssertionError is raised:
assert type(text) == str, "`text` argument is not a string - invalid argument encountered"
assert isinstance(parentNode, Element), "`parentNode` argument is not an isntance of `Element` - invalid argument encountered"
self.text = text
self.children = []
self.parentNode = parentNode
def __repr__(self):
return repr(self.text)
class HTMLParser:
SELF_CLOSING_TAGS = [
"keygen", "area", "base", "br", "col", "embed", "hr", "img", "input",
"link", "meta", "param", "source", "track", "wbr"
]
HEAD_TAGS = [
"base", "basefont", "bgsound", "noscript",
"link", "meta", "title", "style", "script",
]
def __init__(self, encoding=None):
if self.encoding is None:
self.encoding = 'utf-8'
elif type(encoding) == str:
self.encoding = encoding
else:
self.encoding = 'utf-8'
self.bytesList = deque(bytearray(20)) # A Byte Queue (size=20)
self.unfinishedNodesList = [] # A Node Stack
# parser flags/sentinels
self.FLAG_IN_ATTR_VALUE = False
self.FLAG_IN_TAG = False
# parser token buffers
self.TOKEN_TEXT = ""
self.TOKEN_TAG_WITH_ATTRS = ""
# if post-condition(s) return False, AssertionError is raised:
assert len(self.bytesList) == 20, "`self.bytesList` is not a byte queue of size 20 - invalid setup encountered"
assert type(self.encoding) == str, "`self.encoding` is not a string - invalid setup encountered"
# assert self.FLAG_IN_TAG == False, "parser should not be collecting tag from input string/bytes at this point (object instantiation) - invalid setup encountered"
# assert self.FLAG_IN_ATTR_VALUE == False, "parser should not be collecting tag attributes from input string/bytes at this point (object instantiation) - invalid setup encountered"
# assert self.TOKEN_TEXT == "", "parser should have any text yet collected at this point (object instantiation) - invalid setup encountered"
# assert self.TOKEN_TAG_WITH_ATTRS == "", "parser should not have any unparsed html attributes at this point (object instantiation) - invalid setup encountered""
# assert len(self.unfinishedNodesList) == 0, "parser should have no more open tags on the stack at this point (object instantiation) - invalid setup encountered"
def add_implicit_tags(self, tag):
# if pre-condition returns False, AssertionError is raised:
assert tag == None or type(tag) == str, "`tag` argument is not a string or 'None' - invalid argument encountered"
while True:
# read in the tag at the top of the stack of unfinished nodes
newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag
if newestUnfinishedTag == None:
if tag not in ["html"]:
self.add_tag("html", {})
elif newestUnfinishedTag == "html" and len(self.unfinishedNodesList) == 1:
if tag not in ["head", "body", "/html"] and tag in self.HEAD_TAGS:
self.add_tag("head", {})
if tag not in ["body", "/body", "/html"] and tag not in self.HEAD_TAGS:
self.add_tag("body", {})
elif newestUnfinishedTag == "head" and len(self.unfinishedNodesList) == 2:
if tag not in ["/head"] and tag not in self.HEAD_TAGS:
self.add_tag("/head", {})
elif newestUnfinishedTag == "body" and len(self.unfinishedNodesList) == 2:
break
# if post-condition(s) return False, AssertionError is raised:
assert len(self.unfinishedNodesList) > 0, "parser: stack of open tags should contain all injected implicit tags - invalid state encountered"
assert (self.unfinishedNodesList[-1]).tag == "body", "parser: stack of open tags should contain the node for <body> tag at the top of the stack - invalid state encountered"
def add_text(self, text):
sizeOfStack = len(self.unfinishedNodesList)
# if pre-condition(s) return False, AssertionError is raised:
assert type(text) == str, "`text` argument is not a string - invalid argument encountered"
assert len(text) > 0, "`text` argument is a string but contains nothing - invalid argument encountered"
if text.isspace():
return False
if len(self.unfinishedNodesList) == 0:
self.add_implicit_tags(None)
parentNode = self.unfinishedNodesList[-1]
childNode = Element("#text", {}, parentNode)
childNode.children.append(
Text(text, childNode)
)
parentNode.children.append(childNode)
# if post-condition returns False, AssertionError is raised:
assert len(self.unfinishedNodesList) == sizeOfStack, "parser: stack of open tags has entered an invalid state - invalid state encountered"
def add_tag(self, tag, attributes):
# if pre-condition(s) return False, AssertionError is raised:
assert type(tag) == str, "`tag` argument is not a string - invalid argument encountered"
assert type(attributes) == dict, "`attributes` argument is not a dictionary - invalid argument encountered"
# Only add implicit tags when the tag to be added is not the <html> tag and the stack of open tags is empty
if len(self.unfinishedNodesList) == 0 and tag != "html":
self.add_implicit_tags(tag)
# Deal with closing tags for non-self closing tags (e.g. head, body, span, div)
if tag.startswith("/"):
if len(self.unfinishedNodesList) == 1:
return False
childNode = self.unfinishedNodesList.pop()
parentNode = self.unfinishedNodesList[-1]
parentNode.children.append(childNode)
return True
elif tag in self.SELF_CLOSING_TAGS:
parentNode = self.unfinishedNodesList[-1]
childNode = Element(tag, attributes, parentNode)
parentNode.children.append(childNode)
return True
else:
parentNode = self.unfinishedNodesList[-1] if len(self.unfinishedNodesList) > 0 else None
childNode = Element(tag, attributes, parentNode)
self.unfinishedNodeList.append(childNode)
return True
def parseSingleByteInTurn (self):
sizeOfQueue = len(self.bytesList)
# if pre-condition returns False, AssertionError is raised:
assert sizeOfQueue > 0, "`parseSingleByteInTurn()` method should not be called without one or more bytes in the queue - invalid callee invocation"
if not self.bytesList:
return False
byteString = self.bytesList.popleft()
unicodeCharString = codecs.decode(byteString, self.encoding)
self.consumeCharacterFromInput(unicodeCharString)
# if post-condition(s) return False, AssertionError is raised:
assert len(self.bytesList) == sizeOfQueue - 1, "parser: queue of bytes has entered an invalid state - invalid state encountered"
return True
def consumeCharacterFromInput (self, charString):
# if pre-condition returns False, AssertionError is raised:
assert type(charString) == str and len(charString) == 1, "`charString` argument is not a character string - invalid argument encountered"
if charString == "<":
self.FLAG_IN_TAG = True
if self.TOKEN_TEXT != "":
text = self.TOKEN_TEXT
self.add_text(text)
self.TOKEN_TEXT = ""
elif charString == ">":
self.FLAG_IN_TAG = False
MAX_BACKTRACK_COUNT = 10
BACKTRACK_COUNT = 0
self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS.lower()
# remove trailing solidus for both self-closing tags and malformed closing tags
if self.TOKEN_TAG_WITH_ATTRS.endswith("/"):
self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS[0:-1]
self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS.strip()
# ignore comment tags and the doctype tag (for now...)
if self.TOKEN_TAG_WITH_ATTRS.startswith("!"):
self.TOKEN_TAG_WITH_ATTRS = ""
return
# read in the tag at the top of the stack of unfinished nodes
newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag
if newestUnfinishedTag == "script" or newestUnfinishedTag == "style":
if not self.TOKEN_TAG_WITH_ATTRS.startswith("/script") or not self.TOKEN_TAG_WITH_ATTRS.startswith("/style"):
self.TOKEN_TAG_WITH_ATTRS = ""
return
tag = ""
subtokens = []
if " " in self.TOKEN_TAG_WITH_ATTRS:
subtokens = self.TOKEN_TAG_WITH_ATTRS.split(None, 1)
tag = subtokens[0]
else:
tag = self.TOKEN_TAG_WITH_ATTRS
if not tag.startswith("/"):
if tag == "li" and not newestUnfinishedTag in ["ul", "ol"]:
self.TOKEN_TAG_WITH_ATTRS = ""
return
if tag == newestUnfinishedTag:
self.add_tag("/" + newestUnfinishedTag, {})
attributes = self.parseTagAttributesString(subtokens[1] if len(subtokens) > 1 else "")
self.add_tag(tag, attributes)
else:
lastParentTagProcessed = ""
tagWithoutSlash = tag[1:]
while newestUnfinishedTag != None and tagWithoutSlash != newestUnfinishedTag:
if BACKTRACK_COUNT >= MAX_BACKTRACK_COUNT:
break
parentNode = self.unfinishedNodesList.pop()
if (self.unfinishedNodesList[-1]).tag != tagWithoutSlash:
self.unfinishedNodesList.append(parentNode)
self.add_tag("/" + parentNode.tag, {})
else:
self.unfinishedNodesList.append(parentNode)
lastParentTagProcessed = parentNode.tag
BACKTRACK_COUNT += 1
newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag
if lastParentTagProcessed != "":
self.add_tag("/" + lastParentTagProcessed, {})
if BACKTRACK_COUNT == 0:
self.add_tag(tag, {})
if lastParentTagProcessed != "":
self.add_tag(lastParentTagProcessed, {})
BACKTRACK_COUNT = 0
self.TOKEN_TAG_WITH_ATTRS = ""
else:
if self.FLAG_IN_TAG:
self.TOKEN_TAG_WITH_ATTRS += charString
else:
if charString.isspace() and self.TOKEN_TEXT != "":
self.TOKEN_TEXT += charString
def parseTagAttributesString (self, attributesString):
attributes = {}
valueCache = ""
nameCache = ""
TOKEN_ATTR_NAME = ""
TOKEN_ATTR_VALUE = ""
TOKEN_ATTR_QUOTE = ""
ESCAPE_SENTINEL_ACTIVE = False
#if pre-condition(s) return False, AssertionError is raised:
assert type(attributesString) == str, "`attributeString` argument is not a string of html markup - invalid argument encountered"
assert self.FLAG_IN_TAG == False, "parser should be done collecting tag related data at this point - invalid state encountered"
if len(attributesString) == 0:
return attributes
for char in attributesString:
if char.isaplha():
if not self.FLAG_IN_ATTR_VALUE:
TOKEN_ATTR_NAME += char
else:
TOKEN_ATTR_VALUE += char
elif char == "=":
nameCache = TOKEN_ATTR_NAME
TOKEN_ATTR_NAME = ""
self.FLAG_IN_ATTR_VALUE = True
elif char == "\\":
ESCAPE_SENTINEL_ACTIVE = True
TOKEN_ATTR_VALUE += char
elif char == "'" or char == "\"":
if not self.FLAG_IN_ATTR_VALUE:
continue
if TOKEN_ATTR_VALUE == "":
TOKEN_ATTR_QUOTE = char
else:
if char != TOKEN_ATTR_QUOTE:
TOKEN_ATTR_VALUE += char
elif ESCAPE_SENTINEL_ACTIVE:
ESCAPE_SENTINEL_ACTIVE = False
TOKEN_ATTR_VALUE += char
else:
valueCache = TOKEN_ATTR_VALUE.strip()
TOKEN_ATTR_VALUE = ""
TOKEN_ATTR_QUOTE = ""
self.FLAG_IN_ATTR_VALUE = False
elif char.isspace():
if self.FLAG_IN_ATTR_VALUE:
if TOKEN_ATTR_VALUE != "":
valueCache = TOKEN_ATTR_VALUE.strip()
TOKEN_ATTR_VALUE = ""
TOKEN_ATTR_QUOTE = ""
self.FLAG_IN_ATTR_VALUE = False
else:
TOKEN_ATTR_VALUE += char
else:
if nameCache != "":
if nameCache in attributes:
continue
attributes[nameCache] = valueCache
nameCache = ""
valueCache = ""
elif TOKEN_ATTR_NAME != "":
if TOKEN_ATTR_NAME in attributes:
continue
attributes[TOKEN_ATTR_NAME] = valueCache
TOKEN_ATTR_NAME = ""
else:
if not self.FLAG_IN_ATTR_VALUE:
continue
else:
TOKEN_ATTR_VALUE += char
if TOKEN_ATTR_VALUE != "":
if nameCache != "":
if nameCache not in attributes:
attributes[nameCache] = TOKEN_ATTR_VALUE
nameCache = ""
TOKEN_ATTR_VALUE = ""
elif TOKEN_ATTR_NAME != "":
if TOKEN_ATTR_NAME not in attributes:
attributes[TOKEN_ATTR_NAME] = TOKEN_ATTR_VALUE
TOKEN_ATTR_NAME = ""
valueCache = ""
# if post-condition(s) return False, AssertionError is raised:
assert nameCache == "", "variables used to cache html attribute names should be returned to their initial state - termination failure encountered"
assert valueCache == "", "variables used to cache html attribute values should be returned to their initial state - termination failure encountered"
assert TOKEN_ATTR_QUOTE == "", "html attributes quote sentinel helper must be cleared after parsing of attributes is completed - termination failure encountered"
assert TOKEN_ATTR_NAME == "", "all html attribute names should be collected - termination failure encountered"
assert TOKEN_ATTR_VALUE == "", "all html attribute values should be collected - termination failure encountered"
assert self.FLAG_IN_TAG == False, "parser should still be done collecting tag related data at this point - invalid state encountered"
return attributes
def parseHTMLString(self, htmlString):
# if pre-condition(s) return False, AssertionError is raised:
assert self.FLAG_IN_TAG == False, "parser should be done collecting tag related data at this point - invalid state encountered"
assert type(htmlString) == str, "`parseHTMLString(...)` method can only parse a string of html markup - invalid argument encountered"
for char in htmlString:
self.consumeCharacterFromInput(char)
# if post-condition(s) return False, AssertionError is raised:
assert self.TOKEN_TAG_WITH_ATTRS == "", "parser should not have any unparsed html attributes at this point (EOF) - invalid state encountered"
assert self.FLAG_IN_TAG == False, "parser should still be done collecting tag related data at this point (EOF) - invalid state encountered"
return self.finish()
def finish(self):
if not self.FLAG_IN_TAG and self.TOKEN_TEXT:
text = self.TOKEN_TEXT
self.add_text(text)
self.TOKEN_TEXT = ""
if len(self.unfinishedNodesList) == 0:
self.add_implicit_tags(None)
while len(self.unfinishedNodesList) > 1:
childNode = self.unfinishedNodesList.pop()
parentNode = self.unfinishedNodesList[-1]
parentNode.children.append(childNode)
rootNode = self.unfinishedNodesList.pop()
#if post-condition(s) return False, AssertionError is raised:
assert self.TOKEN_TEXT == "", "parser should have any text yet collected at this point (EOL) - invalid state encountered"
assert len(self.unfinishedNodesList) == 0, "parser should have no more open tags on the stack at this point (EOL) - invalid state encountered"
return rootNode
def print_tree(node, indent=0):
print(" " * indent, node)
for child in node.children:
print_tree(child, indent + 2)
"""
@USAGE: Fetch the HTML string via a HTTP GET request.
Parse the HTML string and print the tree nodes.
"""
response = requests.get(sys.argv[1])
body = response.text
rootNode = HTMLParser().parseHTMLString(body)
print_tree(rootNode)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment