Last active
November 17, 2025 12:55
-
-
Save isocroft/6f282a93cd699b8bc22ff8dea59d0ffd to your computer and use it in GitHub Desktop.
A partially spec-compliant html parser written in python just for fun while reading the WEB BROWSER ENGINEERING book by Pavel & Chris
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import codecs | |
| import requests | |
| from collections import deque | |
| class Element: | |
| def __init__(self, tag, attributes, parentNode): | |
| # if pre-condition(s) return False, AssertionError is raised: | |
| assert type(tag) == str, "`tag` argument is not a string - invalid argument encountered" | |
| assert type(attributes) == dict, "`attributes` argument is not a dictionary - invalid argument encountered" | |
| assert isinstance(parentNode, Element), "`parentNode` argument is not an isntance of `Element` - invalid argument encountered" | |
| self.tag = tag | |
| self.attributes = attributes | |
| self.children = [] | |
| self.parentNode = parentNode | |
| def __repr__(self): | |
| return "<" + self.tag + ">" | |
| class TextNode: | |
| def __init__(self, text, parentNode): | |
| # if pre-condition(s) return False, AssertionError is raised: | |
| assert type(text) == str, "`text` argument is not a string - invalid argument encountered" | |
| assert isinstance(parentNode, Element), "`parentNode` argument is not an isntance of `Element` - invalid argument encountered" | |
| self.text = text | |
| self.children = [] | |
| self.parentNode = parentNode | |
| def __repr__(self): | |
| return repr(self.text) | |
| class HTMLParser: | |
| SELF_CLOSING_TAGS = [ | |
| "keygen", "area", "base", "br", "col", "embed", "hr", "img", "input", | |
| "link", "meta", "param", "source", "track", "wbr" | |
| ] | |
| HEAD_TAGS = [ | |
| "base", "basefont", "bgsound", "noscript", | |
| "link", "meta", "title", "style", "script", | |
| ] | |
| def __init__(self, encoding=None): | |
| if self.encoding is None: | |
| self.encoding = 'utf-8' | |
| elif type(encoding) == str: | |
| self.encoding = encoding | |
| else: | |
| self.encoding = 'utf-8' | |
| self.bytesList = deque(bytearray(20)) # A Byte Queue (size=20) | |
| self.unfinishedNodesList = [] # A Node Stack | |
| # parser flags/sentinels | |
| self.FLAG_IN_ATTR_VALUE = False | |
| self.FLAG_IN_TAG = False | |
| # parser token buffers | |
| self.TOKEN_TEXT = "" | |
| self.TOKEN_TAG_WITH_ATTRS = "" | |
| # if post-condition(s) return False, AssertionError is raised: | |
| assert len(self.bytesList) == 20, "`self.bytesList` is not a byte queue of size 20 - invalid setup encountered" | |
| assert type(self.encoding) == str, "`self.encoding` is not a string - invalid setup encountered" | |
| # assert self.FLAG_IN_TAG == False, "parser should not be collecting tag from input string/bytes at this point (object instantiation) - invalid setup encountered" | |
| # assert self.FLAG_IN_ATTR_VALUE == False, "parser should not be collecting tag attributes from input string/bytes at this point (object instantiation) - invalid setup encountered" | |
| # assert self.TOKEN_TEXT == "", "parser should have any text yet collected at this point (object instantiation) - invalid setup encountered" | |
| # assert self.TOKEN_TAG_WITH_ATTRS == "", "parser should not have any unparsed html attributes at this point (object instantiation) - invalid setup encountered"" | |
| # assert len(self.unfinishedNodesList) == 0, "parser should have no more open tags on the stack at this point (object instantiation) - invalid setup encountered" | |
| def add_implicit_tags(self, tag): | |
| # if pre-condition returns False, AssertionError is raised: | |
| assert tag == None or type(tag) == str, "`tag` argument is not a string or 'None' - invalid argument encountered" | |
| while True: | |
| # read in the tag at the top of the stack of unfinished nodes | |
| newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag | |
| if newestUnfinishedTag == None: | |
| if tag not in ["html"]: | |
| self.add_tag("html", {}) | |
| elif newestUnfinishedTag == "html" and len(self.unfinishedNodesList) == 1: | |
| if tag not in ["head", "body", "/html"] and tag in self.HEAD_TAGS: | |
| self.add_tag("head", {}) | |
| if tag not in ["body", "/body", "/html"] and tag not in self.HEAD_TAGS: | |
| self.add_tag("body", {}) | |
| elif newestUnfinishedTag == "head" and len(self.unfinishedNodesList) == 2: | |
| if tag not in ["/head"] and tag not in self.HEAD_TAGS: | |
| self.add_tag("/head", {}) | |
| elif newestUnfinishedTag == "body" and len(self.unfinishedNodesList) == 2: | |
| break | |
| # if post-condition(s) return False, AssertionError is raised: | |
| assert len(self.unfinishedNodesList) > 0, "parser: stack of open tags should contain all injected implicit tags - invalid state encountered" | |
| assert (self.unfinishedNodesList[-1]).tag == "body", "parser: stack of open tags should contain the node for <body> tag at the top of the stack - invalid state encountered" | |
| def add_text(self, text): | |
| sizeOfStack = len(self.unfinishedNodesList) | |
| # if pre-condition(s) return False, AssertionError is raised: | |
| assert type(text) == str, "`text` argument is not a string - invalid argument encountered" | |
| assert len(text) > 0, "`text` argument is a string but contains nothing - invalid argument encountered" | |
| if text.isspace(): | |
| return False | |
| if len(self.unfinishedNodesList) == 0: | |
| self.add_implicit_tags(None) | |
| parentNode = self.unfinishedNodesList[-1] | |
| childNode = Element("#text", {}, parentNode) | |
| childNode.children.append( | |
| Text(text, childNode) | |
| ) | |
| parentNode.children.append(childNode) | |
| # if post-condition returns False, AssertionError is raised: | |
| assert len(self.unfinishedNodesList) == sizeOfStack, "parser: stack of open tags has entered an invalid state - invalid state encountered" | |
| def add_tag(self, tag, attributes): | |
| # if pre-condition(s) return False, AssertionError is raised: | |
| assert type(tag) == str, "`tag` argument is not a string - invalid argument encountered" | |
| assert type(attributes) == dict, "`attributes` argument is not a dictionary - invalid argument encountered" | |
| # Only add implicit tags when the tag to be added is not the <html> tag and the stack of open tags is empty | |
| if len(self.unfinishedNodesList) == 0 and tag != "html": | |
| self.add_implicit_tags(tag) | |
| # Deal with closing tags for non-self closing tags (e.g. head, body, span, div) | |
| if tag.startswith("/"): | |
| if len(self.unfinishedNodesList) == 1: | |
| return False | |
| childNode = self.unfinishedNodesList.pop() | |
| parentNode = self.unfinishedNodesList[-1] | |
| parentNode.children.append(childNode) | |
| return True | |
| elif tag in self.SELF_CLOSING_TAGS: | |
| parentNode = self.unfinishedNodesList[-1] | |
| childNode = Element(tag, attributes, parentNode) | |
| parentNode.children.append(childNode) | |
| return True | |
| else: | |
| parentNode = self.unfinishedNodesList[-1] if len(self.unfinishedNodesList) > 0 else None | |
| childNode = Element(tag, attributes, parentNode) | |
| self.unfinishedNodeList.append(childNode) | |
| return True | |
| def parseSingleByteInTurn (self): | |
| sizeOfQueue = len(self.bytesList) | |
| # if pre-condition returns False, AssertionError is raised: | |
| assert sizeOfQueue > 0, "`parseSingleByteInTurn()` method should not be called without one or more bytes in the queue - invalid callee invocation" | |
| if not self.bytesList: | |
| return False | |
| byteString = self.bytesList.popleft() | |
| unicodeCharString = codecs.decode(byteString, self.encoding) | |
| self.consumeCharacterFromInput(unicodeCharString) | |
| # if post-condition(s) return False, AssertionError is raised: | |
| assert len(self.bytesList) == sizeOfQueue - 1, "parser: queue of bytes has entered an invalid state - invalid state encountered" | |
| return True | |
| def consumeCharacterFromInput (self, charString): | |
| # if pre-condition returns False, AssertionError is raised: | |
| assert type(charString) == str and len(charString) == 1, "`charString` argument is not a character string - invalid argument encountered" | |
| if charString == "<": | |
| self.FLAG_IN_TAG = True | |
| if self.TOKEN_TEXT != "": | |
| text = self.TOKEN_TEXT | |
| self.add_text(text) | |
| self.TOKEN_TEXT = "" | |
| elif charString == ">": | |
| self.FLAG_IN_TAG = False | |
| MAX_BACKTRACK_COUNT = 10 | |
| BACKTRACK_COUNT = 0 | |
| self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS.lower() | |
| # remove trailing solidus for both self-closing tags and malformed closing tags | |
| if self.TOKEN_TAG_WITH_ATTRS.endswith("/"): | |
| self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS[0:-1] | |
| self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS.strip() | |
| # ignore comment tags and the doctype tag (for now...) | |
| if self.TOKEN_TAG_WITH_ATTRS.startswith("!"): | |
| self.TOKEN_TAG_WITH_ATTRS = "" | |
| return | |
| # read in the tag at the top of the stack of unfinished nodes | |
| newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag | |
| if newestUnfinishedTag == "script" or newestUnfinishedTag == "style": | |
| if not self.TOKEN_TAG_WITH_ATTRS.startswith("/script") or not self.TOKEN_TAG_WITH_ATTRS.startswith("/style"): | |
| self.TOKEN_TAG_WITH_ATTRS = "" | |
| return | |
| tag = "" | |
| subtokens = [] | |
| if " " in self.TOKEN_TAG_WITH_ATTRS: | |
| subtokens = self.TOKEN_TAG_WITH_ATTRS.split(None, 1) | |
| tag = subtokens[0] | |
| else: | |
| tag = self.TOKEN_TAG_WITH_ATTRS | |
| if not tag.startswith("/"): | |
| if tag == "li" and not newestUnfinishedTag in ["ul", "ol"]: | |
| self.TOKEN_TAG_WITH_ATTRS = "" | |
| return | |
| if tag == newestUnfinishedTag: | |
| self.add_tag("/" + newestUnfinishedTag, {}) | |
| attributes = self.parseTagAttributesString(subtokens[1] if len(subtokens) > 1 else "") | |
| self.add_tag(tag, attributes) | |
| else: | |
| lastParentTagProcessed = "" | |
| tagWithoutSlash = tag[1:] | |
| while newestUnfinishedTag != None and tagWithoutSlash != newestUnfinishedTag: | |
| if BACKTRACK_COUNT >= MAX_BACKTRACK_COUNT: | |
| break | |
| parentNode = self.unfinishedNodesList.pop() | |
| if (self.unfinishedNodesList[-1]).tag != tagWithoutSlash: | |
| self.unfinishedNodesList.append(parentNode) | |
| self.add_tag("/" + parentNode.tag, {}) | |
| else: | |
| self.unfinishedNodesList.append(parentNode) | |
| lastParentTagProcessed = parentNode.tag | |
| BACKTRACK_COUNT += 1 | |
| newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag | |
| if lastParentTagProcessed != "": | |
| self.add_tag("/" + lastParentTagProcessed, {}) | |
| if BACKTRACK_COUNT == 0: | |
| self.add_tag(tag, {}) | |
| if lastParentTagProcessed != "": | |
| self.add_tag(lastParentTagProcessed, {}) | |
| BACKTRACK_COUNT = 0 | |
| self.TOKEN_TAG_WITH_ATTRS = "" | |
| else: | |
| if self.FLAG_IN_TAG: | |
| self.TOKEN_TAG_WITH_ATTRS += charString | |
| else: | |
| if charString.isspace() and self.TOKEN_TEXT != "": | |
| self.TOKEN_TEXT += charString | |
| def parseTagAttributesString (self, attributesString): | |
| attributes = {} | |
| valueCache = "" | |
| nameCache = "" | |
| TOKEN_ATTR_NAME = "" | |
| TOKEN_ATTR_VALUE = "" | |
| TOKEN_ATTR_QUOTE = "" | |
| ESCAPE_SENTINEL_ACTIVE = False | |
| #if pre-condition(s) return False, AssertionError is raised: | |
| assert type(attributesString) == str, "`attributeString` argument is not a string of html markup - invalid argument encountered" | |
| assert self.FLAG_IN_TAG == False, "parser should be done collecting tag related data at this point - invalid state encountered" | |
| if len(attributesString) == 0: | |
| return attributes | |
| for char in attributesString: | |
| if char.isaplha(): | |
| if not self.FLAG_IN_ATTR_VALUE: | |
| TOKEN_ATTR_NAME += char | |
| else: | |
| TOKEN_ATTR_VALUE += char | |
| elif char == "=": | |
| nameCache = TOKEN_ATTR_NAME | |
| TOKEN_ATTR_NAME = "" | |
| self.FLAG_IN_ATTR_VALUE = True | |
| elif char == "\\": | |
| ESCAPE_SENTINEL_ACTIVE = True | |
| TOKEN_ATTR_VALUE += char | |
| elif char == "'" or char == "\"": | |
| if not self.FLAG_IN_ATTR_VALUE: | |
| continue | |
| if TOKEN_ATTR_VALUE == "": | |
| TOKEN_ATTR_QUOTE = char | |
| else: | |
| if char != TOKEN_ATTR_QUOTE: | |
| TOKEN_ATTR_VALUE += char | |
| elif ESCAPE_SENTINEL_ACTIVE: | |
| ESCAPE_SENTINEL_ACTIVE = False | |
| TOKEN_ATTR_VALUE += char | |
| else: | |
| valueCache = TOKEN_ATTR_VALUE.strip() | |
| TOKEN_ATTR_VALUE = "" | |
| TOKEN_ATTR_QUOTE = "" | |
| self.FLAG_IN_ATTR_VALUE = False | |
| elif char.isspace(): | |
| if self.FLAG_IN_ATTR_VALUE: | |
| if TOKEN_ATTR_VALUE != "": | |
| valueCache = TOKEN_ATTR_VALUE.strip() | |
| TOKEN_ATTR_VALUE = "" | |
| TOKEN_ATTR_QUOTE = "" | |
| self.FLAG_IN_ATTR_VALUE = False | |
| else: | |
| TOKEN_ATTR_VALUE += char | |
| else: | |
| if nameCache != "": | |
| if nameCache in attributes: | |
| continue | |
| attributes[nameCache] = valueCache | |
| nameCache = "" | |
| valueCache = "" | |
| elif TOKEN_ATTR_NAME != "": | |
| if TOKEN_ATTR_NAME in attributes: | |
| continue | |
| attributes[TOKEN_ATTR_NAME] = valueCache | |
| TOKEN_ATTR_NAME = "" | |
| else: | |
| if not self.FLAG_IN_ATTR_VALUE: | |
| continue | |
| else: | |
| TOKEN_ATTR_VALUE += char | |
| if TOKEN_ATTR_VALUE != "": | |
| if nameCache != "": | |
| if nameCache not in attributes: | |
| attributes[nameCache] = TOKEN_ATTR_VALUE | |
| nameCache = "" | |
| TOKEN_ATTR_VALUE = "" | |
| elif TOKEN_ATTR_NAME != "": | |
| if TOKEN_ATTR_NAME not in attributes: | |
| attributes[TOKEN_ATTR_NAME] = TOKEN_ATTR_VALUE | |
| TOKEN_ATTR_NAME = "" | |
| valueCache = "" | |
| # if post-condition(s) return False, AssertionError is raised: | |
| assert nameCache == "", "variables used to cache html attribute names should be returned to their initial state - termination failure encountered" | |
| assert valueCache == "", "variables used to cache html attribute values should be returned to their initial state - termination failure encountered" | |
| assert TOKEN_ATTR_QUOTE == "", "html attributes quote sentinel helper must be cleared after parsing of attributes is completed - termination failure encountered" | |
| assert TOKEN_ATTR_NAME == "", "all html attribute names should be collected - termination failure encountered" | |
| assert TOKEN_ATTR_VALUE == "", "all html attribute values should be collected - termination failure encountered" | |
| assert self.FLAG_IN_TAG == False, "parser should still be done collecting tag related data at this point - invalid state encountered" | |
| return attributes | |
| def parseHTMLString(self, htmlString): | |
| # if pre-condition(s) return False, AssertionError is raised: | |
| assert self.FLAG_IN_TAG == False, "parser should be done collecting tag related data at this point - invalid state encountered" | |
| assert type(htmlString) == str, "`parseHTMLString(...)` method can only parse a string of html markup - invalid argument encountered" | |
| for char in htmlString: | |
| self.consumeCharacterFromInput(char) | |
| # if post-condition(s) return False, AssertionError is raised: | |
| assert self.TOKEN_TAG_WITH_ATTRS == "", "parser should not have any unparsed html attributes at this point (EOF) - invalid state encountered" | |
| assert self.FLAG_IN_TAG == False, "parser should still be done collecting tag related data at this point (EOF) - invalid state encountered" | |
| return self.finish() | |
| def finish(self): | |
| if not self.FLAG_IN_TAG and self.TOKEN_TEXT: | |
| text = self.TOKEN_TEXT | |
| self.add_text(text) | |
| self.TOKEN_TEXT = "" | |
| if len(self.unfinishedNodesList) == 0: | |
| self.add_implicit_tags(None) | |
| while len(self.unfinishedNodesList) > 1: | |
| childNode = self.unfinishedNodesList.pop() | |
| parentNode = self.unfinishedNodesList[-1] | |
| parentNode.children.append(childNode) | |
| rootNode = self.unfinishedNodesList.pop() | |
| #if post-condition(s) return False, AssertionError is raised: | |
| assert self.TOKEN_TEXT == "", "parser should have any text yet collected at this point (EOL) - invalid state encountered" | |
| assert len(self.unfinishedNodesList) == 0, "parser should have no more open tags on the stack at this point (EOL) - invalid state encountered" | |
| return rootNode | |
| def print_tree(node, indent=0): | |
| print(" " * indent, node) | |
| for child in node.children: | |
| print_tree(child, indent + 2) | |
| """ | |
| @USAGE: Fetch the HTML string via a HTTP GET request. | |
| Parse the HTML string and print the tree nodes. | |
| """ | |
| response = requests.get(sys.argv[1]) | |
| body = response.text | |
| rootNode = HTMLParser().parseHTMLString(body) | |
| print_tree(rootNode) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment