isocroft · November 17, 2025 12:55
diff --git a/htmlparser.py b/htmlparser.py
 import sys
 import codecs
 import requests

 from collections import deque


 class Element:
    def __init__(self, tag, attributes, parentNode):
      	# if pre-condition(s) return False, AssertionError is raised:
     	assert type(tag) == str, "`tag` argument is not a string - invalid argument encountered"
     	assert type(attributes) == dict, "`attributes` argument is not a dictionary - invalid argument encountered"
     	assert isinstance(parentNode, Element), "`parentNode` argument is not an isntance of `Element` - invalid argument encountered"

        self.tag = tag
        self.attributes = attributes
        self.children = []
        self.parentNode = parentNode
    
    def __repr__(self):
        return "<" + self.tag + ">"


 class TextNode:
    def __init__(self, text, parentNode):
    	# if pre-condition(s) return False, AssertionError is raised:
      	assert type(text) == str, "`text` argument is not a string - invalid argument encountered"
      	assert isinstance(parentNode, Element), "`parentNode` argument is not an isntance of `Element` - invalid argument encountered"

        self.text = text
        self.children = []
        self.parentNode = parentNode
    
    def __repr__(self):
        return repr(self.text)




 class HTMLParser:
    SELF_CLOSING_TAGS = [
    	"keygen", "area", "base", "br", "col", "embed", "hr", "img", "input",
    	"link", "meta", "param", "source", "track", "wbr"
    ]
    HEAD_TAGS = [
        "base", "basefont", "bgsound", "noscript",
        "link", "meta", "title", "style", "script",
    ]

    def __init__(self, encoding=None):
 		if self.encoding is None:
 			self.encoding = 'utf-8'
 		elif type(encoding) == str:
 			self.encoding = encoding
 		else:
 			self.encoding = 'utf-8'

 			self.bytesList = deque(bytearray(20)) # A Byte Queue (size=20)
 			self.unfinishedNodesList = [] # A Node Stack

 		# parser flags/sentinels
 			self.FLAG_IN_ATTR_VALUE = False 
 			self.FLAG_IN_TAG = False

 		# parser token buffers
 			self.TOKEN_TEXT = ""
 			self.TOKEN_TAG_WITH_ATTRS = ""

 		# if post-condition(s) return False, AssertionError is raised:
 		assert len(self.bytesList) == 20, "`self.bytesList` is not a byte queue of size 20 - invalid setup encountered"
 		assert type(self.encoding) == str, "`self.encoding` is not a string - invalid setup encountered"
 		# assert self.FLAG_IN_TAG == False, "parser should not be collecting tag from input string/bytes at this point (object instantiation) - invalid setup encountered"
 		# assert self.FLAG_IN_ATTR_VALUE == False, "parser should not be collecting tag attributes from input string/bytes at this point (object instantiation) - invalid setup encountered"
 		# assert self.TOKEN_TEXT == "", "parser should have any text yet collected at this point (object instantiation) - invalid setup encountered"
 		# assert self.TOKEN_TAG_WITH_ATTRS == "", "parser should not have any unparsed html attributes at this point (object instantiation) - invalid setup encountered""
 		# assert len(self.unfinishedNodesList) == 0, "parser should have no more open tags on the stack at this point (object instantiation) - invalid setup encountered"

    def add_implicit_tags(self, tag):

 		# if pre-condition returns False, AssertionError is raised:
 		assert tag == None or type(tag) == str, "`tag` argument is not a string or 'None' - invalid argument encountered"

        while True:
            # read in the tag at the top of the stack of unfinished nodes      
            newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag

            if newestUnfinishedTag == None:
 				if tag not in ["html"]:
 	    	    	self.add_tag("html", {})
            elif newestUnfinishedTag == "html" and len(self.unfinishedNodesList) == 1:
 	    		if tag not in ["head", "body", "/html"] and tag in self.HEAD_TAGS:
 		    		self.add_tag("head", {})
 	    		if tag not in ["body", "/body", "/html"] and tag not in self.HEAD_TAGS:
 		    		self.add_tag("body", {})
            elif newestUnfinishedTag == "head" and len(self.unfinishedNodesList) == 2:
 	        	if tag not in ["/head"] and tag not in self.HEAD_TAGS:
                    self.add_tag("/head", {})
            elif newestUnfinishedTag == "body" and len(self.unfinishedNodesList) == 2:
 	    		break

 		# if post-condition(s) return False, AssertionError is raised:
 		assert len(self.unfinishedNodesList) > 0, "parser: stack of open tags should contain all injected implicit tags - invalid state encountered"
 		assert (self.unfinishedNodesList[-1]).tag == "body", "parser: stack of open tags should contain the node for <body> tag at the top of the stack - invalid state encountered"

    def add_text(self, text):
 		sizeOfStack = len(self.unfinishedNodesList)

 		# if pre-condition(s) return False, AssertionError is raised:
        assert type(text) == str, "`text` argument is not a string - invalid argument encountered"
 		assert len(text) > 0, "`text` argument is a string but contains nothing - invalid argument encountered"
 	
 		if text.isspace():
 			return False

 		if len(self.unfinishedNodesList) == 0:
 				self.add_implicit_tags(None)

 			parentNode = self.unfinishedNodesList[-1]
 			childNode = Element("#text", {}, parentNode)
 			childNode.children.append(
 				Text(text, childNode)
 			)
 			parentNode.children.append(childNode)

 		# if post-condition returns False, AssertionError is raised:
 		assert len(self.unfinishedNodesList) == sizeOfStack, "parser: stack of open tags has entered an invalid state - invalid state encountered"

    def add_tag(self, tag, attributes):
 	
 		# if pre-condition(s) return False, AssertionError is raised:
 		assert type(tag) == str, "`tag` argument is not a string - invalid argument encountered"
 		assert type(attributes) == dict, "`attributes` argument is not a dictionary - invalid argument encountered"

 		# Only add implicit tags when the tag to be added is not the <html> tag and the stack of open tags is empty
 		if len(self.unfinishedNodesList) == 0 and tag != "html":
 			   self.add_implicit_tags(tag)

 		# Deal with closing tags for non-self closing tags (e.g. head, body, span, div)
        if tag.startswith("/"):
            if len(self.unfinishedNodesList) == 1:
                return False

            childNode = self.unfinishedNodesList.pop()
            parentNode = self.unfinishedNodesList[-1]
            parentNode.children.append(childNode)
 	    	return True
        elif tag in self.SELF_CLOSING_TAGS:
            parentNode = self.unfinishedNodesList[-1]
            childNode = Element(tag, attributes, parentNode)
            parentNode.children.append(childNode)
 	    	return True
        else:
            parentNode = self.unfinishedNodesList[-1] if len(self.unfinishedNodesList) > 0 else None
            childNode = Element(tag, attributes, parentNode)
            self.unfinishedNodeList.append(childNode)
 	    	return True

    def parseSingleByteInTurn (self):
 		sizeOfQueue = len(self.bytesList)

 		# if pre-condition returns False, AssertionError is raised:
 		assert sizeOfQueue > 0, "`parseSingleByteInTurn()` method should not be called without one or more bytes in the queue - invalid callee invocation"

 		if not self.bytesList:
 	    	return False

    	byteString = self.bytesList.popleft()

    	unicodeCharString = codecs.decode(byteString, self.encoding)
    	self.consumeCharacterFromInput(unicodeCharString)
 	
 		# if post-condition(s) return False, AssertionError is raised:
 		assert len(self.bytesList) == sizeOfQueue - 1, "parser: queue of bytes has entered an invalid state - invalid state encountered"

 		return True

    def consumeCharacterFromInput (self, charString):
 	
 		# if pre-condition returns False, AssertionError is raised:
 		assert type(charString) == str and len(charString) == 1, "`charString` argument is not a character string - invalid argument encountered"

     	if charString == "<":
            self.FLAG_IN_TAG = True

            if self.TOKEN_TEXT != "":
 	        	text = self.TOKEN_TEXT
 				self.add_text(text)
                self.TOKEN_TEXT = ""
        	elif charString == ">":
            	self.FLAG_IN_TAG = False

 			MAX_BACKTRACK_COUNT = 10
 	    	BACKTRACK_COUNT = 0

 	    	self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS.lower()

            # remove trailing solidus for both self-closing tags and malformed closing tags
            if self.TOKEN_TAG_WITH_ATTRS.endswith("/"):
                self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS[0:-1]
 		    
            self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS.strip()

            # ignore comment tags and the doctype tag (for now...)
 	    	if self.TOKEN_TAG_WITH_ATTRS.startswith("!"):
 				self.TOKEN_TAG_WITH_ATTRS = ""
 				return

            # read in the tag at the top of the stack of unfinished nodes      
            newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag

            if newestUnfinishedTag == "script" or newestUnfinishedTag == "style":
                if not self.TOKEN_TAG_WITH_ATTRS.startswith("/script") or not self.TOKEN_TAG_WITH_ATTRS.startswith("/style"):
                    self.TOKEN_TAG_WITH_ATTRS = ""
                    return

 			tag = ""
 			subtokens = [] 

 			if " " in self.TOKEN_TAG_WITH_ATTRS:
 				subtokens = self.TOKEN_TAG_WITH_ATTRS.split(None, 1)
 				tag = subtokens[0]
 			else:
 				tag = self.TOKEN_TAG_WITH_ATTRS


 			if not tag.startswith("/"):
 				if tag == "li" and not newestUnfinishedTag in ["ul", "ol"]:
 				self.TOKEN_TAG_WITH_ATTRS = ""
 				return
 				if tag == newestUnfinishedTag:
 				self.add_tag("/" + newestUnfinishedTag, {})

 				attributes = self.parseTagAttributesString(subtokens[1] if len(subtokens) > 1 else "")
 				self.add_tag(tag, attributes)   
 			else:
 				lastParentTagProcessed = ""
 				tagWithoutSlash = tag[1:]

 				while newestUnfinishedTag != None and tagWithoutSlash != newestUnfinishedTag:
 		   		    if BACKTRACK_COUNT >= MAX_BACKTRACK_COUNT:
 	    	       		break

 				   parentNode = self.unfinishedNodesList.pop()

 				   if (self.unfinishedNodesList[-1]).tag != tagWithoutSlash:
 					   self.unfinishedNodesList.append(parentNode)
 					   self.add_tag("/" + parentNode.tag, {})
 				   else:
 					   self.unfinishedNodesList.append(parentNode)
 					   lastParentTagProcessed = parentNode.tag

 				   BACKTRACK_COUNT += 1
 				   newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag



 					if lastParentTagProcessed != "":
 						self.add_tag("/" + lastParentTagProcessed, {})

 					if BACKTRACK_COUNT == 0:
 						self.add_tag(tag, {})

 					if lastParentTagProcessed != "":
 						self.add_tag(lastParentTagProcessed, {})

 				BACKTRACK_COUNT = 0
 				self.TOKEN_TAG_WITH_ATTRS = ""
        else:
            if self.FLAG_IN_TAG:
                self.TOKEN_TAG_WITH_ATTRS += charString
            else:
 				if charString.isspace() and self.TOKEN_TEXT != "":
                    self.TOKEN_TEXT += charString
 
    def parseTagAttributesString (self, attributesString):
        attributes = {}
        
        valueCache = ""
        nameCache = ""

        TOKEN_ATTR_NAME = ""
        TOKEN_ATTR_VALUE = ""
        TOKEN_ATTR_QUOTE = ""

        ESCAPE_SENTINEL_ACTIVE = False
            
        #if pre-condition(s) return False, AssertionError is raised:
        assert type(attributesString) == str, "`attributeString` argument is not a string of html markup - invalid argument encountered"
        assert self.FLAG_IN_TAG == False, "parser should be done collecting tag related data at this point - invalid state encountered"

        if len(attributesString) == 0:
            return attributes

        for char in attributesString:
            if char.isaplha():
                if not self.FLAG_IN_ATTR_VALUE:
                    TOKEN_ATTR_NAME += char
                else:
                    TOKEN_ATTR_VALUE += char
            elif char == "=":
                nameCache = TOKEN_ATTR_NAME
                TOKEN_ATTR_NAME = ""
                self.FLAG_IN_ATTR_VALUE	= True
            elif char == "\\":
                ESCAPE_SENTINEL_ACTIVE = True
                TOKEN_ATTR_VALUE += char
            elif char == "'" or char == "\"":
                if not self.FLAG_IN_ATTR_VALUE:
                    continue

                if TOKEN_ATTR_VALUE == "":
                    TOKEN_ATTR_QUOTE = char
                else:
                    if char != TOKEN_ATTR_QUOTE:
                        TOKEN_ATTR_VALUE += char
 		    elif ESCAPE_SENTINEL_ACTIVE:
                        ESCAPE_SENTINEL_ACTIVE = False
                        TOKEN_ATTR_VALUE += char
                    else:
                        valueCache = TOKEN_ATTR_VALUE.strip()
                        TOKEN_ATTR_VALUE = ""
                        TOKEN_ATTR_QUOTE = ""
                        
                        self.FLAG_IN_ATTR_VALUE = False           
            elif char.isspace():
                if self.FLAG_IN_ATTR_VALUE:
                    if TOKEN_ATTR_VALUE != "":
                        valueCache = TOKEN_ATTR_VALUE.strip()
                        TOKEN_ATTR_VALUE = ""
                        TOKEN_ATTR_QUOTE = ""
                            
                        self.FLAG_IN_ATTR_VALUE = False
                    else:
                        TOKEN_ATTR_VALUE += char
                else:
                    if nameCache != "":
                        if nameCache in attributes:
                            continue

                        attributes[nameCache] = valueCache
                        nameCache = ""
                        valueCache = ""
                    elif TOKEN_ATTR_NAME != "":
                        if TOKEN_ATTR_NAME in attributes:
                            continue

                        attributes[TOKEN_ATTR_NAME] = valueCache
                        TOKEN_ATTR_NAME = ""
            else:
                if not self.FLAG_IN_ATTR_VALUE:
                    continue
                else:
                    TOKEN_ATTR_VALUE += char
        

        if TOKEN_ATTR_VALUE != "":
            if nameCache != "":
                if nameCache not in attributes:
                    attributes[nameCache] = TOKEN_ATTR_VALUE

                nameCache = ""
                TOKEN_ATTR_VALUE = ""
            elif TOKEN_ATTR_NAME != "":
                if TOKEN_ATTR_NAME not in attributes:
                    attributes[TOKEN_ATTR_NAME] = TOKEN_ATTR_VALUE

 		TOKEN_ATTR_NAME = ""

    	valueCache = ""

        # if post-condition(s) return False, AssertionError is raised:
    	assert nameCache == "", "variables used to cache html attribute names should be returned to their initial state - termination failure encountered"
    	assert valueCache == "", "variables used to cache html attribute values should be returned to their initial state - termination failure encountered"
    	assert TOKEN_ATTR_QUOTE == "", "html attributes quote sentinel helper must be cleared after parsing of attributes is completed - termination failure encountered"
        assert TOKEN_ATTR_NAME == "", "all html attribute names should be collected - termination failure encountered"
        assert TOKEN_ATTR_VALUE == "", "all html attribute values should be collected - termination failure encountered"
        assert self.FLAG_IN_TAG == False, "parser should still be done collecting tag related data at this point - invalid state encountered"

        return attributes

    def parseHTMLString(self, htmlString):
        # if pre-condition(s) return False, AssertionError is raised:
        assert self.FLAG_IN_TAG == False, "parser should be done collecting tag related data at this point - invalid state encountered"
        assert type(htmlString) == str, "`parseHTMLString(...)` method can only parse a string of html markup - invalid argument encountered"

        for char in htmlString:
            self.consumeCharacterFromInput(char)

    	# if post-condition(s) return False, AssertionError is raised:
    	assert self.TOKEN_TAG_WITH_ATTRS == "", "parser should not have any unparsed html attributes at this point (EOF) - invalid state encountered"
        assert self.FLAG_IN_TAG == False, "parser should still be done collecting tag related data at this point (EOF) - invalid state encountered"

        return self.finish()

    def finish(self):
        if not self.FLAG_IN_TAG and self.TOKEN_TEXT:
            text = self.TOKEN_TEXT
            self.add_text(text)
            self.TOKEN_TEXT = ""

        if len(self.unfinishedNodesList) == 0:
            self.add_implicit_tags(None)

            while len(self.unfinishedNodesList) > 1:
                childNode = self.unfinishedNodesList.pop()
                parentNode = self.unfinishedNodesList[-1]
                parentNode.children.append(childNode)

        rootNode = self.unfinishedNodesList.pop()
 	
    	#if post-condition(s) return False, AssertionError is raised:
    	assert self.TOKEN_TEXT == "", "parser should have any text yet collected at this point (EOL) - invalid state encountered"
    	assert len(self.unfinishedNodesList) == 0, "parser should have no more open tags on the stack at this point (EOL) - invalid state encountered"
 	
    	return rootNode




 def print_tree(node, indent=0):
    print(" " * indent, node)

    for child in node.children:
        print_tree(child, indent + 2)



 """
 	@USAGE: Fetch the HTML string via a HTTP GET request.
 			Parse the HTML string and print the tree nodes.
 """
 response = requests.get(sys.argv[1])
 body = response.text
 rootNode = HTMLParser().parseHTMLString(body)

 print_tree(rootNode)
	import sys
	import codecs
	import requests

	from collections import deque


	class Element:
	def __init__(self, tag, attributes, parentNode):
	# if pre-condition(s) return False, AssertionError is raised:
	assert type(tag) == str, "`tag` argument is not a string - invalid argument encountered"
	assert type(attributes) == dict, "`attributes` argument is not a dictionary - invalid argument encountered"
	assert isinstance(parentNode, Element), "`parentNode` argument is not an isntance of `Element` - invalid argument encountered"

	self.tag = tag
	self.attributes = attributes
	self.children = []
	self.parentNode = parentNode

	def __repr__(self):
	return "<" + self.tag + ">"


	class TextNode:
	def __init__(self, text, parentNode):
	# if pre-condition(s) return False, AssertionError is raised:
	assert type(text) == str, "`text` argument is not a string - invalid argument encountered"
	assert isinstance(parentNode, Element), "`parentNode` argument is not an isntance of `Element` - invalid argument encountered"

	self.text = text
	self.children = []
	self.parentNode = parentNode

	def __repr__(self):
	return repr(self.text)




	class HTMLParser:
	SELF_CLOSING_TAGS = [
	"keygen", "area", "base", "br", "col", "embed", "hr", "img", "input",
	"link", "meta", "param", "source", "track", "wbr"
	]
	HEAD_TAGS = [
	"base", "basefont", "bgsound", "noscript",
	"link", "meta", "title", "style", "script",
	]

	def __init__(self, encoding=None):
	if self.encoding is None:
	self.encoding = 'utf-8'
	elif type(encoding) == str:
	self.encoding = encoding
	else:
	self.encoding = 'utf-8'

	self.bytesList = deque(bytearray(20)) # A Byte Queue (size=20)
	self.unfinishedNodesList = [] # A Node Stack

	# parser flags/sentinels
	self.FLAG_IN_ATTR_VALUE = False
	self.FLAG_IN_TAG = False

	# parser token buffers
	self.TOKEN_TEXT = ""
	self.TOKEN_TAG_WITH_ATTRS = ""

	# if post-condition(s) return False, AssertionError is raised:
	assert len(self.bytesList) == 20, "`self.bytesList` is not a byte queue of size 20 - invalid setup encountered"
	assert type(self.encoding) == str, "`self.encoding` is not a string - invalid setup encountered"
	# assert self.FLAG_IN_TAG == False, "parser should not be collecting tag from input string/bytes at this point (object instantiation) - invalid setup encountered"
	# assert self.FLAG_IN_ATTR_VALUE == False, "parser should not be collecting tag attributes from input string/bytes at this point (object instantiation) - invalid setup encountered"
	# assert self.TOKEN_TEXT == "", "parser should have any text yet collected at this point (object instantiation) - invalid setup encountered"
	# assert self.TOKEN_TAG_WITH_ATTRS == "", "parser should not have any unparsed html attributes at this point (object instantiation) - invalid setup encountered""
	# assert len(self.unfinishedNodesList) == 0, "parser should have no more open tags on the stack at this point (object instantiation) - invalid setup encountered"

	def add_implicit_tags(self, tag):

	# if pre-condition returns False, AssertionError is raised:
	assert tag == None or type(tag) == str, "`tag` argument is not a string or 'None' - invalid argument encountered"

	while True:
	# read in the tag at the top of the stack of unfinished nodes
	newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag

	if newestUnfinishedTag == None:
	if tag not in ["html"]:
	self.add_tag("html", {})
	elif newestUnfinishedTag == "html" and len(self.unfinishedNodesList) == 1:
	if tag not in ["head", "body", "/html"] and tag in self.HEAD_TAGS:
	self.add_tag("head", {})
	if tag not in ["body", "/body", "/html"] and tag not in self.HEAD_TAGS:
	self.add_tag("body", {})
	elif newestUnfinishedTag == "head" and len(self.unfinishedNodesList) == 2:
	if tag not in ["/head"] and tag not in self.HEAD_TAGS:
	self.add_tag("/head", {})
	elif newestUnfinishedTag == "body" and len(self.unfinishedNodesList) == 2:
	break

	# if post-condition(s) return False, AssertionError is raised:
	assert len(self.unfinishedNodesList) > 0, "parser: stack of open tags should contain all injected implicit tags - invalid state encountered"
	assert (self.unfinishedNodesList[-1]).tag == "body", "parser: stack of open tags should contain the node for <body> tag at the top of the stack - invalid state encountered"

	def add_text(self, text):
	sizeOfStack = len(self.unfinishedNodesList)

	# if pre-condition(s) return False, AssertionError is raised:
	assert type(text) == str, "`text` argument is not a string - invalid argument encountered"
	assert len(text) > 0, "`text` argument is a string but contains nothing - invalid argument encountered"

	if text.isspace():
	return False

	if len(self.unfinishedNodesList) == 0:
	self.add_implicit_tags(None)

	parentNode = self.unfinishedNodesList[-1]
	childNode = Element("#text", {}, parentNode)
	childNode.children.append(
	Text(text, childNode)
	)
	parentNode.children.append(childNode)

	# if post-condition returns False, AssertionError is raised:
	assert len(self.unfinishedNodesList) == sizeOfStack, "parser: stack of open tags has entered an invalid state - invalid state encountered"

	def add_tag(self, tag, attributes):

	# if pre-condition(s) return False, AssertionError is raised:
	assert type(tag) == str, "`tag` argument is not a string - invalid argument encountered"
	assert type(attributes) == dict, "`attributes` argument is not a dictionary - invalid argument encountered"

	# Only add implicit tags when the tag to be added is not the <html> tag and the stack of open tags is empty
	if len(self.unfinishedNodesList) == 0 and tag != "html":
	self.add_implicit_tags(tag)

	# Deal with closing tags for non-self closing tags (e.g. head, body, span, div)
	if tag.startswith("/"):
	if len(self.unfinishedNodesList) == 1:
	return False

	childNode = self.unfinishedNodesList.pop()
	parentNode = self.unfinishedNodesList[-1]
	parentNode.children.append(childNode)
	return True
	elif tag in self.SELF_CLOSING_TAGS:
	parentNode = self.unfinishedNodesList[-1]
	childNode = Element(tag, attributes, parentNode)
	parentNode.children.append(childNode)
	return True
	else:
	parentNode = self.unfinishedNodesList[-1] if len(self.unfinishedNodesList) > 0 else None
	childNode = Element(tag, attributes, parentNode)
	self.unfinishedNodeList.append(childNode)
	return True

	def parseSingleByteInTurn (self):
	sizeOfQueue = len(self.bytesList)

	# if pre-condition returns False, AssertionError is raised:
	assert sizeOfQueue > 0, "`parseSingleByteInTurn()` method should not be called without one or more bytes in the queue - invalid callee invocation"

	if not self.bytesList:
	return False

	byteString = self.bytesList.popleft()

	unicodeCharString = codecs.decode(byteString, self.encoding)
	self.consumeCharacterFromInput(unicodeCharString)

	# if post-condition(s) return False, AssertionError is raised:
	assert len(self.bytesList) == sizeOfQueue - 1, "parser: queue of bytes has entered an invalid state - invalid state encountered"

	return True

	def consumeCharacterFromInput (self, charString):

	# if pre-condition returns False, AssertionError is raised:
	assert type(charString) == str and len(charString) == 1, "`charString` argument is not a character string - invalid argument encountered"

	if charString == "<":
	self.FLAG_IN_TAG = True

	if self.TOKEN_TEXT != "":
	text = self.TOKEN_TEXT
	self.add_text(text)
	self.TOKEN_TEXT = ""
	elif charString == ">":
	self.FLAG_IN_TAG = False

	MAX_BACKTRACK_COUNT = 10
	BACKTRACK_COUNT = 0

	self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS.lower()

	# remove trailing solidus for both self-closing tags and malformed closing tags
	if self.TOKEN_TAG_WITH_ATTRS.endswith("/"):
	self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS[0:-1]

	self.TOKEN_TAG_WITH_ATTRS = self.TOKEN_TAG_WITH_ATTRS.strip()

	# ignore comment tags and the doctype tag (for now...)
	if self.TOKEN_TAG_WITH_ATTRS.startswith("!"):
	self.TOKEN_TAG_WITH_ATTRS = ""
	return

	# read in the tag at the top of the stack of unfinished nodes
	newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag

	if newestUnfinishedTag == "script" or newestUnfinishedTag == "style":
	if not self.TOKEN_TAG_WITH_ATTRS.startswith("/script") or not self.TOKEN_TAG_WITH_ATTRS.startswith("/style"):
	self.TOKEN_TAG_WITH_ATTRS = ""
	return

	tag = ""
	subtokens = []

	if " " in self.TOKEN_TAG_WITH_ATTRS:
	subtokens = self.TOKEN_TAG_WITH_ATTRS.split(None, 1)
	tag = subtokens[0]
	else:
	tag = self.TOKEN_TAG_WITH_ATTRS


	if not tag.startswith("/"):
	if tag == "li" and not newestUnfinishedTag in ["ul", "ol"]:
	self.TOKEN_TAG_WITH_ATTRS = ""
	return
	if tag == newestUnfinishedTag:
	self.add_tag("/" + newestUnfinishedTag, {})

	attributes = self.parseTagAttributesString(subtokens[1] if len(subtokens) > 1 else "")
	self.add_tag(tag, attributes)
	else:
	lastParentTagProcessed = ""
	tagWithoutSlash = tag[1:]

	while newestUnfinishedTag != None and tagWithoutSlash != newestUnfinishedTag:
	if BACKTRACK_COUNT >= MAX_BACKTRACK_COUNT:
	break

	parentNode = self.unfinishedNodesList.pop()

	if (self.unfinishedNodesList[-1]).tag != tagWithoutSlash:
	self.unfinishedNodesList.append(parentNode)
	self.add_tag("/" + parentNode.tag, {})
	else:
	self.unfinishedNodesList.append(parentNode)
	lastParentTagProcessed = parentNode.tag

	BACKTRACK_COUNT += 1
	newestUnfinishedTag = None if len(self.unfinishedNodesList) == 0 else (self.unfinishedNodesList[-1]).tag



	if lastParentTagProcessed != "":
	self.add_tag("/" + lastParentTagProcessed, {})

	if BACKTRACK_COUNT == 0:
	self.add_tag(tag, {})

	if lastParentTagProcessed != "":
	self.add_tag(lastParentTagProcessed, {})

	BACKTRACK_COUNT = 0
	self.TOKEN_TAG_WITH_ATTRS = ""
	else:
	if self.FLAG_IN_TAG:
	self.TOKEN_TAG_WITH_ATTRS += charString
	else:
	if charString.isspace() and self.TOKEN_TEXT != "":
	self.TOKEN_TEXT += charString

	def parseTagAttributesString (self, attributesString):
	attributes = {}

	valueCache = ""
	nameCache = ""

	TOKEN_ATTR_NAME = ""
	TOKEN_ATTR_VALUE = ""
	TOKEN_ATTR_QUOTE = ""

	ESCAPE_SENTINEL_ACTIVE = False

	#if pre-condition(s) return False, AssertionError is raised:
	assert type(attributesString) == str, "`attributeString` argument is not a string of html markup - invalid argument encountered"
	assert self.FLAG_IN_TAG == False, "parser should be done collecting tag related data at this point - invalid state encountered"

	if len(attributesString) == 0:
	return attributes

	for char in attributesString:
	if char.isaplha():
	if not self.FLAG_IN_ATTR_VALUE:
	TOKEN_ATTR_NAME += char
	else:
	TOKEN_ATTR_VALUE += char
	elif char == "=":
	nameCache = TOKEN_ATTR_NAME
	TOKEN_ATTR_NAME = ""
	self.FLAG_IN_ATTR_VALUE = True
	elif char == "\\":
	ESCAPE_SENTINEL_ACTIVE = True
	TOKEN_ATTR_VALUE += char
	elif char == "'" or char == "\"":
	if not self.FLAG_IN_ATTR_VALUE:
	continue

	if TOKEN_ATTR_VALUE == "":
	TOKEN_ATTR_QUOTE = char
	else:
	if char != TOKEN_ATTR_QUOTE:
	TOKEN_ATTR_VALUE += char
	elif ESCAPE_SENTINEL_ACTIVE:
	ESCAPE_SENTINEL_ACTIVE = False
	TOKEN_ATTR_VALUE += char
	else:
	valueCache = TOKEN_ATTR_VALUE.strip()
	TOKEN_ATTR_VALUE = ""
	TOKEN_ATTR_QUOTE = ""

	self.FLAG_IN_ATTR_VALUE = False
	elif char.isspace():
	if self.FLAG_IN_ATTR_VALUE:
	if TOKEN_ATTR_VALUE != "":
	valueCache = TOKEN_ATTR_VALUE.strip()
	TOKEN_ATTR_VALUE = ""
	TOKEN_ATTR_QUOTE = ""

	self.FLAG_IN_ATTR_VALUE = False
	else:
	TOKEN_ATTR_VALUE += char
	else:
	if nameCache != "":
	if nameCache in attributes:
	continue

	attributes[nameCache] = valueCache
	nameCache = ""
	valueCache = ""
	elif TOKEN_ATTR_NAME != "":
	if TOKEN_ATTR_NAME in attributes:
	continue

	attributes[TOKEN_ATTR_NAME] = valueCache
	TOKEN_ATTR_NAME = ""
	else:
	if not self.FLAG_IN_ATTR_VALUE:
	continue
	else:
	TOKEN_ATTR_VALUE += char


	if TOKEN_ATTR_VALUE != "":
	if nameCache != "":
	if nameCache not in attributes:
	attributes[nameCache] = TOKEN_ATTR_VALUE

	nameCache = ""
	TOKEN_ATTR_VALUE = ""
	elif TOKEN_ATTR_NAME != "":
	if TOKEN_ATTR_NAME not in attributes:
	attributes[TOKEN_ATTR_NAME] = TOKEN_ATTR_VALUE

	TOKEN_ATTR_NAME = ""

	valueCache = ""

	# if post-condition(s) return False, AssertionError is raised:
	assert nameCache == "", "variables used to cache html attribute names should be returned to their initial state - termination failure encountered"
	assert valueCache == "", "variables used to cache html attribute values should be returned to their initial state - termination failure encountered"
	assert TOKEN_ATTR_QUOTE == "", "html attributes quote sentinel helper must be cleared after parsing of attributes is completed - termination failure encountered"
	assert TOKEN_ATTR_NAME == "", "all html attribute names should be collected - termination failure encountered"
	assert TOKEN_ATTR_VALUE == "", "all html attribute values should be collected - termination failure encountered"
	assert self.FLAG_IN_TAG == False, "parser should still be done collecting tag related data at this point - invalid state encountered"

	return attributes

	def parseHTMLString(self, htmlString):
	# if pre-condition(s) return False, AssertionError is raised:
	assert self.FLAG_IN_TAG == False, "parser should be done collecting tag related data at this point - invalid state encountered"
	assert type(htmlString) == str, "`parseHTMLString(...)` method can only parse a string of html markup - invalid argument encountered"

	for char in htmlString:
	self.consumeCharacterFromInput(char)

	# if post-condition(s) return False, AssertionError is raised:
	assert self.TOKEN_TAG_WITH_ATTRS == "", "parser should not have any unparsed html attributes at this point (EOF) - invalid state encountered"
	assert self.FLAG_IN_TAG == False, "parser should still be done collecting tag related data at this point (EOF) - invalid state encountered"

	return self.finish()

	def finish(self):
	if not self.FLAG_IN_TAG and self.TOKEN_TEXT:
	text = self.TOKEN_TEXT
	self.add_text(text)
	self.TOKEN_TEXT = ""

	if len(self.unfinishedNodesList) == 0:
	self.add_implicit_tags(None)

	while len(self.unfinishedNodesList) > 1:
	childNode = self.unfinishedNodesList.pop()
	parentNode = self.unfinishedNodesList[-1]
	parentNode.children.append(childNode)

	rootNode = self.unfinishedNodesList.pop()

	#if post-condition(s) return False, AssertionError is raised:
	assert self.TOKEN_TEXT == "", "parser should have any text yet collected at this point (EOL) - invalid state encountered"
	assert len(self.unfinishedNodesList) == 0, "parser should have no more open tags on the stack at this point (EOL) - invalid state encountered"

	return rootNode




	def print_tree(node, indent=0):
	print(" " * indent, node)

	for child in node.children:
	print_tree(child, indent + 2)



	"""
	@USAGE: Fetch the HTML string via a HTTP GET request.
	Parse the HTML string and print the tree nodes.
	"""
	response = requests.get(sys.argv[1])
	body = response.text
	rootNode = HTMLParser().parseHTMLString(body)

	print_tree(rootNode)
No results found