Skip to content

Instantly share code, notes, and snippets.

@EteimZ
Created November 11, 2023 17:44
Show Gist options
  • Select an option

  • Save EteimZ/e68127f619ffc8ca74b022a3e31ead10 to your computer and use it in GitHub Desktop.

Select an option

Save EteimZ/e68127f619ffc8ca74b022a3e31ead10 to your computer and use it in GitHub Desktop.
Experimenting with the DOM in Python
from html.parser import HTMLParser
class SimpleHTMLParser(HTMLParser):
    """Build a nested dict/str tree from HTML markup.

    Every element becomes a dict of the form
    ``{'tag': str, 'attributes': dict, 'content': list}``; ``content`` holds
    child element dicts and whitespace-stripped text strings in document
    order.

    Attributes:
        dom_tree: List of top-level nodes collected so far.  The nodes stay
            in this list after their end tag is seen, so ``dom_tree[0]`` is
            the first top-level node once a document has been fed.
    """

    # Elements that never take an end tag; they must not become the parent
    # of whatever follows them in the document.
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        self.dom_tree = []
        # Stack of elements whose end tag has not been seen yet; the last
        # entry is the parent for newly encountered nodes.
        self._open_elements = []

    def _attach(self, node):
        """Append *node* to the innermost open element, or to the top level."""
        if self._open_elements:
            self._open_elements[-1]['content'].append(node)
        else:
            self.dom_tree.append(node)

    def handle_starttag(self, tag, attrs):
        """Create an element for *tag* and make it the current parent."""
        element = {'tag': tag, 'attributes': dict(attrs), 'content': []}
        self._attach(element)
        if tag not in self._VOID_TAGS:
            self._open_elements.append(element)

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*.

        Elements opened inside it that were left unclosed are closed along
        with it.  An end tag that matches no open element is ignored rather
        than popping an unrelated element (or an empty stack).
        """
        for depth in range(len(self._open_elements) - 1, -1, -1):
            if self._open_elements[depth]['tag'] == tag:
                del self._open_elements[depth:]
                return

    def handle_data(self, data):
        """Record non-blank text, stripped of surrounding whitespace."""
        text = data.strip()
        if text:
            self._attach(text)
def parse_html(html_content):
    """Run *html_content* through a fresh ``SimpleHTMLParser``.

    Returns the first entry of the parser's ``dom_tree`` list after the
    markup has been fed.  Raises ``IndexError`` if that list is empty.
    """
    builder = SimpleHTMLParser()
    builder.feed(html_content)
    nodes = builder.dom_tree
    return nodes[0]
def print_dom_tree(element, indent=0):
    """Print *element* and its descendants, indented by nesting depth.

    A string node is printed as-is after ``indent`` spaces.  A dict node is
    printed as an opening line showing its tag and attribute dict, followed
    by each entry of its ``content`` two spaces deeper, then a closing line.
    Nodes of any other type produce no output.
    """
    # Text node: a single line, nothing to recurse into.
    if isinstance(element, str):
        print(' ' * indent + element)
        return

    # Anything that is neither text nor an element dict is skipped silently.
    if not isinstance(element, dict):
        return

    pad = ' ' * indent
    print(pad + f"<{element['tag']} {element['attributes']}>")
    for child in element['content']:
        print_dom_tree(child, indent + 2)
    print(pad + f"</{element['tag']}>")
# Demo: a small sample document to run through parse_html.
html_content = """
<html>
<head>
<title>Test</title>
</head>
<body>
<h1>Parse me!</h1>
</body>
</html>
"""
# Parse the sample and print whatever node parse_html returns to stdout.
dom_tree = parse_html(html_content)
print_dom_tree(dom_tree)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment