Skip to content

Instantly share code, notes, and snippets.

@chapmanjacobd
Created April 19, 2024 09:45
Show Gist options
  • Select an option

  • Save chapmanjacobd/6cdb6b242e91527c3d5cd87f90d9f03d to your computer and use it in GitHub Desktop.

Select an option

Save chapmanjacobd/6cdb6b242e91527c3d5cd87f90d9f03d to your computer and use it in GitHub Desktop.
class HtmlList:
    """Build a nested list-of-dicts outline of an HTML element tree.

    Each recorded node is a dict with at least ``name``, ``depth`` and
    ``children`` keys; nodes added below the top level also carry ``path``
    and, when present on the element, ``text``/``title``/``href``/``src``/
    ``alt``.  The result accumulates in ``self.tree``.

    ``self.xpath`` is scratch state: it holds the ancestor names of the
    element currently being processed, nearest parent first.

    NOTE(review): ancestors are located by tag *name* and the first match
    wins, so descendants of same-named siblings (e.g. two ``div`` children
    of one parent) are all attached under the first such sibling.
    NOTE(review): ``element`` and ``extract_text`` are expected to be
    imported elsewhere in the module (presumably ``bs4.element`` and a
    project helper) -- confirm.
    """

    # Tags that are never recorded as nodes.
    _SKIPPED_TAGS = ('script', 'style')
    # Attributes copied verbatim onto a node when the element has them.
    _COPIED_ATTRS = ('title', 'href', 'src', 'alt')

    def __init__(self):
        self.tree = []
        self.xpath = []

    def resolve_parents(self, el) -> list:
        """Collect the names of all ancestors of *el* into ``self.xpath``.

        The list is reset first, so repeated calls never accumulate stale
        entries from a previous element.

        Args:
            el: Any object exposing ``parent`` (``None`` at the root); each
                ancestor must expose ``name``.

        Returns:
            ``self.xpath`` itself (not a copy), ordered from the nearest
            parent up to the root.
        """
        # Clear in place so the list object callers may hold stays valid.
        self.xpath.clear()
        ancestor = el.parent
        while ancestor is not None:
            self.xpath.append(ancestor.name)
            ancestor = ancestor.parent
        return self.xpath

    def add_children(self, el, depth):
        """Record *el* and, recursively, every tag beneath it.

        Objects that are not ``element.Tag`` instances are ignored.

        Args:
            el: Element to record.
            depth: Depth value stored on the node created for *el*; each
                level of children is recorded with ``depth + 1``.
        """
        if not isinstance(el, element.Tag):
            return

        # resolve_parents yields nearest-parent-first; the tree is walked
        # root-first, hence the reversal.
        path_parts = list(reversed(self.resolve_parents(el)))
        self.build_tree_node(self.tree, path_parts, el, depth)
        self.xpath.clear()

        for child in el.children:
            self.add_children(child, depth + 1)

    def build_tree_node(self, sub_tree, path_parts, el: "element.Tag", depth):
        """Insert a node for *el* beneath the ancestor chain *path_parts*.

        Reads ``self.xpath`` (which must already describe *el*'s ancestors)
        both to detect top-level elements and to build the node's ``path``.

        Args:
            sub_tree: List of node dicts to search at this level.
            path_parts: Ancestor names still to descend through, root first.
            el: Element being recorded.
            depth: Depth value stored on the new node.
        """
        # NOTE(review): debug tracing; consider switching to ``logging``.
        print('sub_tree', sub_tree)

        if len(self.xpath) <= 1:  # top-level: at most one ancestor
            self.tree.append(
                {
                    "name": "[document]",
                    "depth": 0,
                    "children": [{"name": el.name, "depth": 1, 'children': []}],
                }
            )
            return

        path_part, remaining_path_parts = path_parts[0], path_parts[1:]
        # First node at this level whose name matches the next ancestor.
        leaf = next((child for child in sub_tree if child['name'] == path_part), None)
        if leaf is None:
            # Ancestor was never recorded (e.g. it was skipped): drop *el*.
            return

        leaf_children = leaf['children']
        if remaining_path_parts:
            self.build_tree_node(leaf_children, remaining_path_parts, el, depth)
            return

        # Last path part reached: ``leaf`` is the direct parent of *el*.
        if el.name in self._SKIPPED_TAGS:
            return

        data = {
            "name": el.name,
            "children": [],
            "depth": depth,
            "path": ">".join(reversed(self.xpath)),
        }
        # Only elements with a direct string child get a ``text`` entry.
        if any(isinstance(child, element.NavigableString) for child in el.children):
            data['text'] = extract_text.un_paragraph(el)
        for attr in self._COPIED_ATTRS:
            if attr in el.attrs:
                data[attr] = el.attrs[attr]

        print('append', data)
        leaf_children.append(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment