bttger · July 28, 2024 00:25
diff --git a/extract_reddit_comments.py b/extract_reddit_comments.py
 from lxml import etree


 def process_elements(element, keep_tags, depth=0):
    """Collect every descendant of ``element`` whose tag is in ``keep_tags``.

    The tree below ``element`` is walked in document (pre-order) order.
    Elements whose tag is not in ``keep_tags`` are not reported, but their
    children are still visited.

    Args:
        element: Iterable node whose children expose a ``tag`` attribute.
        keep_tags: Container of tag names to report.
        depth: Depth value assigned to the direct children of ``element``;
            each further nesting level adds one.

    Returns:
        A list of ``(depth, element)`` tuples. ``depth`` is the nesting level
        in the tree being walked, so it also counts levels occupied by
        elements that were not reported.
    """
    collected = []

    def visit(node, level):
        # Report a match before descending so parents precede their children.
        for sub in node:
            if sub.tag in keep_tags:
                collected.append((level, sub))
            visit(sub, level + 1)

    visit(element, depth)
    return collected


 # Tags to retain in the output. Every other element is dropped and the kept
 # elements below it are re-attached to their nearest kept ancestor.
 keep_tags = ["p", "shreddit-comment"]

 with open("comments.xml", "r", encoding="utf-8") as file:
    xml_content = file.read()

 # Make sure the content is wrapped: the input may contain several top-level
 # elements, so a single synthetic root is put around it before parsing.
 wrapped_content = f"<root>{xml_content}</root>"

 # recover=True asks lxml to keep parsing through broken markup.
 parser = etree.XMLParser(recover=True, encoding="utf-8")
 root = etree.fromstring(wrapped_content.encode("utf-8"), parser=parser)

 processed_elements = process_elements(root, keep_tags)

 new_root = etree.Element("root")
 # element_stack[i] is the most recently created output element at output
 # depth i - 1; element_stack[0] is the synthetic output root.
 element_stack = [new_root]

 for _source_depth, element in processed_elements:
    # The depth reported by process_elements counts dropped wrapper elements
    # too, so it cannot be compared against the stack of *kept* elements:
    # doing so nests kept siblings inside each other. Use the number of kept
    # ancestors as the depth in the output tree instead.
    depth = sum(
        1 for ancestor in element.iterancestors() if ancestor.tag in keep_tags
    )

    # Unwind to the output element that corresponds to the nearest kept
    # ancestor (or to the synthetic root when there is none).
    while len(element_stack) > depth + 1:
        element_stack.pop()

    parent = element_stack[-1]
    new_element = etree.SubElement(parent, element.tag)
    new_element.text = element.text
    new_element.tail = element.tail

    for key, value in element.attrib.items():
        new_element.set(key, value)

    element_stack.append(new_element)

 with open("output.xml", "wb") as file:
    # Serialize each top-level kept element on its own; the synthetic root
    # wrapper itself is not written.
    for child in new_root:
        file.write(
            etree.tostring(
                child, encoding="utf-8", xml_declaration=False, pretty_print=True
            )
        )
	from lxml import etree


	def process_elements(element, keep_tags, depth=0):
	    """Collect every descendant of ``element`` whose tag is in ``keep_tags``.

	    The tree below ``element`` is walked in document (pre-order) order.
	    Elements whose tag is not in ``keep_tags`` are not reported, but their
	    children are still visited.

	    Args:
	        element: Iterable node whose children expose a ``tag`` attribute.
	        keep_tags: Container of tag names to report.
	        depth: Depth value assigned to the direct children of ``element``;
	            each further nesting level adds one.

	    Returns:
	        A list of ``(depth, element)`` tuples. ``depth`` is the nesting
	        level in the tree being walked, so it also counts levels occupied
	        by elements that were not reported.
	    """
	    result = []
	    for child in element:
	        # Report a match before descending so parents precede children.
	        if child.tag in keep_tags:
	            result.append((depth, child))
	        result.extend(process_elements(child, keep_tags, depth + 1))
	    return result


	# Tags to retain in the output. Every other element is dropped and the kept
	# elements below it are re-attached to their nearest kept ancestor.
	keep_tags = ["p", "shreddit-comment"]

	with open("comments.xml", "r", encoding="utf-8") as file:
	    xml_content = file.read()

	# Make sure the content is wrapped: the input may contain several top-level
	# elements, so a single synthetic root is put around it before parsing.
	wrapped_content = f"<root>{xml_content}</root>"

	# recover=True asks lxml to keep parsing through broken markup.
	parser = etree.XMLParser(recover=True, encoding="utf-8")
	root = etree.fromstring(wrapped_content.encode("utf-8"), parser=parser)

	processed_elements = process_elements(root, keep_tags)

	new_root = etree.Element("root")
	# element_stack[i] is the most recently created output element at output
	# depth i - 1; element_stack[0] is the synthetic output root.
	element_stack = [new_root]

	for _source_depth, element in processed_elements:
	    # The depth reported by process_elements counts dropped wrapper
	    # elements too, so it cannot be compared against the stack of *kept*
	    # elements: doing so nests kept siblings inside each other. Use the
	    # number of kept ancestors as the depth in the output tree instead.
	    depth = sum(
	        1 for ancestor in element.iterancestors() if ancestor.tag in keep_tags
	    )

	    # Unwind to the output element that corresponds to the nearest kept
	    # ancestor (or to the synthetic root when there is none).
	    while len(element_stack) > depth + 1:
	        element_stack.pop()

	    parent = element_stack[-1]
	    new_element = etree.SubElement(parent, element.tag)
	    new_element.text = element.text
	    new_element.tail = element.tail

	    for key, value in element.attrib.items():
	        new_element.set(key, value)

	    element_stack.append(new_element)

	with open("output.xml", "wb") as file:
	    # Serialize each top-level kept element on its own; the synthetic root
	    # wrapper itself is not written.
	    for child in new_root:
	        file.write(
	            etree.tostring(
	                child, encoding="utf-8", xml_declaration=False, pretty_print=True
	            )
	        )