Skip to content

Instantly share code, notes, and snippets.

@bttger
Created July 28, 2024 00:25
Show Gist options
  • Save bttger/a214012c13d61a98c4e3ebbe5d31ccc6 to your computer and use it in GitHub Desktop.
Save bttger/a214012c13d61a98c4e3ebbe5d31ccc6 to your computer and use it in GitHub Desktop.
Extract comments on Reddit posts
from lxml import etree
def process_elements(element, keep_tags, depth=0):
    """Collect all descendants of *element* whose tag is listed in *keep_tags*.

    The entire subtree is walked in document order. Elements whose tag is
    not in *keep_tags* are skipped themselves, but their children are still
    visited, so a kept element is found even when it sits below wrappers
    that are being dropped.

    Args:
        element: Node to search; any object that iterates over child nodes
            exposing a ``.tag`` attribute.
        keep_tags: Container of tag names to keep.
        depth: Level assigned to kept elements that have no kept ancestor
            below *element*.

    Returns:
        A list of ``(depth, node)`` tuples in document order. ``depth`` is
        the starting *depth* plus the number of kept ancestors of ``node``
        below *element*; skipped wrappers never add a level.
    """
    result = []
    for child in element:
        if child.tag in keep_tags:
            result.append((depth, child))
            child_depth = depth + 1
        else:
            # A dropped wrapper must not introduce a level of its own,
            # otherwise the reported depths stop describing the nesting of
            # the kept elements relative to each other.
            child_depth = depth
        result.extend(process_elements(child, keep_tags, child_depth))
    return result
# Tags that are copied into the output; every other element is left out.
keep_tags = ["p", "shreddit-comment"]

with open("comments.xml", "r", encoding="utf-8") as file:
    xml_content = file.read()

# Make sure the content is wrapped: a single <root> gives the parser one
# document element no matter how many top-level elements the file holds.
wrapped_content = f"<root>{xml_content}</root>"

# recover=True makes lxml try to parse through broken markup.
parser = etree.XMLParser(recover=True, encoding="utf-8")
root = etree.fromstring(wrapped_content.encode("utf-8"), parser=parser)

processed_elements = process_elements(root, keep_tags)

new_root = etree.Element("root")
# Stack of (depth, copied element) pairs describing the currently open
# chain of copies. The sentinel depth -1 means new_root is never popped.
element_stack = [(-1, new_root)]

for depth, element in processed_elements:
    # Close every open copy whose recorded depth is not strictly shallower
    # than this element, so siblings are never nested inside each other
    # even when the depth values skip levels.
    while element_stack[-1][0] >= depth:
        element_stack.pop()
    parent = element_stack[-1][1]

    # Copy tag and attributes; children are re-attached by later iterations.
    new_element = etree.SubElement(
        parent, element.tag, attrib=dict(element.attrib)
    )
    new_element.text = element.text
    new_element.tail = element.tail
    element_stack.append((depth, new_element))

# Write each top-level copy on its own, without the synthetic <root> wrapper
# and without an XML declaration.
with open("output.xml", "wb") as file:
    file.writelines(
        etree.tostring(
            child, encoding="utf-8", xml_declaration=False, pretty_print=True
        )
        for child in new_root
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment