Skip to content

Instantly share code, notes, and snippets.

@jamiejackson
Created November 5, 2015 15:24
Show Gist options
  • Save jamiejackson/a37e8d3dacb33b2dcbc1 to your computer and use it in GitHub Desktop.
Save jamiejackson/a37e8d3dacb33b2dcbc1 to your computer and use it in GitHub Desktop.
Preserve Comments During XML Parsing in Python 2.7+
#!/usr/bin/env python
from xml.etree import ElementTree
class _CommentForwardingTarget(object):
    """Wrap a parser target so XML comments reach it as ``Comment`` elements.

    The XML parser calls ``comment(data)`` on its target when the target
    defines that method.  This wrapper implements it by replaying the comment
    into the wrapped target as a ``start`` / ``data`` / ``end`` sequence whose
    tag is ``ElementTree.Comment``, which is what the serializer recognises
    when writing the tree back out.
    """

    def __init__(self, target):
        self._target = target
        # Number of elements currently open; 0 means we are outside the root.
        self._depth = 0

    def __getattr__(self, name):
        # Only reached for attributes not defined on the wrapper: expose the
        # wrapped target's remaining callbacks (data, close, pi, ...) as-is.
        if name == "_target":
            # Guard against infinite recursion on a half-initialised instance.
            raise AttributeError(name)
        return getattr(self._target, name)

    def start(self, tag, attrs):
        self._depth += 1
        return self._target.start(tag, attrs)

    def end(self, tag):
        self._depth -= 1
        return self._target.end(tag)

    def comment(self, data):
        # A comment before or after the root element has no parent to attach
        # to; replaying it would make the builder treat it as a second
        # top-level element, so it is skipped.
        if not self._depth:
            return
        self._target.start(ElementTree.Comment, {})
        self._target.data(data)
        self._target.end(ElementTree.Comment)


class CommentedTreeBuilder(ElementTree.XMLParser):
    """XML parser that keeps comments located inside the root element.

    Pass an instance as the ``parser`` argument of ``ElementTree.parse``.
    A parser instance consumes a single document; create a new one for each
    document to parse.

    Args:
        html: Accepted for backward compatibility and ignored.
        target: Optional tree-builder-like object receiving the parse
            events.  Defaults to a new ``ElementTree.TreeBuilder``.
        encoding: Optional encoding override handed to the XML parser.
    """

    def __init__(self, html=0, target=None, encoding=None):
        if target is None:
            target = ElementTree.TreeBuilder()
        # Keyword arguments only: the positional layout of XMLParser.__init__
        # differs between Python versions.  No private parser attributes are
        # touched, so this works with the accelerated parser as well.
        ElementTree.XMLParser.__init__(
            self,
            target=_CommentForwardingTarget(target),
            encoding=encoding,
        )
#!/usr/bin/env python
import xml.etree.ElementTree as ET
from CommentedTreeBuilder import CommentedTreeBuilder
# Comment-preserving parser used by the script below.
parser = CommentedTreeBuilder()


def _add_host(tree, host_name, app_base, doc_base):
    """Append a Host element (with one Context child) under ./Service/Engine.

    Args:
        tree: Parsed ElementTree of the server configuration.
        host_name: Value for the Host ``name`` attribute.
        app_base: Value for the Host ``appBase`` attribute.
        doc_base: Value for the Context ``docBase`` attribute.

    Returns:
        The newly created Host element.

    Raises:
        ValueError: If the document has no ./Service/Engine element.
    """
    # get the node to add a child to
    engine_node = tree.find("./Service/Engine")
    if engine_node is None:
        # Without this check SubElement fails with an unrelated TypeError.
        raise ValueError("no ./Service/Engine element found in the document")
    # add a node: Engine.Host
    host_node = ET.SubElement(
        engine_node,
        "Host",
        name=host_name,
        appBase=app_base
    )
    # add a child to new node: Engine.Host.Context
    ET.SubElement(
        host_node,
        'Context',
        path="",
        docBase=doc_base
    )
    return host_node


if __name__ == '__main__':
    filename = "/opt/lucee/tomcat/conf/server.xml"
    # this is the important part: use the comment-preserving parser
    tree = ET.parse(filename, parser)
    _add_host(tree, "local.mysite.com", "webapps", "/path/to/doc/base")
    tree.write('out.xml')
@jamiejackson
Copy link
Author

Got the parser from Amaury Forgeot d'Arc

@Zylann
Copy link

Zylann commented Oct 7, 2016

ElementTree.XMLTreeBuilder doesn't exist anymore in 3.5.2; it looks like it has been split, and the class we have to inherit from may be XMLParser.

class CommentedTreeBuilder ( ET.XMLParser ):
    # XMLParser subclass that installs its own comment handler and replays
    # each comment into the target as a start/data/end sequence tagged with
    # ET.Comment.
    # NOTE(review): the surrounding discussion reports that this variant
    # fails with "AttributeError: 'CommentedTreeBuilder' object has no
    # attribute '_parser'".
    def __init__ ( self, html = 0, target = None, encoding = None ):
        ET.XMLParser.__init__( self, html, target, encoding )
        # Register handle_comment as the comment callback; this relies on the
        # private `_parser` attribute being present on the instance.
        self._parser.CommentHandler = self.handle_comment

    def handle_comment ( self, data ):
        # Forward the comment text to the target as an element whose tag is
        # ET.Comment; relies on the private `_target` attribute.
        self._target.start( ET.Comment, {} )
        self._target.data( data )
        self._target.end( ET.Comment )

However I get an error after trying this out:

    self._parser.CommentHandler = self.handle_comment
AttributeError: 'CommentedTreeBuilder' object has no attribute '_parser'

I don't understand why it does that because I saw ElementTree.py and the property is well defined...
I tried without the underscore, same thing.
Actually, none of the properties from XMLParser are available, and testing print(str(self.entity)) even crashes Python (Windows 10, 64 bits).
I wonder if I did anything wrong?

Edit:
This solved the problem for me.

# http://stackoverflow.com/questions/33573807/faithfully-preserve-comments-in-parsed-xml-python-2-7
class CommentedTreeBuilder(ET.TreeBuilder):
    """TreeBuilder that keeps XML comments found inside the root element.

    Use it as the ``target`` of an ``ET.XMLParser``.  Each comment is added
    to the tree as an element whose tag is ``ET.Comment``, so it is written
    back out when the tree is serialized.  Comments located before or after
    the root element are dropped, because they have no parent element to be
    attached to.
    """

    # Number of elements currently open; 0 means we are outside the root.
    # Stored as a class-level default so no __init__ override is needed.
    _open_elements = 0

    def start(self, tag, attrs):
        """Open an element and record that we are one level deeper."""
        self._open_elements += 1
        return super(CommentedTreeBuilder, self).start(tag, attrs)

    def end(self, tag):
        """Close the current element and record that we are one level up."""
        self._open_elements -= 1
        return super(CommentedTreeBuilder, self).end(tag)

    def comment(self, data):
        """Insert the comment text *data* as an ``ET.Comment`` element."""
        # Replaying a comment outside the root would register it as a
        # top-level element and break the resulting tree.
        if not self._open_elements:
            return
        self.start(ET.Comment, {})
        self.data(data)
        self.end(ET.Comment)

#------------------------------------------------------------------------------
# Kept so that code referring to `cparser` keeps working.  A parser instance
# can only consume one document, so read_xml_file() no longer shares it.
cparser = ET.XMLParser(target = CommentedTreeBuilder())
def read_xml_file(f):
    """Parse *f* with a comment-preserving parser and return the ElementTree.

    A new parser and tree builder are created on every call, so the function
    can be called for any number of documents.
    """
    return ET.parse(f, parser=ET.XMLParser(target=CommentedTreeBuilder()))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment