Created
February 6, 2012 19:30
-
-
Save dvdbng/1754258 to your computer and use it in GitHub Desktop.
Prune a DOM tree, keeping its structure and the nodes for which a function is true
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.cElementTree as etree | |
def traverseEtree(tree, traverser):
    """Walk ``tree`` depth-first, reporting every element to ``traverser``.

    For each element, in document order, the following calls are made:
    ``traverser.start(elm)``, ``traverser.data(elm.text)``, then the same
    sequence for every child, then ``traverser.end(elm)`` and finally
    ``traverser.data(elm.tail)``.

    The walk uses an explicit stack instead of recursion so that deeply
    nested trees do not hit the interpreter's recursion limit.
    """
    exhausted = object()  # sentinel: no child left on the current level

    traverser.start(tree)
    traverser.data(tree.text)
    # Each entry pairs an opened element with the iterator over its children.
    pending = [(tree, iter(tree))]
    while pending:
        node, children = pending[-1]
        child = next(children, exhausted)
        if child is exhausted:
            # All children handled: close the element, then report its tail.
            pending.pop()
            traverser.end(node)
            traverser.data(node.tail)
        else:
            traverser.start(child)
            traverser.data(child.text)
            pending.append((child, iter(child)))
class Traverser(object):
    """Build a pruned copy of an element tree from traversal events.

    Every element for which ``fn`` returns true is copied (tag and
    attributes) into a new tree, together with the chain of ancestors that
    leads to it; all other elements are dropped.  Text and tail data are
    ignored.

    Feed the instance the events of a depth-first walk (``start``, ``data``,
    ``end``) and call ``close`` once at the end to obtain the pruned root.
    """

    def __init__(self, fn):
        """Create a traverser keeping the elements for which ``fn(elm)`` is true."""
        self.stack = []      # path from the root to the element being visited
        self.res = etree.TreeBuilder()
        self.last = None     # most recently kept element; None until a match
        self.lastStack = []  # copy of ``stack`` taken when ``last`` was kept
        self.fn = fn

    def start(self, elm):
        """Enter ``elm``; copy it into the result if the predicate accepts it."""
        self.stack.append(elm)
        if self.fn(elm):
            self.addCurrent(elm)

    def end(self, elm):
        """Leave the element most recently entered."""
        self.stack.pop()

    def getCommonAncestor(self):
        """Return the deepest ancestor of the current element that is also on
        the path to the previously kept element, or ``None`` if there is none.
        """
        # Proper ancestors of the current element, innermost first.
        for ancestor in reversed(self.stack[:-1]):
            if any(ancestor is seen for seen in self.lastStack):
                return ancestor
        return None

    def addCurrent(self, elm):
        """Emit the builder events needed to place ``elm`` in the result tree.

        The first kept element opens its whole ancestor path.  Later ones
        close the previously opened path up to (not including) the common
        ancestor and then open the current path below that ancestor.
        """
        if self.last is None:
            for node in self.stack:
                self._open(node)
        else:
            ancestor = self.getCommonAncestor()
            # Close what was opened for the previous kept element, innermost
            # first, stopping at the shared ancestor (which stays open).
            for opened in reversed(self.lastStack):
                if opened is ancestor:
                    break
                self.res.end(opened.tag)
            # Open the part of the current path that lies below the ancestor.
            below_ancestor = False
            for node in self.stack:
                if below_ancestor:
                    self._open(node)
                elif node is ancestor:
                    below_ancestor = True
        self.last = elm
        self.lastStack = list(self.stack)

    def _open(self, node):
        """Start a copy of ``node`` in the result builder."""
        # Pass a fresh dict so the pruned tree cannot end up sharing its
        # attribute mapping with the source element.
        self.res.start(node.tag, dict(node.attrib))

    def close(self):
        """Close every element still open in the builder and return the root.

        Returns ``None`` when no element was kept.
        """
        if self.last is None:
            return None
        # ``stack`` has already been emptied by the ``end`` events at this
        # point; the elements still open in the builder are those on the
        # path to the last kept element.
        for opened in reversed(self.lastStack):
            self.res.end(opened.tag)
        return self.res.close()

    def data(self, data):
        """Ignore text and tail content; only element structure is copied."""
        pass
def pruneTree(tree, fn):
    """Return a pruned copy of ``tree`` built from the elements accepted by ``fn``.

    A ``Traverser`` driven by ``traverseEtree`` visits every element of
    ``tree``; the value returned is whatever ``Traverser.close()`` yields
    once the walk has finished.
    """
    pruner = Traverser(fn)
    traverseEtree(tree, pruner)
    pruned_root = pruner.close()
    return pruned_root
if __name__ == "__main__":
    # Small demonstration: prune a sample document down to its <keep> nodes.
    tree = etree.fromstring("""
<a>
<b>
<c>
<d/>
<d/>
</c>
<d><keep/></d>
</b>
<keep/>
<b>
<b>
<keep>
<a></a>
</keep>
</b>
<keep/>
<c/>
</b>
</a>
""")
    # This will return a minimal tree with the same structure that contains
    # all the nodes with tag name "keep".  Shown indented here for
    # readability; text nodes (including whitespace) are not copied:
    # <a>
    #   <b>
    #     <d><keep /></d>
    #   </b>
    #   <keep />
    #   <b>
    #     <b>
    #       <keep />
    #     </b>
    #     <keep />
    #   </b>
    # </a>
    pruned = pruneTree(tree, lambda e: e.tag == "keep")
    # print() with a single argument is valid on both Python 2 and 3; the
    # original ``print x`` statement is a syntax error on Python 3.  Decode
    # so Python 3 shows the markup itself instead of a bytes repr.
    print(etree.tostring(pruned).decode("ascii"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Manejar los nodos de texto se deja como un ejercicio para el lector.