Created
March 9, 2012 21:50
-
-
Save jmoiron/2008908 to your computer and use it in GitHub Desktop.
lxml Cleaner with attribute whitelist
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from lxml.html.clean import Cleaner | |
from lxml.html import defs | |
class AttrWhitelistCleaner(Cleaner):
    """lxml ``Cleaner`` variant that filters attributes by an explicit whitelist.

    Pass the permitted attribute names through the ``attr_whitelist``
    keyword argument. When it is omitted, a fresh ``set`` built from
    ``lxml.html.defs.safe_attrs`` is used. Every other keyword argument is
    forwarded unchanged to ``Cleaner.__init__``.
    """

    def __init__(self, **kw):
        # Remove our own option from ``kw`` before delegating, so the base
        # constructor only receives the keyword arguments meant for it.
        default_whitelist = set(defs.safe_attrs)
        self.attr_whitelist = kw.pop('attr_whitelist', default_whitelist)
        super(AttrWhitelistCleaner, self).__init__(**kw)

    def __call__(self, doc):
        """Clean ``doc`` in place, then drop every non-whitelisted attribute.

        ``doc`` may be an element or any object exposing ``getroot()``
        (in which case the root it returns is the one that gets walked).
        Nothing is returned; the tree is modified directly.
        """
        # Reset on every call so the base cleaner's own safe-attribute pass
        # stays off; attribute filtering is done by the loop below instead.
        self.safe_attrs_only = False
        super(AttrWhitelistCleaner, self).__call__(doc)

        root = doc.getroot() if hasattr(doc, 'getroot') else doc
        allowed = self.attr_whitelist
        for element in root.iter():
            attributes = element.attrib
            # Decide what to remove first, then delete, so the mapping is
            # never modified while its names are being examined.
            rejected = [name for name in attributes.keys()
                        if name not in allowed]
            for name in rejected:
                del attributes[name]
# Example: allow everything lxml considers safe by default, plus the
# ``flashvars`` attribute.
example_whitelist = set(defs.safe_attrs)
example_whitelist.add('flashvars')
example_cleaner = AttrWhitelistCleaner(attr_whitelist=example_whitelist)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment