Created
May 29, 2009 20:25
-
-
Save braveulysses/120193 to your computer and use it in GitHub Desktop.
HTML sanitization using Python and BeautifulSoup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sanitize(untrusted_html, additional_tags=None):
    """Strips potentially harmful tags and attributes from HTML, but preserves
    all tags in a whitelist.

    Passing the list additional_tags will add the specified tags to the whitelist.

    The sanitizer does NOT encode reserved characters into XML entities. It is up
    to the template code, if any, to take care of that.

    Based on the work of:
    - Tom Insam <http://jerakeen.org/blog/2008/05/sanitizing-comments-with-python/>
    - akaihola <http://www.djangosnippets.org/snippets/169/>
    """
    # Allow these tags. This can be changed to whatever you please, of course,
    # either by changing the list in code or by passing additional_tags.
    # (Fixed: 'code' was listed twice, and the old comment referred to a
    # nonexistent 'alt_whitelist' parameter.)
    tag_whitelist = [
        'a', 'abbr', 'address', 'b', 'cite', 'code',
        'em', 'i', 'ins', 'kbd', 'q', 'samp', 'small',
        'strike', 'strong', 'sub', 'sup', 'var'
    ]
    if additional_tags is not None:
        tag_whitelist.extend(additional_tags)
    # Allow only these attributes on these tags. No other tags are allowed
    # any attributes.
    attr_whitelist = {
        'a': ['href', 'title', 'hreflang'],
        'img': ['src', 'width', 'height', 'alt', 'title']
    }
    # Remove these tags, complete with contents.
    tag_blacklist = ['script', 'style']
    # Attributes whose values are URLs, and therefore need scheme checking.
    attributes_with_urls = ['href', 'src']
    soup = BeautifulSoup(untrusted_html)
    # Remove HTML comments (they can carry conditional/hidden payloads).
    for comment in soup.findAll(
        text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Remove unwanted tags
    for tag in soup.findAll():
        # Remove blacklisted tags and their contents.
        if tag.name.lower() in tag_blacklist:
            tag.extract()
        # Hide non-whitelisted tags: the markup is dropped, the children kept.
        elif tag.name.lower() not in tag_whitelist:
            tag.hidden = True
        else:
            # BUGFIX: iterate over a *copy* of tag.attrs. The original code
            # removed entries from the same list it was iterating, which
            # makes the iterator skip the attribute following each removal —
            # letting non-whitelisted attributes survive sanitization.
            for attr in tag.attrs[:]:
                # Attributes in the attr_whitelist are considered, but on
                # a per-tag basis.
                if (tag.name.lower() in attr_whitelist
                        and attr[0].lower() in attr_whitelist[tag.name.lower()]):
                    # Some attributes contain urls..
                    if attr[0].lower() in attributes_with_urls:
                        # .. so make sure they're nice urls (rejects e.g.
                        # javascript: and data: schemes).
                        if not re.match(r'(https?|ftp)://', attr[1].lower()):
                            tag.attrs.remove(attr)
                else:
                    # Non-whitelisted attributes are removed entirely.
                    tag.attrs.remove(attr)
    return unicode(soup)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for sharing!