sanitize_html.py (forked from braveulysses/sanitize_html.py by @sunsongxp, last active August 29, 2015)
import re

# Python 2 / BeautifulSoup 3, as implied by findAll, tag.hidden, and unicode().
from BeautifulSoup import BeautifulSoup, Comment


def sanitize(untrusted_html, additional_tags=None):
    """Strips potentially harmful tags and attributes from HTML, but preserves
    all tags in a whitelist.

    Passing the list additional_tags will add the specified tags to the
    whitelist.

    The sanitizer does NOT encode reserved characters into XML entities. It is
    up to the template code, if any, to take care of that.

    Based on the work of:
    - Tom Insam <http://jerakeen.org/blog/2008/05/sanitizing-comments-with-python/>
    - akaihola <http://www.djangosnippets.org/snippets/169/>
    """
    # Allow these tags. This can be changed to whatever you please, of course,
    # either by changing the list in code or by passing additional_tags.
    tag_whitelist = [
        'a', 'abbr', 'address', 'b', 'cite',
        'code', 'em', 'i', 'ins', 'kbd',
        'q', 'samp', 'small', 'strike', 'strong', 'sub',
        'sup', 'var'
    ]
    if additional_tags is not None:
        tag_whitelist.extend(additional_tags)
    # Allow only these attributes on these tags. No other tags are allowed
    # any attributes. (The 'img' entry only takes effect if 'img' is added
    # to the tag whitelist via additional_tags.)
    attr_whitelist = {
        'a': ['href', 'title', 'hreflang'],
        'img': ['src', 'width', 'height', 'alt', 'title']
    }
    # Remove these tags, complete with contents.
    tag_blacklist = ['script', 'style']
    attributes_with_urls = ['href', 'src']
    soup = BeautifulSoup(untrusted_html)
    # Remove HTML comments.
    for comment in soup.findAll(
            text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Remove unwanted tags.
    for tag in soup.findAll():
        # Remove blacklisted tags and their contents.
        if tag.name.lower() in tag_blacklist:
            tag.extract()
        # Hide non-whitelisted tags: the tag itself is dropped from the
        # output, but its contents are kept.
        elif tag.name.lower() not in tag_whitelist:
            tag.hidden = True
        else:
            # Iterate over a copy of the attribute list; removing items
            # from the list being iterated would skip entries.
            for attr in tag.attrs[:]:
                # Attributes in the attr_whitelist are considered, but on
                # a per-tag basis.
                if (tag.name.lower() in attr_whitelist and
                        attr[0].lower() in attr_whitelist[tag.name.lower()]):
                    # Some attributes contain URLs...
                    if attr[0].lower() in attributes_with_urls:
                        # ... so make sure they're nice URLs.
                        if not re.match(r'(https?|ftp)://', attr[1].lower()):
                            tag.attrs.remove(attr)
                else:
                    # Non-whitelisted attributes are removed entirely.
                    tag.attrs.remove(attr)
    return unicode(soup)
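
A quick usage sketch (the inputs below are hypothetical, not from the original gist; it assumes Python 2 and BeautifulSoup 3, which the findAll/unicode calls above imply):

# Hypothetical example input, for illustration only.
dirty = ('<p>Hi <b>there</b>!'
         '<script>alert("xss")</script>'
         '<a href="javascript:evil()" onclick="evil()">click</a></p>')

print sanitize(dirty)
# <p> is not whitelisted, so the tag is hidden but its text is kept;
# <script> is removed along with its contents; the javascript: href fails
# the URL check and onclick is not whitelisted, so both are stripped.
# Expected output (roughly): Hi <b>there</b>!<a>click</a>

# Extra tags can be whitelisted per call, e.g. to allow images:
print sanitize('<img src="http://example.com/a.png" onerror="evil()">',
               additional_tags=['img'])
# Expected output (roughly): <img src="http://example.com/a.png" />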