Created
January 30, 2015 11:22
-
-
Save un1t/6fabf38868e5ff4c8ee5 to your computer and use it in GitHub Desktop.
html sanitizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
from html5lib import HTMLParser | |
from html5lib.tokenizer import HTMLTokenizer | |
from html5lib.sanitizer import HTMLSanitizerMixin | |
class WysiwygSanitizerMixin(HTMLSanitizerMixin):
    """Sanitizer configuration limited to basic inline formatting and links.

    Sets the whitelist class attributes on top of ``HTMLSanitizerMixin``:
    six inline elements, the ``href`` attribute, and empty CSS/SVG lists.
    NOTE(review): presumably these are read by the inherited
    ``sanitize_token`` -- confirm against the installed html5lib version.
    """

    # Inline text-formatting tags plus anchors.
    allowed_elements = [
        'b',
        'i',
        'strong',
        'em',
        'strike',
        'a',
    ]

    # The only attribute listed.
    allowed_attributes = [
        'href',
    ]

    # No CSS properties, CSS keywords or SVG properties are listed.
    allowed_css_properties = []
    allowed_css_keywords = []
    allowed_svg_properties = []
class WysiwygSanitizer(HTMLTokenizer, WysiwygSanitizerMixin):
    """HTML tokenizer that runs every token through ``sanitize_token``.

    Combines ``HTMLTokenizer`` with the ``WysiwygSanitizerMixin`` settings so
    it can be handed to ``HTMLParser`` as its ``tokenizer`` class.
    """

    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False,
                 **kwargs):
        """Initialise the underlying ``HTMLTokenizer``.

        The positional parameters are forwarded to ``HTMLTokenizer.__init__``
        in the same order.  Any additional keyword arguments are forwarded
        untouched.

        NOTE(review): some html5lib releases construct the tokenizer with an
        extra ``parser=`` keyword; without ``**kwargs`` that call would raise
        ``TypeError``.  Confirm against the installed html5lib version.
        """
        # Change case matching defaults as we only output lowercase html anyway
        # This solution doesn't seem ideal...
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               **kwargs)

    def __iter__(self):
        """Yield the sanitized form of each token from the base tokenizer.

        Tokens for which ``sanitize_token`` returns a falsy value are dropped.
        """
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token
def wysiwyg_sanitize(html):
    """Return *html* parsed as a fragment with ``WysiwygSanitizer`` as tokenizer.

    The parsed fragment is serialized with ``toxml()`` and the result is
    decoded as UTF-8 before being returned.
    """
    sanitizing_parser = HTMLParser(tokenizer=WysiwygSanitizer)
    fragment = sanitizing_parser.parseFragment(html)
    serialized = fragment.toxml()
    return serialized.decode('utf-8')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment