Created
December 9, 2013 11:53
-
-
Save yszou/7871161 to your computer and use it in GitHub Desktop.
邮件内容过滤
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
# Case-insensitive matcher for e-mail addresses: a local part, '@', and a
# domain that ends in a dot followed by letters. Written as a raw string so
# the regex escapes reach the ``re`` module untouched instead of relying on
# Python passing unknown string escapes (``\-``, ``\.``) through unchanged.
EMAIL_PATTERN = re.compile(r'[a-z0-9_\-+.]+@[a-z0-9_\-.]+\.[a-z]+', re.I)
from lxml.html.clean import Cleaner | |
from lxml.html import defs | |
# Attribute whitelist that EMLCleaner uses as its ``safe_attrs``.
# Built directly as a plain (mutable) ``set`` -- the same type the previous
# ``frozenset = set`` alias produced -- without rebinding the builtin name
# ``frozenset`` for the rest of this module.
# NOTE: this assignment replaces ``lxml.html.defs.safe_attrs`` for every
# module in the process that reads it after this point.
defs.safe_attrs = set([
    'align', 'border', 'cellpadding', 'cellspacing',
    'cols', 'colspan',
    'color',
    'height',
    'rows', 'rowspan',
    'size', 'href',
    'valign', 'vspace', 'width',
    'style', 'name',
])
class EMLCleaner(Cleaner):
    """Cleaner configured for e-mail HTML that also masks e-mail addresses.

    The class overrides option attributes of the ``Cleaner`` base class and
    post-processes the cleaned markup in ``clean_html``. See the lxml
    ``Cleaner`` documentation for the exact meaning of each option.
    """

    # --- on/off switches understood by the base Cleaner ---
    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True

    # --- tag handling ---
    # Additional tags to strip; their content is moved up to the parent node.
    remove_tags = set(['body'])
    # Tags to drop together with everything inside them.
    kill_tags = set(['title', 'style'])
    allow_tags = ()
    remove_unknown_tags = True

    # --- attribute handling ---
    safe_attrs_only = True
    # Uses the whitelist stored on ``lxml.html.defs`` by this module.
    safe_attrs = defs.safe_attrs

    # --- link / embed handling ---
    add_nofollow = True
    host_whitelist = ()
    whitelist_tags = set([''])

    def clean_html(self, text):
        """Clean *text* with the base cleaner, then mask e-mail addresses.

        Every match of ``EMAIL_PATTERN`` in the cleaned result is replaced
        by a single ``'-'``.

        NOTE(review): the substitution needs the base class to return a
        string; presumably callers always pass markup as text -- confirm.
        """
        cleaned = super(EMLCleaner, self).clean_html(text)
        masked = EMAIL_PATTERN.sub('-', cleaned)
        return masked
if __name__ == '__main__':
    # Manual smoke test: clean a sample HTML file and print the result.
    clean = EMLCleaner()
    # Read raw bytes and decode them explicitly as UTF-8 so the script behaves
    # the same on Python 2 and Python 3; the ``with`` block guarantees the
    # file handle is closed even if reading or decoding fails.
    with open('clean_test/only_body.html', 'rb') as sample:
        data = sample.read().decode('utf8')
    # Call form of print with a single argument is valid on Python 2 and 3.
    print(clean.clean_html(data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment