String normalization in Python: HTML stripping and HTML entity resolution.
#!/usr/bin/python
# vim:ts=4:sts=4:sw=4:et:wrap:ai:fileencoding=utf-8:
"""A collection of string normalization routines.

You are probably looking for normalize_string, which performs an aggressive
(but arguably sound) string normalization process.
"""


from HTMLParser import HTMLParser
import re
import unicodedata

# Bounds of the plain lowercase ASCII alphanumeric ranges checked by
# isPlainASCIIAlphaNum below.
ORD_A = ord(u'a')
ORD_Z = ord(u'z')
ORD_0 = ord(u'0')
ORD_9 = ord(u'9')


def try_redecode_utf8(s):
    r"""Try redecoding utf-8 data inside a (faux-)unicode string.

    >>> try_redecode_utf8(u'T\xc3\xaanis e Esporte')
    u'T\xeanis e Esporte'
    """
    keep_going = True
    redecoded = s
    # Keep redecoding until redecoding fails or there is no difference in
    # the output.
    while keep_going:
        try:
            if isinstance(s, unicode):
                redecoded = s.encode('latin1').decode('utf-8')
            elif isinstance(s, str):
                redecoded = s.decode('utf-8')
            keep_going = (s != redecoded)
            s = redecoded
        except UnicodeError:
            keep_going = False
    return redecoded
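

# A minimal sketch of the round trip try_redecode_utf8 exploits: text that
# was UTF-8-encoded but wrongly decoded as latin-1 can be repaired by
# reversing the wrong step (illustrative session, expected under Python 2.7):
#
#   >>> u'T\xeanis'.encode('utf-8').decode('latin1')  # produce the mojibake
#   u'T\xc3\xaanis'
#   >>> u'T\xc3\xaanis'.encode('latin1').decode('utf-8')  # and repair it
#   u'T\xeanis'

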
class HTMLStripper(HTMLParser):
    """Removes tags and keeps HTML entities intact."""

    def __init__(self):
        # HTMLParser is an old-style class whose __init__ only calls
        # reset(), so calling reset() directly here is equivalent.
        self.reset()
        self.fed = []

    def handle_starttag(self, tag, attrs):
        # We decided that any tag acts as a word separator and is thus
        # converted to a space.
        self.fed.append(' ')

    def handle_data(self, d):
        self.fed.append(d)

    def handle_charref(self, number):
        self.fed.append('&#%s;' % number)

    def handle_entityref(self, name):
        self.fed.append('&%s;' % name)

    def get_data(self):
        return u''.join(self.fed)
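

# A quick sketch of HTMLStripper on its own (expected under Python 2.7):
# tags become spaces, while entity references are kept verbatim at this
# stage and only resolved later by unescape() inside
# strip_html_and_convert_entities.
#
#   >>> stripper = HTMLStripper()
#   >>> stripper.feed(u'Cine<br>&amp;<br>Foto')
#   >>> stripper.get_data()
#   u'Cine &amp; Foto'

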
def isPlainASCIIAlphaNum(c):
    """True only for plain lowercase ASCII letters and digits.

    Input is expected to be lowercased already (see normalize_case).
    """
    o = ord(c)
    return (ORD_A <= o <= ORD_Z) or (ORD_0 <= o <= ORD_9)


def strip_html_and_convert_entities(html):
    # Previously we used the following code, which depends on BeautifulSoup:
    #
    #   soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #   return u' '.join(soup.findAll(text=True))
    #
    # But it does not handle numeric character entities correctly. Our new
    # approach does not depend on BeautifulSoup and uses HTMLParser, which is
    # part of Python 2.6's standard library. So it is a double win. :-)
    #
    # References:
    # http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    # http://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
    parser = HTMLStripper()
    parser.feed(html)
    # HTMLParser breaks if parsing ends (EOF) on a broken entity, such as the
    # one in 'at&t'. Feeding an extra space fixes this.
    parser.feed(' ')
    parser.close()
    return parser.unescape(parser.get_data())
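

# Illustrative doctest-style session (expected output under Python 2.7; note
# the trailing space introduced by the extra feed above, which
# normalize_whitespace removes later in the pipeline):
#
#   >>> strip_html_and_convert_entities(u'Tom &amp; Jerry')
#   u'Tom & Jerry '
#   >>> strip_html_and_convert_entities(u'Caf&#233;')  # numeric entity
#   u'Caf\xe9 '

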
def normalize_case(s):
    return s.lower()


def normalize_diacritics(input_str):
    # References:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    # http://stackoverflow.com/questions/9042515/normalizing-unicode-text-to-filenames-etc-in-python
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def normalize_to_plain_ascii(s):
    only_ascii = s.encode('ASCII', 'replace')  # unencodable chars become '?'
    return unicode(only_ascii)


def normalize_to_alphanum_and_spaces(s):
    # Anything that is not a plain lowercase ASCII letter or digit becomes a
    # space (uppercase input should have been lowercased already).
    return u"".join(i if isPlainASCIIAlphaNum(i) else ' ' for i in s)
# def normalize_diacritics_old(s):
#     """Converts to lowercase, normalizes diacritics and
#     converts non-alphanumeric chars into space.
#     """
#     s = s.lower()
#     s = unicodedata.normalize('NFKD', s)
#     # Filter ASCII letters and numbers, discard everything else
#     filtered = []
#     for c in s:
#         if isPlainASCIIAlphaNum(c):
#             filtered.append(c)
#         elif unicodedata.category(c) == 'Mn':
#             continue
#         else:
#             filtered.append(u' ')
#     return u' '.join(''.join(filtered).split())


def normalize_prepositions(s):
    """Replaces common prepositions and conjunctions (English 'and',
    Portuguese 'e', 'de', 'do', 'da') with a space.
    """
    prepositions = ['e', 'and', 'de', 'do', 'da']
    for prep in prepositions:
        pattern = r'\b' + prep + r'\b'
        s = re.sub(pattern, " ", s)
    return s
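

# Example (expected under Python 2.7): the replacement leaves runs of spaces
# behind, which normalize_whitespace squeezes afterwards.
#
#   >>> normalize_prepositions(u'cine e foto')
#   u'cine   foto'

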
def normalize_whitespace(s):
    s = re.sub(r'\s+', ' ', s)
    return s.strip()


def normalize_string(s, fix_utf8=False):
    if fix_utf8:
        s = try_redecode_utf8(s)
    s = strip_html_and_convert_entities(s)
    s = normalize_case(s)
    s = normalize_diacritics(s)
    s = normalize_to_plain_ascii(s)
    s = normalize_to_alphanum_and_spaces(s)
    #s = normalize_prepositions(s)
    s = normalize_whitespace(s)
    # We don't need to re-normalize to a known Unicode form (say, NFC) since
    # we only have plain ASCII, alphanumeric content at this point. There is
    # no "composed" nor "decomposed" Unicode content left in `s`.
    return s
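

# A worked example of the full pipeline (expected under Python 2.7; mirrors
# the samples exercised by main() below):
#
#   >>> normalize_string(u'<a href="#">Cine <em>(\xe9 f\xf3to)</em></a>')
#   u'cine e foto'

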
def main():
    sample = [u"Cine e foto",
              u"Cine & foto",
              u"Cine&Foto",  # BeautifulSoup breaks for this one.
              u"Cine+foto",
              u"Cíñe_e.foto",
              u"<a>Çine e<br>Foto",
              u'Cine\u65e5\u672c\u8a9eFoto',
              u'Carrinhos e Veículos',
              u'<a href="#">Cine <em>(é fóto¬ \u0394ημώ)</em></a>',
              u'Soul e R&B',  # we used to break on this one.
              u'T\xc3\xaanis e Esporte',
              ]
    from collections import defaultdict
    categories = defaultdict(list)
    for i in sample:
        n = normalize_string(i, fix_utf8=True)
        categories[n].append(i)
    for k, v in categories.items():
        print k, v
    return categories


if __name__ == '__main__':
    main()