Skip to content

Instantly share code, notes, and snippets.

@kezabelle
Last active April 6, 2016 13:06
Show Gist options
  • Save kezabelle/513ab744e043096aea28936ea6c30bbc to your computer and use it in GitHub Desktop.
Save kezabelle/513ab744e043096aea28936ea6c30bbc to your computer and use it in GitHub Desktop.
Because Unicode is fun, there are multiple characters for spaces, dashes, etc., some of which have HTML equivalents, such as "NO-BREAK SPACE". I don't want to preserve those variants, because the data source may supply them but the output destination doesn't expect them.
from __future__ import unicode_literals
import unicodedata
import ftfy
import re
"""
Taken from:
https://www.cs.tut.fi/~jkorpela/chars/spaces.html
https://en.wikipedia.org/wiki/Whitespace_character#Unicode
http://www.unicode.org/reports/tr44/#GC_Values_Table
http://www.fileformat.info/info/unicode/category/Zs/list.htm
http://www.fileformat.info/info/unicode/category/Zl/list.htm
http://www.fileformat.info/info/unicode/category/Zp/list.htm
http://www.fileformat.info/info/unicode/category/Cf/list.htm
http://www.fileformat.info/info/unicode/category/So/list.htm
http://www.fileformat.info/info/unicode/category/Pd/list.htm
http://www.charbase.com/
"""
# Invisible / zero-width formatting characters. ``ftfy_further`` replaces
# every match of ``collapses_re`` with the empty string, i.e. deletes them.
#
# NARROW NO-BREAK SPACE is deliberately NOT listed here: it is a visible
# (if narrow) space, and deleting it would glue neighbouring words together.
# It is listed in the ``spaces`` group instead, so it becomes a plain space.
# Because ``ftfy_further`` applies this pattern before the spaces pattern,
# listing it in both groups meant the ``spaces`` entry could never match.
collapses = (
    unicodedata.lookup('MONGOLIAN VOWEL SEPARATOR'),
    unicodedata.lookup('ZERO WIDTH SPACE'),
    unicodedata.lookup('ZERO WIDTH NO-BREAK SPACE'),
    unicodedata.lookup('ZERO WIDTH NON-JOINER'),
    unicodedata.lookup('ZERO WIDTH JOINER'),
    unicodedata.lookup('WORD JOINER'),
)
# All collapsible characters concatenated (used to build demo text).
collapses_chars = "".join(collapses)
# The same characters as a regex alternation: "a|b|c".
collapses_piped = "|".join(collapses)
# Compiled matcher for any single collapsible character.
collapses_re = re.compile(collapses_piped, re.UNICODE)
# Space-like characters. ``ftfy_further`` replaces every match of
# ``spaces_re`` with a single ASCII space.
spaces = tuple(
    unicodedata.lookup(char_name)
    for char_name in (
        'NO-BREAK SPACE',
        'NARROW NO-BREAK SPACE',
        'EN QUAD',
        'EM QUAD',
        'EN SPACE',
        'EM SPACE',
        'THREE-PER-EM SPACE',
        'FOUR-PER-EM SPACE',
        'SIX-PER-EM SPACE',
        'FIGURE SPACE',
        'PUNCTUATION SPACE',
        'THIN SPACE',
        'HAIR SPACE',
        'MEDIUM MATHEMATICAL SPACE',
        'IDEOGRAPHIC SPACE',
        'IDEOGRAPHIC HALF FILL SPACE',
        'SYMBOL FOR SPACE',
        'TAG SPACE',
    )
)
# Every space variant back to back (used to build demo text).
spaces_chars = "".join(spaces)
# The same characters joined into a regex alternation.
spaces_piped = "|".join(spaces)
# Compiled matcher for any single space variant.
spaces_re = re.compile(spaces_piped, re.UNICODE)
# Dash-like characters. ``ftfy_further`` replaces every match of
# ``dashes_re`` with an ASCII "-".
#
# NOTE(review): OGHAM SPACE MARK is grouped with the dashes rather than the
# spaces -- presumably because it is usually drawn as a dash-like stroke;
# confirm that is the intent.
# NOTE(review): SOFT HYPHEN is normally invisible, so turning it into a
# visible "-" may not be wanted -- confirm.
dashes = tuple(
    unicodedata.lookup(char_name)
    for char_name in (
        'OGHAM SPACE MARK',
        'HYPHEN',
        'SOFT HYPHEN',
        'HYPHEN-MINUS',
        'NON-BREAKING HYPHEN',
        'FIGURE DASH',
        'EN DASH',
        'EM DASH',
        'HORIZONTAL BAR',
        'MINUS SIGN',
        'SMALL EM DASH',
        'SMALL HYPHEN-MINUS',
        'FULLWIDTH HYPHEN-MINUS',
        'TAG HYPHEN-MINUS',
    )
)
# Every dash variant back to back (used to build demo text).
dashes_chars = "".join(dashes)
# The same characters joined into a regex alternation.
dashes_piped = "|".join(dashes)
# Compiled matcher for any single dash variant.
dashes_re = re.compile(dashes_piped, re.UNICODE)
# Unicode line/paragraph separators. ``ftfy_further`` replaces every match
# of ``seps_re`` with a CR+LF pair.
seps = tuple(
    unicodedata.lookup(char_name)
    for char_name in ('LINE SEPARATOR', 'PARAGRAPH SEPARATOR')
)
# Both separators back to back (used to build demo text).
seps_chars = "".join(seps)
# The same characters joined into a regex alternation.
seps_piped = "|".join(seps)
# Compiled matcher for either separator.
seps_re = re.compile(seps_piped, re.UNICODE)
def ftfy_further(value):
    """
    Replace exotic Unicode space, dash and separator characters in ``value``.

    Four substitutions are applied one after another, in this order:

    1. matches of ``collapses_re`` are removed;
    2. matches of ``spaces_re`` become a single ASCII space;
    3. matches of ``dashes_re`` become an ASCII "-";
    4. matches of ``seps_re`` become a CR+LF pair.

    The order matters: a character that appears in more than one group is
    consumed by the first pattern that matches it.

    :param value: the text string to clean up.
    :return: the text with all four substitutions applied.
    """
    workers = (
        (collapses_re, ""),
        (spaces_re, " "),
        (dashes_re, "-"),
        (seps_re, "\r\n"),
    )
    # Unpack each pair directly; no loop counter is needed.
    for regex, replacement in workers:
        value = regex.sub(replacement, value)
    return value
# Demo: build a sample text containing every character from the four groups,
# then print the raw text, ftfy's output and ftfy_further's output in turn.
bad_text = """
this is some {collapses} collapsed characters.
this is some {spaces} fun space characters.
this is some {dashes} dashes characters.
this is some {seps} break characters.
""".format(
    seps=seps_chars,
    dashes=dashes_chars,
    spaces=spaces_chars,
    collapses=collapses_chars,
)
# NOTE(review): ftfy_further is fed the raw sample, not ftfy's output, so the
# two results are independent of each other -- confirm that is intended.
result = ftfy_further(bad_text)
ftfy_text = ftfy.fix_text(bad_text)
print(bad_text)
print(ftfy_text)
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment