Last active
April 6, 2016 13:06
-
-
Save kezabelle/513ab744e043096aea28936ea6c30bbc to your computer and use it in GitHub Desktop.
Because Unicode is fun, there are multiple characters for spaces, dashes, etc. Some of them have HTML equivalents, such as "NO-BREAK SPACE", which I don't want to preserve: the data source may supply such characters, but the output destination doesn't expect them.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import unicode_literals | |
import unicodedata | |
import ftfy | |
import re | |
""" | |
Taken from: | |
https://www.cs.tut.fi/~jkorpela/chars/spaces.html | |
https://en.wikipedia.org/wiki/Whitespace_character#Unicode | |
http://www.unicode.org/reports/tr44/#GC_Values_Table | |
http://www.fileformat.info/info/unicode/category/Zs/list.htm | |
http://www.fileformat.info/info/unicode/category/Zl/list.htm | |
http://www.fileformat.info/info/unicode/category/Zp/list.htm | |
http://www.fileformat.info/info/unicode/category/Cf/list.htm | |
http://www.fileformat.info/info/unicode/category/So/list.htm | |
http://www.fileformat.info/info/unicode/category/Pd/list.htm | |
http://www.charbase.com/ | |
""" | |
# Characters that ftfy_further() removes outright (replaced with "").
#
# NARROW NO-BREAK SPACE is intentionally NOT listed here: it is a space
# character (Unicode category Zs) and is already a member of ``spaces``.
# Because the collapse pass runs before the space pass, listing it here as
# well deleted it instead of turning it into " ", gluing words together.
collapses = (
    unicodedata.lookup('MONGOLIAN VOWEL SEPARATOR'),
    unicodedata.lookup('ZERO WIDTH SPACE'),
    unicodedata.lookup('ZERO WIDTH NO-BREAK SPACE'),
    unicodedata.lookup('ZERO WIDTH NON-JOINER'),
    unicodedata.lookup('ZERO WIDTH JOINER'),
    unicodedata.lookup('WORD JOINER'),
)
# All collapse characters concatenated into one string (used by the demo).
collapses_chars = "".join(collapses)
# Regex alternation source: one branch per character.
collapses_piped = "|".join(collapses)
# Compiled matcher for any single collapse character.
collapses_re = re.compile(collapses_piped, re.UNICODE)
# Space-like characters that ftfy_further() rewrites to a plain ASCII space.
# Each entry is resolved from its official Unicode character name.
spaces = tuple(
    unicodedata.lookup(char_name)
    for char_name in (
        'NO-BREAK SPACE',
        'NARROW NO-BREAK SPACE',
        'EN QUAD',
        'EM QUAD',
        'EN SPACE',
        'EM SPACE',
        'THREE-PER-EM SPACE',
        'FOUR-PER-EM SPACE',
        'SIX-PER-EM SPACE',
        'FIGURE SPACE',
        'PUNCTUATION SPACE',
        'THIN SPACE',
        'HAIR SPACE',
        'MEDIUM MATHEMATICAL SPACE',
        'IDEOGRAPHIC SPACE',
        'IDEOGRAPHIC HALF FILL SPACE',
        'SYMBOL FOR SPACE',
        'TAG SPACE',
    )
)
# Every space variant joined into a single string (used by the demo text).
spaces_chars = ''.join(spaces)
# The same characters separated by "|" so they form a regex alternation.
spaces_piped = '|'.join(spaces)
# Compiled pattern matching any one of the space variants.
spaces_re = re.compile(spaces_piped, flags=re.UNICODE)
# Dash-like characters that ftfy_further() rewrites to an ASCII "-".
# Each entry is resolved from its official Unicode character name.
dashes = tuple(
    unicodedata.lookup(char_name)
    for char_name in (
        'OGHAM SPACE MARK',
        'HYPHEN',
        'SOFT HYPHEN',
        'HYPHEN-MINUS',
        'NON-BREAKING HYPHEN',
        'FIGURE DASH',
        'EN DASH',
        'EM DASH',
        'HORIZONTAL BAR',
        'MINUS SIGN',
        'SMALL EM DASH',
        'SMALL HYPHEN-MINUS',
        'FULLWIDTH HYPHEN-MINUS',
        'TAG HYPHEN-MINUS',
    )
)
# Every dash variant joined into a single string (used by the demo text).
dashes_chars = ''.join(dashes)
# The same characters separated by "|" so they form a regex alternation.
dashes_piped = '|'.join(dashes)
# Compiled pattern matching any one of the dash variants.
dashes_re = re.compile(dashes_piped, flags=re.UNICODE)
# Unicode line/paragraph separators that ftfy_further() rewrites to "\r\n".
seps = tuple(map(unicodedata.lookup, ('LINE SEPARATOR', 'PARAGRAPH SEPARATOR')))
# Both separators joined into a single string (used by the demo text).
seps_chars = ''.join(seps)
# The same characters separated by "|" so they form a regex alternation.
seps_piped = '|'.join(seps)
# Compiled pattern matching either separator.
seps_re = re.compile(seps_piped, flags=re.UNICODE)
def ftfy_further(value):
    """Return *value* with exotic Unicode spacing/dash characters normalised.

    Four regex substitutions are applied in a fixed order:

    1. characters matched by ``collapses_re`` are removed,
    2. characters matched by ``spaces_re`` become a single ASCII space,
    3. characters matched by ``dashes_re`` become an ASCII ``-``,
    4. characters matched by ``seps_re`` become ``"\\r\\n"``.

    The order matters: a character matched by an earlier pattern is gone
    (or rewritten) before any later pattern sees it.

    :param value: the text to clean up.
    :returns: the cleaned text.
    """
    substitutions = (
        (collapses_re, ""),
        (spaces_re, " "),
        (dashes_re, "-"),
        (seps_re, "\r\n"),
    )
    # Unpack each pair directly; no running index is needed.
    for regex, replacement in substitutions:
        value = regex.sub(replacement, value)
    return value
# Demo: build a sample containing every character from the tables above,
# then print it three ways: raw, after ftfy.fix_text(), and after
# ftfy_further().
bad_text = """
this is some {collapses} collapsed characters.
this is some {spaces} fun space characters.
this is some {dashes} dashes characters.
this is some {seps} break characters.
""".format(
    seps=seps_chars,
    dashes=dashes_chars,
    spaces=spaces_chars,
    collapses=collapses_chars,
)

ftfy_text = ftfy.fix_text(bad_text)
# NOTE: ftfy_further() is run on the raw sample, not on ftfy's output, so the
# last two printouts show each cleaner's effect independently.
result = ftfy_further(bad_text)

for rendition in (bad_text, ftfy_text, result):
    print(rendition)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment