kezabelle · April 6, 2016 13:06
diff --git a/fix_unicode_fun_characters.py b/fix_unicode_fun_characters.py
 from __future__ import unicode_literals

 import unicodedata
 import ftfy
 import re

 """
 Taken from:
 https://www.cs.tut.fi/~jkorpela/chars/spaces.html
 https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 http://www.unicode.org/reports/tr44/#GC_Values_Table
 http://www.fileformat.info/info/unicode/category/Zs/list.htm
 http://www.fileformat.info/info/unicode/category/Zl/list.htm
 http://www.fileformat.info/info/unicode/category/Zp/list.htm
 http://www.fileformat.info/info/unicode/category/Cf/list.htm
 http://www.fileformat.info/info/unicode/category/So/list.htm
 http://www.fileformat.info/info/unicode/category/Pd/list.htm
 http://www.charbase.com/
 """

 def _lookup_all(*names):
     """
     Return a tuple holding the character for each official Unicode name.

     The characters come back in the same order as ``names``.  A name that
     ``unicodedata.lookup`` does not know raises ``KeyError``.
     """
     return tuple(unicodedata.lookup(name) for name in names)


 def _compile_any_of(chars):
     """Compile a pattern that matches any one entry of ``chars`` literally."""
     # Escaping keeps the pattern correct even if a regex metacharacter
     # (".", "+", "|", ...) is ever added to one of the tuples below.
     return re.compile("|".join(re.escape(char) for char in chars), re.UNICODE)


 # Characters that ftfy_further() deletes outright.
 # NOTE: NARROW NO-BREAK SPACE is also listed in `spaces`; ftfy_further() runs
 # the collapses pass first, so that character is deleted rather than being
 # turned into a space.
 collapses = _lookup_all(
     'MONGOLIAN VOWEL SEPARATOR',
     'ZERO WIDTH SPACE',
     'ZERO WIDTH NO-BREAK SPACE',
     'NARROW NO-BREAK SPACE',
     'ZERO WIDTH NON-JOINER',
     'ZERO WIDTH JOINER',
     'WORD JOINER',
 )
 collapses_chars = "".join(collapses)
 # Raw (unescaped) alternation text, kept under its original name and value.
 collapses_piped = "|".join(collapses)
 collapses_re = _compile_any_of(collapses)

 # Characters that ftfy_further() replaces with a plain ASCII space.
 spaces = _lookup_all(
     'NO-BREAK SPACE',
     'NARROW NO-BREAK SPACE',
     'EN QUAD',
     'EM QUAD',
     'EN SPACE',
     'EM SPACE',
     'THREE-PER-EM SPACE',
     'FOUR-PER-EM SPACE',
     'SIX-PER-EM SPACE',
     'FIGURE SPACE',
     'PUNCTUATION SPACE',
     'THIN SPACE',
     'HAIR SPACE',
     'MEDIUM MATHEMATICAL SPACE',
     'IDEOGRAPHIC SPACE',
     'IDEOGRAPHIC HALF FILL SPACE',
     'SYMBOL FOR SPACE',
     'TAG SPACE',
 )
 spaces_chars = "".join(spaces)
 spaces_piped = "|".join(spaces)
 spaces_re = _compile_any_of(spaces)

 # Characters that ftfy_further() replaces with an ASCII "-".
 # OGHAM SPACE MARK is grouped here rather than under `spaces` -- presumably
 # because it is usually drawn as a dash; confirm before moving it.
 dashes = _lookup_all(
     'OGHAM SPACE MARK',
     'HYPHEN',
     'SOFT HYPHEN',
     'HYPHEN-MINUS',
     'NON-BREAKING HYPHEN',
     'FIGURE DASH',
     'EN DASH',
     'EM DASH',
     'HORIZONTAL BAR',
     'MINUS SIGN',
     'SMALL EM DASH',
     'SMALL HYPHEN-MINUS',
     'FULLWIDTH HYPHEN-MINUS',
     'TAG HYPHEN-MINUS',
 )
 dashes_chars = "".join(dashes)
 dashes_piped = "|".join(dashes)
 dashes_re = _compile_any_of(dashes)

 # Characters that ftfy_further() replaces with a CR LF pair.
 seps = _lookup_all(
     'LINE SEPARATOR',
     'PARAGRAPH SEPARATOR',
 )
 seps_chars = "".join(seps)
 seps_piped = "|".join(seps)
 seps_re = _compile_any_of(seps)


 def ftfy_further(value):
     """
     Apply this module's four character substitutions to ``value``.

     The passes run in a fixed order: matches of ``collapses_re`` are
     removed, matches of ``spaces_re`` become one ASCII space, matches of
     ``dashes_re`` become an ASCII hyphen-minus, and matches of ``seps_re``
     become a CR LF pair.

     Returns the text after all four passes.
     """
     # Order matters: a character listed in more than one group (NARROW
     # NO-BREAK SPACE is in both `collapses` and `spaces`) is consumed by the
     # earliest pass that matches it.
     substitutions = (
         (collapses_re, ""),
         (spaces_re, " "),
         (dashes_re, "-"),
         (seps_re, "\r\n"),
     )
     for regex, replacement in substitutions:
         value = regex.sub(replacement, value)
     return value


 # Demo: build a sample containing every character collected above, then
 # print it raw, after ftfy.fix_text(), and after ftfy_further().
 bad_text = """
 this is some {collapses} collapsed characters.
 this is some {spaces} fun space characters.
 this is some {dashes} dashes characters.
 this is some {seps} break characters.
 """.format(
     seps=seps_chars,
     dashes=dashes_chars,
     spaces=spaces_chars,
     collapses=collapses_chars,
 )
 # Both fixers receive the same untouched sample so their outputs can be
 # compared side by side.
 ftfy_text = ftfy.fix_text(bad_text)
 result = ftfy_further(bad_text)
 for _shown in (bad_text, ftfy_text, result):
     print(_shown)
	from __future__ import unicode_literals

	import unicodedata
	import ftfy
	import re

	"""
	Taken from:
	https://www.cs.tut.fi/~jkorpela/chars/spaces.html
	https://en.wikipedia.org/wiki/Whitespace_character#Unicode
	http://www.unicode.org/reports/tr44/#GC_Values_Table
	http://www.fileformat.info/info/unicode/category/Zs/list.htm
	http://www.fileformat.info/info/unicode/category/Zl/list.htm
	http://www.fileformat.info/info/unicode/category/Zp/list.htm
	http://www.fileformat.info/info/unicode/category/Cf/list.htm
	http://www.fileformat.info/info/unicode/category/So/list.htm
	http://www.fileformat.info/info/unicode/category/Pd/list.htm
	http://www.charbase.com/
	"""

	def _lookup_all(*names):
	    """
	    Return a tuple holding the character for each official Unicode name.

	    The characters come back in the same order as ``names``.  A name that
	    ``unicodedata.lookup`` does not know raises ``KeyError``.
	    """
	    return tuple(unicodedata.lookup(name) for name in names)


	def _compile_any_of(chars):
	    """Compile a pattern that matches any one entry of ``chars`` literally."""
	    # The entries must be joined with a bare "|" (alternation).  Joining
	    # with an escaped pipe would produce a pattern that only matches the
	    # literal text "c1|c2|..." and therefore never a single character.
	    # Each entry is escaped so a regex metacharacter added to a tuple later
	    # cannot change the pattern's meaning.
	    return re.compile("|".join(re.escape(char) for char in chars), re.UNICODE)


	# Characters that ftfy_further() deletes outright.
	# NOTE: NARROW NO-BREAK SPACE is also listed in `spaces`; ftfy_further() runs
	# the collapses pass first, so that character is deleted rather than being
	# turned into a space.
	collapses = _lookup_all(
	    'MONGOLIAN VOWEL SEPARATOR',
	    'ZERO WIDTH SPACE',
	    'ZERO WIDTH NO-BREAK SPACE',
	    'NARROW NO-BREAK SPACE',
	    'ZERO WIDTH NON-JOINER',
	    'ZERO WIDTH JOINER',
	    'WORD JOINER',
	)
	collapses_chars = "".join(collapses)
	# Alternation text: the characters separated by a plain "|".
	collapses_piped = "|".join(collapses)
	collapses_re = _compile_any_of(collapses)

	# Characters that ftfy_further() replaces with a plain ASCII space.
	spaces = _lookup_all(
	    'NO-BREAK SPACE',
	    'NARROW NO-BREAK SPACE',
	    'EN QUAD',
	    'EM QUAD',
	    'EN SPACE',
	    'EM SPACE',
	    'THREE-PER-EM SPACE',
	    'FOUR-PER-EM SPACE',
	    'SIX-PER-EM SPACE',
	    'FIGURE SPACE',
	    'PUNCTUATION SPACE',
	    'THIN SPACE',
	    'HAIR SPACE',
	    'MEDIUM MATHEMATICAL SPACE',
	    'IDEOGRAPHIC SPACE',
	    'IDEOGRAPHIC HALF FILL SPACE',
	    'SYMBOL FOR SPACE',
	    'TAG SPACE',
	)
	spaces_chars = "".join(spaces)
	spaces_piped = "|".join(spaces)
	spaces_re = _compile_any_of(spaces)

	# Characters that ftfy_further() replaces with an ASCII "-".
	# OGHAM SPACE MARK is grouped here rather than under `spaces` -- presumably
	# because it is usually drawn as a dash; confirm before moving it.
	dashes = _lookup_all(
	    'OGHAM SPACE MARK',
	    'HYPHEN',
	    'SOFT HYPHEN',
	    'HYPHEN-MINUS',
	    'NON-BREAKING HYPHEN',
	    'FIGURE DASH',
	    'EN DASH',
	    'EM DASH',
	    'HORIZONTAL BAR',
	    'MINUS SIGN',
	    'SMALL EM DASH',
	    'SMALL HYPHEN-MINUS',
	    'FULLWIDTH HYPHEN-MINUS',
	    'TAG HYPHEN-MINUS',
	)
	dashes_chars = "".join(dashes)
	dashes_piped = "|".join(dashes)
	dashes_re = _compile_any_of(dashes)

	# Characters that ftfy_further() replaces with a CR LF pair.
	seps = _lookup_all(
	    'LINE SEPARATOR',
	    'PARAGRAPH SEPARATOR',
	)
	seps_chars = "".join(seps)
	seps_piped = "|".join(seps)
	seps_re = _compile_any_of(seps)


	def ftfy_further(value):
	    """
	    Apply this module's four character substitutions to ``value``.

	    The passes run in a fixed order: matches of ``collapses_re`` are
	    removed, matches of ``spaces_re`` become one ASCII space, matches of
	    ``dashes_re`` become an ASCII hyphen-minus, and matches of ``seps_re``
	    become a CR LF pair.

	    Returns the text after all four passes.
	    """
	    # Order matters: a character listed in more than one group (NARROW
	    # NO-BREAK SPACE is in both `collapses` and `spaces`) is consumed by the
	    # earliest pass that matches it.
	    substitutions = (
	        (collapses_re, ""),
	        (spaces_re, " "),
	        (dashes_re, "-"),
	        (seps_re, "\r\n"),
	    )
	    for regex, replacement in substitutions:
	        value = regex.sub(replacement, value)
	    return value


	# Demo: build a sample containing every character collected above, then
	# print it raw, after ftfy.fix_text(), and after ftfy_further().
	bad_text = """
	this is some {collapses} collapsed characters.
	this is some {spaces} fun space characters.
	this is some {dashes} dashes characters.
	this is some {seps} break characters.
	""".format(
	    seps=seps_chars,
	    dashes=dashes_chars,
	    spaces=spaces_chars,
	    collapses=collapses_chars,
	)
	# Both fixers receive the same untouched sample so their outputs can be
	# compared side by side.
	ftfy_text = ftfy.fix_text(bad_text)
	result = ftfy_further(bad_text)
	for _shown in (bad_text, ftfy_text, result):
	    print(_shown)