Skip to content

Instantly share code, notes, and snippets.

@kmike
Created October 29, 2012 19:32
Show Gist options
  • Save kmike/3975973 to your computer and use it in GitHub Desktop.
Save kmike/3975973 to your computer and use it in GitHub Desktop.
NLTK unicode proposal
# Pick the best available implementation of ``transliterate``: a function
# that turns unicode text into an ASCII-only encoded string.
try:
    # Old versions of unidecode are licensed under the Artistic License
    # (not GPL), so it may be OK for NLTK to rely on them.
    from unidecode import unidecode
except ImportError:
    try:
        # text-unidecode is another Text::Unidecode port that is licensed
        # under the Artistic License; its implementation is worse than
        # unidecode's, which is why unidecode is tried first.
        from text_unidecode import unidecode
    except ImportError:
        # Last resort: this version only strips accents. That may be fine
        # for some European languages, but it can return empty strings
        # for scripts such as Cyrillic.
        import unicodedata

        def transliterate(text):
            """Decompose ``text`` (NFKD) and drop every non-ASCII character."""
            decomposed = unicodedata.normalize('NFKD', text)
            return decomposed.encode('ascii', 'ignore')
    else:
        def transliterate(txt):
            """Transliterate ``txt`` to ASCII using text-unidecode."""
            ascii_text = unidecode(txt)
            return ascii_text.encode('ascii')
else:
    def transliterate(txt):
        """Transliterate ``txt`` to ASCII using unidecode."""
        ascii_text = unidecode(txt)
        return ascii_text.encode('ascii')
def to_7bit(text):
    """
    Encode unicode ``text`` to ASCII, replacing every non-ASCII character
    with a backslash escape sequence (e.g. ``\\xe9`` or ``\\u0416``).

    Unlike ``transliterate`` no character is dropped, so the result stays
    unambiguous; this is what makes it suitable for ``__repr__`` output.
    """
    # The error handler is passed positionally on purpose: ``encode()``
    # accepts keyword arguments only since Python 2.7, and this helper is
    # needed precisely on the Python 2 code path.
    return text.encode('ascii', 'backslashreplace')
def python_2_unicode_compatible(klass):
    """
    To support Python 2 and 3 with a single code base, define __str__
    and __repr__ methods returning unicode text and apply this decorator
    to the class. This decorator fixes __repr__ and __str__ methods
    under Python 2; under Python 3 no wrappers are installed.

    Original __repr__ and __str__ would be available as
    _unicode_repr and __unicode__ (under both Python versions).

    Under Python 2 the new __str__ passes the unicode text through
    ``transliterate`` and the new __repr__ passes it through ``to_7bit``.

    A method that is already a wrapper installed by this decorator (i.e.
    one inherited from a decorated parent class and not overridden) is
    left untouched. Wrapping it again would store the wrapper itself as
    __unicode__ / _unicode_repr, and the wrapper would then call itself
    without end.

    The class is modified in place and returned.
    """
    def _is_wrapper(method):
        # Python 2 unbound methods forward attribute reads to the
        # underlying function, so the marker set below is visible when
        # the method is looked up on the class. Builtin slot wrappers
        # such as object.__repr__ simply lack the attribute.
        return getattr(method, '_unicode_compat_wrapper', False)

    if not _is_wrapper(klass.__repr__):
        klass._unicode_repr = klass.__repr__
        if not compat.PY3:
            def __repr__(self):
                return to_7bit(self._unicode_repr())
            __repr__._unicode_compat_wrapper = True
            klass.__repr__ = __repr__

    if not _is_wrapper(klass.__str__):
        klass.__unicode__ = klass.__str__
        if not compat.PY3:
            def __str__(self):
                return transliterate(self.__unicode__())
            __str__._unicode_compat_wrapper = True
            klass.__str__ = __str__

    return klass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment