Skip to content

Instantly share code, notes, and snippets.

@mitchellrj
Created February 21, 2012 15:58
Show Gist options
  • Save mitchellrj/1877134 to your computer and use it in GitHub Desktop.
How to beat unicode
# http://pypi.python.org/pypi/chardet
import chardet
# List of your preferred charsets, or detect those available automatically (see below).
# Order matters: list them from simplest to most general. get_charset() walks
# this list front-to-back when looking for a charset that can encode a value,
# and back-to-front when trying to decode a byte string.
CHARSETS = ['ascii', 'latin_1', 'utf_8']
def get_charset(string):
    """Return the simplest charset from CHARSETS that fits ``string``.

    For a unicode object the result is the first entry of CHARSETS that can
    encode it.

    For a byte string the bytes are first decoded (using chardet's guess when
    it names a charset listed in CHARSETS, otherwise by trying CHARSETS from
    last to first).  The result is then the first entry of CHARSETS that
    encodes the decoded text back to exactly the original bytes, so decoding
    the input with the returned charset reproduces that text.  If no entry
    round-trips, the charset that was used for decoding is returned.

    Returns None if ``string`` is neither a unicode object nor a byte
    string, if a byte string cannot be decoded with any charset in CHARSETS,
    or if a unicode object cannot be encoded with any of them.
    """
    # Local import so this function does not depend on the file's import block.
    import codecs

    def canonical(name):
        # Collapse codec aliases ('utf_8', 'UTF-8', 'latin_1', 'ISO-8859-1',
        # ...) onto a single spelling; chardet's names rarely match the
        # spellings used in CHARSETS verbatim.
        try:
            return codecs.lookup(name).name
        except LookupError:
            return None

    if isinstance(string, unicode):
        text = string
        raw = None
        decoded_with = None
    elif isinstance(string, basestring):
        raw = string

        # Map canonical codec names back to the spelling used in CHARSETS.
        by_canonical = {}
        for name in CHARSETS:
            by_canonical.setdefault(canonical(name), name)
        by_canonical.pop(None, None)

        # Most general charset first, so that e.g. UTF-8 is attempted before
        # latin_1 (which accepts any byte sequence).
        candidates = list(reversed(CHARSETS))
        detected = chardet.detect(raw)['encoding']
        if detected:
            guessed = by_canonical.get(canonical(detected))
            if guessed is not None:
                candidates.insert(0, guessed)

        for decoded_with in candidates:
            try:
                text = unicode(raw, decoded_with)
            except (LookupError, UnicodeError):
                # A wrong guess or an unsuitable charset: try the next one.
                continue
            break
        else:
            return None
    else:
        return None

    for charset in CHARSETS:
        try:
            encoded = text.encode(charset)
        except (LookupError, UnicodeError):
            continue
        # For byte input the charset must reproduce the original bytes;
        # merely being able to encode the text is not enough (UTF-8 bytes
        # must not be reported as latin_1).
        if raw is None or encoded == raw:
            return charset

    # Nothing round-tripped: fall back to the charset that decoded the bytes
    # (None for unicode input).
    return decoded_with
def safe_unicode(string):
    """Return ``(value, charset)`` with ``value`` decoded to unicode if possible.

    ``charset`` is whatever get_charset() reports for ``string``.  A byte
    string is decoded with that charset; a unicode object, a non-string
    value, or any input for which no charset was found is passed through
    unchanged.
    """
    charset = get_charset(string)
    is_byte_string = (isinstance(string, basestring)
                      and not isinstance(string, unicode))
    if is_byte_string and charset is not None:
        return (unicode(string, charset), charset)
    return (string, charset)
def safe_bytestring(string):
    """Return ``(value, charset)`` with ``value`` encoded to bytes if possible.

    ``charset`` is whatever get_charset() reports for ``string``.  A unicode
    object is encoded with that charset; any other input, or a unicode
    object for which no charset was found, is passed through unchanged.
    """
    charset = get_charset(string)
    if charset is None or not isinstance(string, unicode):
        # Nothing to encode, or no charset to encode it with.
        return (string, charset)
    return (string.encode(charset), charset)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment