Created
February 21, 2012 15:58
-
-
Save mitchellrj/1877134 to your computer and use it in GitHub Desktop.
How to beat unicode
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://pypi.python.org/pypi/chardet | |
import chardet | |
# Preferred charsets, ordered from simplest to most general: list your own
# here, or detect those available automatically (see below).
CHARSETS = ['ascii', 'latin_1', 'utf_8'] | |
def _canonical_codec_name(name):
    """Return the codec registry's canonical name for ``name``, or None."""
    import codecs

    try:
        return codecs.lookup(name).name
    except LookupError:
        return None


def get_charset(string):
    """Return the simplest charset in CHARSETS that represents ``string``.

    For a unicode object the result is the first entry of CHARSETS that
    can encode it.  For a byte string the value is first decoded (using
    chardet's guess when that guess is one of CHARSETS, otherwise by
    trying CHARSETS from last to first) and the result is the first entry
    of CHARSETS whose encoding of the decoded text reproduces the original
    bytes, so the returned charset is valid both for decoding the input
    and for re-encoding it.

    Returns None if ``string`` is neither a byte string nor a unicode
    object, if it cannot be decoded, or if no entry of CHARSETS fits.
    """
    if isinstance(string, unicode):
        original_bytes = None
        text = string
    elif isinstance(string, basestring):
        original_bytes = string
        text = None

        # chardet reports names such as 'utf-8' or 'ISO-8859-1' while
        # CHARSETS may spell them 'utf_8' or 'latin_1'; compare canonical
        # codec names instead of raw strings.
        preferred = set(_canonical_codec_name(name) for name in CHARSETS)
        guess = chardet.detect(original_bytes)['encoding']
        guess_canonical = _canonical_codec_name(guess) if guess else None

        candidates = []
        if guess_canonical is not None and guess_canonical in preferred:
            candidates.append(guess)
        # Fall back to the preferred charsets from last to first, so that
        # with the default list UTF-8 is attempted before latin-1, which
        # accepts any byte sequence.
        candidates.extend(CHARSETS[::-1])

        for candidate in candidates:
            try:
                text = unicode(original_bytes, candidate)
            except (LookupError, UnicodeError):
                # A wrong guess or unknown codec name: try the next one.
                continue
            break

        if text is None:
            return None
    else:
        return None

    for candidate in CHARSETS:
        try:
            encoded = text.encode(candidate)
        except (LookupError, UnicodeError):
            continue
        # For byte-string input the charset must reproduce the original
        # bytes; otherwise decoding the input with it would yield
        # different text (e.g. UTF-8 bytes read as latin-1).
        if original_bytes is None or encoded == original_bytes:
            return candidate

    return None
def safe_unicode(string):
    """Return ``(value, charset)`` with ``value`` as unicode where possible.

    ``charset`` is whatever get_charset() reports for the input.  A byte
    string is decoded with that charset when one was found; unicode
    objects, non-string values and undecodable byte strings are returned
    unchanged.
    """
    charset = get_charset(string)
    is_byte_string = (
        isinstance(string, basestring) and not isinstance(string, unicode)
    )
    # Nothing to decode: no usable charset, or the value is not bytes.
    if charset is None or not is_byte_string:
        return (string, charset)
    return (unicode(string, charset), charset)
def safe_bytestring(string):
    """Return ``(value, charset)`` with ``value`` as a byte string where possible.

    ``charset`` is whatever get_charset() reports for the input.  A
    unicode object is encoded with that charset when one was found; any
    other value is returned unchanged.
    """
    charset = get_charset(string)
    # Nothing to encode: no usable charset, or the value is not unicode.
    if charset is None or not isinstance(string, unicode):
        return (string, charset)
    return (string.encode(charset), charset)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment