Last active
December 31, 2021 19:21
-
-
Save chris-hailstorm/4989643 to your computer and use it in GitHub Desktop.
Unicode to ASCII / UTF-8 converter for Python dicts, lists, strings and nested combinations of dicts, lists and strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def asciify(data):
    """
    SYNOPSIS
        Asciifies strings, lists and dicts, and nested versions of same

    DESCRIPTION
        The JSON spec (http://www.ietf.org/rfc/rfc4627.txt) -- "JSON text SHALL
        be encoded in Unicode".  For apps that don't use unicode, this function
        walks through all levels of a JSON data structure and converts each
        text item to ASCII.  Accented letters are replaced by their unaccented
        approximation (NFKD decomposition) and any character that still has no
        ASCII equivalent is dropped.
        See http://stackoverflow.com/questions/956867/ for original.

        Can be used for any nesting of strings / lists / dicts, e.g. a list of
        dicts, a dict in which values are lists of strings etc.  See LIMITATIONS.

        Works on both Python 2 and Python 3.  Converted text is returned as the
        interpreter's native ``str`` type: a byte string on Python 2, a text
        string containing only ASCII characters on Python 3.

    PARAMETERS
        data    A string, unicode, list or dict, or nested versions of the
                same types.  Typically the string output from json.dumps()
                or the dict resulting from json.load() or json.loads().

    RETURNS
        An object of the same kind as the input (dict, list or string) in
        which every text key and value has been converted to ASCII.  Lists and
        dicts are rebuilt as new plain ``list`` / ``dict`` objects; the input
        is not modified.  Byte strings are returned unchanged.

    RAISES
        TypeError   if the top-level input is not a dict, list, str or unicode.

    USAGE
        There are several ways to use this function.

        (1) asciify string version of data structure before creating dict:

            s = json.dumps(x)
            d = json.loads(asciify(s))

        (2) create dict from string version of data structure, then asciify:

            s = json.dumps(x)
            d = json.loads(s)
            d = asciify(d)

        (3) asciify as the dict is being created via object hook:

            s = json.dumps(x)
            d = json.loads(s, object_hook=asciify)

        Approach (1) works on a flat string, so there is no possibility of the
        depth traversal stopping due to an unknown type (see LIMITATIONS).
        Note, however, that on Python 2 json.loads() returns unicode objects
        even when given an ASCII str (see ``unidict`` in EXAMPLES), so use
        approach (2) or (3) when the resulting dict itself must contain byte
        strings.

    EXAMPLES
        (Recorded in a Python 2 interpreter session.  On Python 3 the type
        reprs, the ``print`` statement, ``unichr`` and dict ordering differ.)

        >>> import json

        >>> s1 = 'ASCII string'
        >>> type(s1)
        <type 'str'>
        >>> s1 = asciify(s1)
        >>> type(s1)
        <type 'str'>

        >>> s2 = u'Unicode string'
        >>> type(s2)
        <type 'unicode'>
        >>> s2 = asciify(s2)
        >>> type(s2)
        <type 'str'>

        >>> s3 = 'Nestl'+unichr(0xe9)
        >>> print asciify(s3)
        Nestle

        >>> asciify(['a','b','c'])
        ['a', 'b', 'c']
        >>> asciify([u'a',u'b',u'c'])
        ['a', 'b', 'c']

        >>> asciify({'a':'aa','b':'bb','c':'cc'})
        {'a': 'aa', 'c': 'cc', 'b': 'bb'}
        >>> asciify({u'a':'aa','b':u'bb',u'c':u'cc'})
        {'a': 'aa', 'c': 'cc', 'b': 'bb'}

        >>> d = dict(a='a1',b='b2',c=dict(d='d3',e=['e4','e5','e6'],f=dict(g='g7')),h=[8,9,10])
        >>> print d
        {'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
        >>> print type(d)
        <type 'dict'>

        >>> asciistr = json.dumps(d)
        >>> print asciistr
        {"a": "a1", "h": [8, 9, 10], "c": {"e": ["e4", "e5", "e6"], "d": "d3", "f": {"g": "g7"}}, "b": "b2"}
        >>> print type(asciistr)
        <type 'str'>

        >>> unidict = json.loads(asciistr)
        >>> print unidict
        {u'a': u'a1', u'h': [8, 9, 10], u'c': {u'e': [u'e4', u'e5', u'e6'], u'd': u'd3', u'f': {u'g': u'g7'}}, u'b': u'b2'}
        >>> print type(unidict)
        <type 'dict'>
        >>> unidict == d
        True

        >>> asciidict1 = asciify(unidict)
        >>> print asciidict1
        {'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
        >>> print type(asciidict1)
        <type 'dict'>
        >>> asciidict1 == d
        True

        >>> asciidict2 = json.loads(asciistr, object_hook=asciify)
        >>> print asciidict2
        {'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
        >>> print type(asciidict2)
        <type 'dict'>
        >>> asciidict2 == d
        True

    LIMITATIONS
        For a multi-layered data structure (dict of lists, list of strings etc.)
        depth traversal of the data structure stops when the element encountered
        is not a string, unicode, list or dict.  For example, in this dict:

            > d = {'a': {'b': [1, 2, set([u'x', u'y'])], 'c': u'z'}}

        ... the u'x' and u'y' items are contained within a set, and therefore
        would not be asciified, while u'z' is contained in a dict and would be
        asciified since the traversal of the rest of the structure continues.

        A future @@todo could be to throw an error if a non-traversable input
        is used, or have additional parameter that can allow the non-traversable
        input to be used even though the result is a partial discard of data.
    """
    # Imported locally so the function is self-contained; the module is only
    # needed for the NFKD decomposition below.
    import unicodedata

    # ``unicode`` only exists on Python 2; on Python 3 all text is ``str``.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    ##
    ## embedded functions
    ##

    ## see http://stackoverflow.com/a/517974
    def _remove_accents(text):
        """
        Changes accented letters to non-accented approximation, like Nestle
        """
        # NFKD splits an accented letter into base letter + combining mark;
        # encoding with 'ignore' then drops every non-ASCII code point.
        stripped = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        if isinstance(stripped, str):
            # Python 2: the encoded byte string already is the native str.
            return stripped
        # Python 3: turn the pure-ASCII bytes back into a native str.
        return stripped.decode('ascii')

    def _convert(item):
        """ Ascii-fies one item, recursing into lists and dicts """
        if isinstance(item, text_type):
            return _remove_accents(item)
        if isinstance(item, list):
            return [_convert(element) for element in item]
        if isinstance(item, dict):
            # Keys are hashable, so for them _convert only ever acts on text.
            return {_convert(key): _convert(value)
                    for key, value in item.items()}
        # Anything else (numbers, None, byte strings, sets, ...) is left
        # untouched -- see LIMITATIONS.
        return item

    ##
    ## main function
    ##
    if isinstance(data, (text_type, list, dict)):
        return _convert(data)
    if isinstance(data, bytes):
        # Python 2 ``str`` (an alias of ``bytes``) passes through unchanged.
        return data
    raise TypeError('Input must be dict, list, str or unicode')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment