#!/usr/bin/env python3
from __future__ import annotations

import json


class CompactJSONEncoder(json.JSONEncoder):
    """A JSON Encoder that puts small containers on single lines."""

    CONTAINER_TYPES = (list, tuple, dict)
    """Container datatypes include primitives or other containers."""

    MAX_WIDTH = 70
    """Maximum width of a container that might be put on a single line."""

    MAX_ITEMS = 10
    """Maximum number of items in container that might be put on single line."""

    def __init__(self, *args, **kwargs):
        # using this class without indentation is pointless
        if kwargs.get("indent") is None:
            kwargs["indent"] = 4
        super().__init__(*args, **kwargs)
        self.indentation_level = 0

    def encode(self, o):
        """Encode JSON object *o* with respect to single line lists."""
        if isinstance(o, (list, tuple)):
            return self._encode_list(o)
        if isinstance(o, dict):
            return self._encode_object(o)
        if isinstance(o, float):  # Use scientific notation for floats
            return format(o, "g")
        return json.dumps(
            o,
            skipkeys=self.skipkeys,
            ensure_ascii=self.ensure_ascii,
            check_circular=self.check_circular,
            allow_nan=self.allow_nan,
            sort_keys=self.sort_keys,
            indent=self.indent,
            separators=(self.item_separator, self.key_separator),
            default=self.default if hasattr(self, "default") else None,
        )

    def _encode_list(self, o):
        if self._put_on_single_line(o):
            return "[" + ", ".join(self.encode(el) for el in o) + "]"
        self.indentation_level += 1
        output = [self.indent_str + self.encode(el) for el in o]
        self.indentation_level -= 1
        return "[\n" + ",\n".join(output) + "\n" + self.indent_str + "]"

    def _encode_object(self, o):
        if not o:
            return "{}"
        # ensure keys are converted to strings
        o = {str(k) if k is not None else "null": v for k, v in o.items()}
        if self.sort_keys:
            o = dict(sorted(o.items(), key=lambda x: x[0]))
        if self._put_on_single_line(o):
            return (
                "{ "
                + ", ".join(
                    f"{json.dumps(k)}: {self.encode(el)}" for k, el in o.items()
                )
                + " }"
            )
        self.indentation_level += 1
        output = [
            f"{self.indent_str}{json.dumps(k)}: {self.encode(v)}" for k, v in o.items()
        ]
        self.indentation_level -= 1
        return "{\n" + ",\n".join(output) + "\n" + self.indent_str + "}"

    def iterencode(self, o, **kwargs):
        """Required to also work with `json.dump`."""
        return self.encode(o)

    def _put_on_single_line(self, o):
        return (
            self._primitives_only(o)
            and len(o) <= self.MAX_ITEMS
            and len(str(o)) - 2 <= self.MAX_WIDTH
        )

    def _primitives_only(self, o: list | tuple | dict):
        if isinstance(o, (list, tuple)):
            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o)
        elif isinstance(o, dict):
            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o.values())

    @property
    def indent_str(self) -> str:
        if isinstance(self.indent, int):
            return " " * (self.indentation_level * self.indent)
        elif isinstance(self.indent, str):
            return self.indentation_level * self.indent
        else:
            raise ValueError(
                f"indent must either be of type int or str (is: {type(self.indent)})"
            )


if __name__ == "__main__":
    import sys

    if "--example" in sys.argv:
        data = {
            "compact_object": {"first": "element", "second": 2},
            "compact_list": ["first", "second"],
            "long_list": [
                "this",
                "is",
                "a",
                "rather",
                "long\nlist",
                "and should be broken up because of its width",
            ],
            "non_ascii": "汉语",
            1: 2,
        }
        json.dump(data, sys.stdout, cls=CompactJSONEncoder, ensure_ascii=False)
        exit()
    json.dump(json.load(sys.stdin), sys.stdout, cls=CompactJSONEncoder)
Greetings! I -think- this might be exactly what I'm looking for. From the looks of it, however, I'm going to spend a day or two, or more, understanding it, learning a bunch more Python in the process. A few examples of its use would go a long way. I guess I'll post some when I figure it out :-P ;-)
@mikethe1wheelnut did you ever find out how to use this? All I'm essentially trying to do is iterate over a list of files, open them with json.load(), and immediately write the data back out via json.dump() using this encoder.
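For reference, that round trip might look something like the sketch below. It assumes the gist has been saved next to the script as CompactJSONEncoder.py, and the list of paths is just a placeholder:

import json
from pathlib import Path

from CompactJSONEncoder import CompactJSONEncoder  # the gist, saved locally

paths = [Path("a.json"), Path("b.json")]  # placeholder list of files

for path in paths:
    data = json.loads(path.read_text())  # read the original JSON
    # write it back out, compacted by the custom encoder
    path.write_text(json.dumps(data, cls=CompactJSONEncoder, ensure_ascii=False))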
@jannismain maybe this would be a good how-to code snippet for all Python noobs out there (me included ;)
@mikethe1wheelnut @phschimm I have added an example of how to dump dictionaries to JSON using this encoder. Running the file with the --example option generates the example output:
$ python3 CompactJSONEncoder.py --example
{
    "compact_object": { "first": "element", "second": 2 },
    "compact_list": ["first", "second"],
    "long_list": [
        "this",
        "is",
        "a",
        "rather",
        "long",
        "list"
    ]
}
@keithzg Ahh, I see what the problem is: json.dump accepts indent given as either int or str, while my encoder assumed an int to multiply my indentation level with.

I have reworked it so indent can be provided both as int and str. Providing indent="\t" works as expected now.
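A quick sketch of the two accepted forms (data stands in for any serializable object):

json.dumps(data, cls=CompactJSONEncoder, indent=2)     # indent with two spaces per level
json.dumps(data, cls=CompactJSONEncoder, indent="\t")  # indent with one tab per level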
elif isinstance(o, str):  # escape newlines
    o = o.replace("\n", "\\n")
    return f'"{o}"'

Not only newlines need to be escaped; I think return json.dumps(o) is better here.
Beware of the ensure_ascii argument, or CJK characters will be printed as unicode escapes.
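To illustrate both points with the standard library alone (just a sketch, not taken from the encoder itself): json.dumps escapes quotes, backslashes and control characters as well as newlines, and ensure_ascii=False keeps non-ASCII characters readable:

import json

s = 'a "quoted" string with a tab\t and a newline\n'
print(json.dumps(s))
# "a \"quoted\" string with a tab\t and a newline\n"

print(json.dumps({"non_ascii": "汉语"}))                      # {"non_ascii": "\u6c49\u8bed"}
print(json.dumps({"non_ascii": "汉语"}, ensure_ascii=False))  # {"non_ascii": "汉语"}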
@PenutChen I'm not sure why I didn't do that in the first place. 😅 Thanks for the hint!
@jannismain, what is the license of this code? (just making sure I can reuse it)
Note: sort_keys=True when calling dumps(..., sort_keys=True, indent=2, cls=CompactJSONEncoder) doesn't seem to work. Indent does work, though.
@jannismain, what is the license of this code? (just making sure I can reuse it)
Thanks for asking! Feel free to reuse it in any way you want. Consider its license as MIT.
Note: sort_keys=True when calling dumps(..., sort_keys=True, indent=2, cls=CompactJSONEncoder) doesn't seem to work. Indent does work, though.
I'll have a look at sorting when I get a chance. If somebody knows how to preserve the sorting feature with this way of custom encoding in the meantime, let me know 🙂
@jannismain maybe it could be done this way in _encode_object?

items = [(k, el) for k, el in o.items()]
if self.sort_keys:
    items = sorted(items)
# (... for k, el in items)
For me, inserting the following branch in L31 does the trick:

if self.sort_keys:
    o = dict(sorted(o.items()))
@jannismain
Excuse me, can you tell me how to use this? I have a small script that writes data from a binary file to json. How should I use it with your compact version?
My script:
def readInt(file, size):
    return int.from_bytes(file.read(size), "little")

with open("club.dat", 'rb') as datFile:
    datFile.read(8)
    size = readInt(datFile, 4)
    clubs = []
    for i in range(size):
        print(f"#{i + 1}/{size} - {hex(datFile.tell())}")
        club = {}
        club['uid'] = readInt(datFile, 4)
        club['player'] = [(readInt(datFile, 4)) for i in range(readInt(datFile, 2))]
        club['mainClubId'] = readInt(datFile, 4)
        club['isNational'] = readInt(datFile, 2)
        clubs.append(club)

with open("clubs.json", 'wt') as jsonFile:
    json.dump(clubs, jsonFile, ensure_ascii=False, indent=2)
@Royal724 Line 118 is what you are looking for:
json.dump(data, stream, cls=CompactJSONEncoder)
@jannismain
Thank you for your response. I downloaded your module and copied it to the same folder as my script, then made the changes below, but it didn't work: my JSON file stays as it is, no compact format. I'm sorry, I'm making a mistake somewhere, but I can't figure out where, as I don't have much experience.
It also created a new __pycache__ folder next to my script, containing the file CompactJSONEncoder.cpython-311.pyc.
from CompactJSONEncoder import CompactJSONEncoder
json.dump(clubs, jsonFile, cls=CompactJSONEncoder)
@Royal724 maybe your JSON data has containers that are too wide or contain too many items; try setting MAX_ITEMS or MAX_WIDTH to a larger number.
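One way to do that without editing the gist is to subclass it and override the two class attributes; RoomierEncoder is just a made-up name for this sketch:

import json
from CompactJSONEncoder import CompactJSONEncoder  # the gist, saved locally

class RoomierEncoder(CompactJSONEncoder):
    # Allow wider and longer containers to stay on a single line.
    MAX_WIDTH = 120
    MAX_ITEMS = 50

# then, reusing the variables from the script above:
# json.dump(clubs, jsonFile, cls=RoomierEncoder, ensure_ascii=False, indent=2)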
@PenutChen
Yes, that was the point, thank you.
@jannismain
Excellent work! Thank you so much!
There's a small bug: if the keys of a dict are integers or floats, they don't get converted to strings. Perhaps adding an inner str() might be a solution…
There's a small bug: if the keys of a dict are integers or floats, they don't get converted to strings. Perhaps adding an inner str() might be a solution…
You are right, I didn’t even think to treat the keys in any way.. will push a revision soon to address this! 👍
@olin256 I'm now converting keys to string to ensure the output produced is valid JSON.
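A quick check of the new behaviour (sketch, assuming the gist is importable as CompactJSONEncoder):

import json
from CompactJSONEncoder import CompactJSONEncoder

print(json.dumps({1: 2, 3.5: "x"}, cls=CompactJSONEncoder))
# { "1": 2, "3.5": "x" }  -- keys are now quoted strings, so the output is valid JSON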
@oesteban @PenutChen sort_keys=True is now supported 👍
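For example (sketch):

import json
from CompactJSONEncoder import CompactJSONEncoder

print(json.dumps({"b": 1, "a": 2}, cls=CompactJSONEncoder, sort_keys=True))
# { "a": 2, "b": 1 }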
A little suggestion to also correctly process NumPy types (requires import numpy as np at the top of the file):

def encode(self, o):
    """Encode JSON object *o* with respect to single line lists."""
    if isinstance(o, (list, tuple)):
        return self._encode_list(o)
    elif isinstance(o, dict):
        return self._encode_object(o)
    if isinstance(o, float):  # Use scientific notation for floats
        return format(o, "g")
    elif isinstance(o, np.integer):  # process numpy integers
        return self.encode(int(o))
    elif isinstance(o, np.floating):  # process numpy floats
        return self.encode(float(o))
    elif isinstance(o, np.ndarray):  # flatten numpy arrays as lists
        return self._encode_list(o.tolist())
    return json.dumps(
        o,
        skipkeys=self.skipkeys,
        ensure_ascii=self.ensure_ascii,
        check_circular=self.check_circular,
        allow_nan=self.allow_nan,
        sort_keys=self.sort_keys,
        indent=self.indent,
        separators=(self.item_separator, self.key_separator),
        default=self.default if hasattr(self, "default") else None,
    )
Also, personally, I would remove the format(o, "g") part, as it sometimes results in converting floats into exponential notation.
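A small illustration of that concern (nothing encoder-specific, just how "g" formatting behaves): it switches to exponential notation for small magnitudes and keeps only six significant digits by default, whereas json.dumps preserves the full float repr:

import json

format(0.00001234, "g")     # '1.234e-05'   -> exponential notation
format(123456789.123, "g")  # '1.23457e+08' -> only 6 significant digits
json.dumps(123456789.123)   # '123456789.123'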
@keithzg I'm wondering whether indent=r'\t' would work 🤔
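Probably not quite: r'\t' is a raw string containing a backslash followed by the letter t, not a tab character, so the encoder would indent with those two literal characters, while indent="\t" gives real tabs. A tiny check:

print(len("\t"), repr("\t"))    # 1 '\t'   -> a single tab character
print(len(r"\t"), repr(r"\t"))  # 2 '\\t'  -> a backslash followed by 't'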