#!/usr/bin/env python3
from __future__ import annotations

import json


class CompactJSONEncoder(json.JSONEncoder):
    """A JSON Encoder that puts small containers on single lines."""

    CONTAINER_TYPES = (list, tuple, dict)
    """Container datatypes include primitives or other containers."""

    MAX_WIDTH = 70
    """Maximum width of a container that might be put on a single line."""

    MAX_ITEMS = 10
    """Maximum number of items in a container that might be put on a single line."""

    def __init__(self, *args, **kwargs):
        # using this class without indentation is pointless
        if kwargs.get("indent") is None:
            kwargs["indent"] = 4
        super().__init__(*args, **kwargs)
        self.indentation_level = 0

    def encode(self, o):
        """Encode JSON object *o* with respect to single line lists."""
        if isinstance(o, (list, tuple)):
            return self._encode_list(o)
        if isinstance(o, dict):
            return self._encode_object(o)
        if isinstance(o, float):  # use the short general ("g") format for floats
            return format(o, "g")
        return json.dumps(
            o,
            skipkeys=self.skipkeys,
            ensure_ascii=self.ensure_ascii,
            check_circular=self.check_circular,
            allow_nan=self.allow_nan,
            sort_keys=self.sort_keys,
            indent=self.indent,
            separators=(self.item_separator, self.key_separator),
            default=self.default if hasattr(self, "default") else None,
        )

    def _encode_list(self, o):
        if self._put_on_single_line(o):
            return "[" + ", ".join(self.encode(el) for el in o) + "]"
        self.indentation_level += 1
        output = [self.indent_str + self.encode(el) for el in o]
        self.indentation_level -= 1
        return "[\n" + ",\n".join(output) + "\n" + self.indent_str + "]"

    def _encode_object(self, o):
        if not o:
            return "{}"
        # ensure keys are converted to strings
        o = {str(k) if k is not None else "null": v for k, v in o.items()}
        if self.sort_keys:
            o = dict(sorted(o.items(), key=lambda x: x[0]))
        if self._put_on_single_line(o):
            return (
                "{ "
                + ", ".join(
                    f"{json.dumps(k)}: {self.encode(el)}" for k, el in o.items()
                )
                + " }"
            )
        self.indentation_level += 1
        output = [
            f"{self.indent_str}{json.dumps(k)}: {self.encode(v)}" for k, v in o.items()
        ]
        self.indentation_level -= 1
        return "{\n" + ",\n".join(output) + "\n" + self.indent_str + "}"

    def iterencode(self, o, **kwargs):
        """Required to also work with `json.dump`."""
        return self.encode(o)

    def _put_on_single_line(self, o):
        return (
            self._primitives_only(o)
            and len(o) <= self.MAX_ITEMS
            and len(str(o)) - 2 <= self.MAX_WIDTH
        )

    def _primitives_only(self, o: list | tuple | dict):
        if isinstance(o, (list, tuple)):
            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o)
        elif isinstance(o, dict):
            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o.values())

    @property
    def indent_str(self) -> str:
        if isinstance(self.indent, int):
            return " " * (self.indentation_level * self.indent)
        elif isinstance(self.indent, str):
            return self.indentation_level * self.indent
        else:
            raise ValueError(
                f"indent must either be of type int or str (is: {type(self.indent)})"
            )


if __name__ == "__main__":
    import sys

    if "--example" in sys.argv:
        data = {
            "compact_object": {"first": "element", "second": 2},
            "compact_list": ["first", "second"],
            "long_list": [
                "this",
                "is",
                "a",
                "rather",
                "long\nlist",
                "and should be broken up because of its width",
            ],
            "non_ascii": "汉语",
            1: 2,
        }
        json.dump(data, sys.stdout, cls=CompactJSONEncoder, ensure_ascii=False)
        sys.exit()
    json.dump(json.load(sys.stdin), sys.stdout, cls=CompactJSONEncoder)
I love it, thanks!
One thing though: it fails if you don't specify indent, i.e. it only works when constructed like encoder = CompactJSONEncoder(indent=2).
Thanks for pointing that out :-) I have fixed that in the most recent version.
Thank you for posting this! I would like the same thing, slightly fewer newlines in a very large JSON output. I'm actually using indent=0 to keep the whitespace at a minimum. I just don't care for a multi-megabyte JSON file with a single newline waaaaaaaaay at the end, it's too hostile to grep & other command-line friends.
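For what it's worth, here is a rough sketch of what indent=0 gives you with this encoder (made-up data; it assumes the CompactJSONEncoder class from this gist is in scope). Every element still gets its own line, just without leading whitespace, so grep still sees one item per line:

import json

print(json.dumps({"files": ["a.txt", "b.txt", {"nested": 1}]},
                 cls=CompactJSONEncoder, indent=0))
# {
# "files": [
# "a.txt",
# "b.txt",
# { "nested": 1 }
# ]
# }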
I just learned today that json.dump() and json.dumps() behave slightly differently. The dump method calls encoder method iterencode(); the dumps method calls encoder method encode(). Maybe @svyatoslav-ps was passing this custom class to the json.dump() method?
Anyhow, I tried using your CompactJSONEncoder class in Python 3.8 and passing it to the json.dump method. I renamed encode() to iterencode(); that method is called with a keyword argument _one_shot=True, so I changed the signature to def iterencode(self, o, **kwargs):, which accepts but ignores that _one_shot argument. With that change the code on this page yields a compact format! Unfortunately for me it does not honor the sort_keys parameter.
Preserving most JSON encoder behaviors while making the output compact seems to be quite a challenge!
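For anyone who wants to verify both entry points, a minimal sketch (made-up data; assumes the class from this gist with the iterencode override described above):

import io
import json

data = {"compact": [1, 2, 3]}

# json.dumps() goes through encode() ...
compact_string = json.dumps(data, cls=CompactJSONEncoder)

# ... while json.dump() goes through iterencode(), which is why the override
# that simply delegates to encode() is needed for file output as well
buffer = io.StringIO()
json.dump(data, buffer, cls=CompactJSONEncoder)

assert compact_string == buffer.getvalue()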
I have recently stumbled upon the same difference in behavior between json.dump and json.dumps. My solution was to refactor my json logic into json.dumps calls, but of course this is not ideal for a more general solution.
I have updated my version to include your iterencode suggestion. I'm not sure in which cases this might break, but it at least works for the simplest use cases (dumping json to a file all at once).
I have also looked into the sort_keys option being ignored. There is probably no way around this apart from re-implementing sorting in the custom encode function, as we never call the parent's encode method for collection types. For my own use case this is not required right now, but if you find yourself needing the sort_keys option I would be interested in your solution :-)
Greetings! I -think- this might be exactly what I'm looking for. From the looks of it however, I'm going to spend a day or two, or more, understanding it, learning a bunch more python in the process. A few examples of its use would go a long way. ... I guess I'll post some when I figure it out :-P ;-)
Allllllmost exactly what I'm looking for! However, as a diehard tabs partisan, it is unfortunate that I get TypeError: can't multiply sequence by non-int of type 'str' if I try to use indent='\t', and I might have to dig in and kludge together a way to make that work. Still, for my uses, falling back to dirty dirty spaces is an improvement over the stark choice of either newlines everywhere or a single compact blob!
@keithzg I'm wondering whether indent=r'\t' would work 🤔
> Greetings! I -think- this might be exactly what I'm looking for. From the looks of it however, I'm going to spend a day or two, or more, understanding it, learning a bunch more python in the process. A few examples of its use would go a long way. ... I guess I'll post some when I figure it out :-P ;-)
@mikethe1wheelnut did you ever find out how to use this? All I'm essentially trying to do is iterate over a list of files, open them with json.load(), and immediately write the data again via json.dump() using this encoder.
@jannismain maybe this would be a good how-to code snippet for all Python noobs out there (me included ;)
@mikethe1wheelnut @phschimm I have added an example of how to dump dictionaries to json using this encoder. Running the file with the --example option generates the example output:
$ python3 CompactJSONEncoder.py --example
{
    "compact_object": { "first": "element", "second": 2 },
    "compact_list": ["first", "second"],
    "long_list": [
        "this",
        "is",
        "a",
        "rather",
        "long",
        "list"
    ]
}
@keithzg Ahh, I see what the problem is: json.dump works with indent parameters given as int and str, while my encoder assumed an int to multiply the indentation level with.
I have reworked it so indent can be provided both as int and str. Providing indent="\t" works as expected now.
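A quick usage sketch with the reworked indent handling (made-up data):

print(json.dumps({"tabs": {"work": "now"}}, cls=CompactJSONEncoder, indent="\t"))
# the nested object stays on one line, indented with a single tab:
# {
# 	"tabs": { "work": "now" }
# }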
elif isinstance(o, str):  # escape newlines
    o = o.replace("\n", "\\n")
    return f'"{o}"'
Not only newlines need to be escaped; I think return json.dumps(o) is better. Beware of the ensure_ascii argument, or CJK characters will be printed as unicode escapes.
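To illustrate, using the non_ascii value from the example data above:

print(json.dumps({"non_ascii": "汉语"}, cls=CompactJSONEncoder))
# { "non_ascii": "\u6c49\u8bed" }
print(json.dumps({"non_ascii": "汉语"}, cls=CompactJSONEncoder, ensure_ascii=False))
# { "non_ascii": "汉语" }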
@PenutChen I'm not sure why I didn't do that in the first place. 😅 Thanks for the hint!
@jannismain, what is the license of this code? (just making sure I can reuse it)
Note: sort_keys=True when calling dumps(..., sort_keys=True, indent=2, cls=CompactJSONEncoder) doesn't seem to work. Indent does work, though.
> @jannismain, what is the license of this code? (just making sure I can reuse it)
Thanks for asking! Feel free to reuse it in any way you want. Consider its license as MIT.
> Note: sort_keys=True when calling dumps(..., sort_keys=True, indent=2, cls=CompactJSONEncoder) doesn't seem to work. Indent does work, though.
I'll have a look at sorting when I get a chance. If somebody knows how to preserve the sorting feature with this way of custom encoding in the meantime, let me know 🙂
@jannismain maybe it can be done this way in _encode_object?

items = [(k, el) for k, el in o.items()]
if self.sort_keys:
    items = sorted(items)
# (... for k, el in items)
For me, inserting the following branch in L31 does the trick:
if self.sort_keys:
    o = dict(sorted(o.items()))
@jannismain Excuse me, can you tell me how to use this? I have a small script that writes data from a binary file to json. How should I use it with your compact version?
My script:
import json

def readInt(file, size):
    return int.from_bytes(file.read(size), "little")

with open("club.dat", 'rb') as datFile:
    datFile.read(8)
    size = readInt(datFile, 4)
    clubs = []
    for i in range(size):
        print(f"#{i + 1}/{size} - {hex(datFile.tell())}")
        club = {}
        club['uid'] = readInt(datFile, 4)
        club['player'] = [readInt(datFile, 4) for i in range(readInt(datFile, 2))]
        club['mainClubId'] = readInt(datFile, 4)
        club['isNational'] = readInt(datFile, 2)
        clubs.append(club)

with open("clubs.json", 'wt') as jsonFile:
    json.dump(clubs, jsonFile, ensure_ascii=False, indent=2)
@Royal724 Line 118 is what you are looking for:
json.dump(data, stream, cls=CompactJSONEncoder)
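Applied to your script, the last block would look roughly like this (assuming the gist is saved as CompactJSONEncoder.py next to your script):

from CompactJSONEncoder import CompactJSONEncoder

with open("clubs.json", 'wt') as jsonFile:
    # indent is optional here; the encoder falls back to 4 spaces when it is omitted
    json.dump(clubs, jsonFile, ensure_ascii=False, indent=2, cls=CompactJSONEncoder)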
@jannismain Thank you for your response. I downloaded your module and copied it to the same folder where my script is, then made the changes I wrote below, but it didn't work. My json file stays as it is, no compact format. I'm sorry, I'm making a mistake somewhere, but I can't figure out where, as I don't have much experience.
It also created a new __pycache__ folder next to my script, containing the file CompactJSONEncoder.cpython-311.pyc.
from CompactJSONEncoder import CompactJSONEncoder
json.dump(clubs, jsonFile, cls=CompactJSONEncoder)
@Royal724 maybe your json data is too wide or has too many items per container; try setting MAX_ITEMS or MAX_WIDTH to a larger number.
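One way to do that without editing the gist itself is to subclass the encoder (a rough sketch; the limits shown here are arbitrary):

class RoomierCompactJSONEncoder(CompactJSONEncoder):
    MAX_WIDTH = 120  # allow wider single-line containers (gist default: 70)
    MAX_ITEMS = 20   # allow more items per single-line container (gist default: 10)

json.dump(clubs, jsonFile, cls=RoomierCompactJSONEncoder)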
@PenutChen Yes, that was it, thank you.
@jannismain Excellent work! Thank you so much!
There's a small bug: if the keys of a dict are integers or floats, they don't get converted to strings. Perhaps adding an inner str() might be a solution…
> There's a small bug: if the keys of a dict are integers or floats, they don't get converted to strings. Perhaps adding an inner str() might be a solution…
You are right, I didn’t even think to treat the keys in any way; I will push a revision soon to address this! 👍
@olin256 I'm now converting keys to string to ensure the output produced is valid JSON.
@oesteban @PenutChen sort_keys=True is now supported 👍
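A quick check against the current revision (data made up for illustration):

print(json.dumps({"b": 1, "a": [1, 2]}, cls=CompactJSONEncoder, sort_keys=True))
# {
#     "a": [1, 2],
#     "b": 1
# }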
Little suggestion to also correctly process Numpy types:
# requires "import numpy as np" at the top of the module
def encode(self, o):
    """Encode JSON object *o* with respect to single line lists."""
    if isinstance(o, (list, tuple)):
        return self._encode_list(o)
    elif isinstance(o, dict):
        return self._encode_object(o)
    elif isinstance(o, float):  # use the short general ("g") format for floats
        return format(o, "g")
    elif isinstance(o, np.integer):  # process numpy integers
        return self.encode(int(o))
    elif isinstance(o, np.floating):  # process numpy floats
        return self.encode(float(o))
    elif isinstance(o, np.ndarray):  # flatten numpy arrays as lists
        return self._encode_list(o.tolist())
    return json.dumps(
        o,
        skipkeys=self.skipkeys,
        ensure_ascii=self.ensure_ascii,
        check_circular=self.check_circular,
        allow_nan=self.allow_nan,
        sort_keys=self.sort_keys,
        indent=self.indent,
        separators=(self.item_separator, self.key_separator),
        default=self.default if hasattr(self, "default") else None,
    )
Also, personally, I would remove the format(o, "g") part, as it sometimes results in converting floats into exponential notation.
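If the exponential notation bothers you, one option (an untested sketch, not part of the gist) is to drop the float special case entirely, so floats fall through to json.dumps, which uses Python's repr-based formatting and keeps full precision:

def encode(self, o):
    """Encode JSON object *o* with respect to single line lists."""
    if isinstance(o, (list, tuple)):
        return self._encode_list(o)
    if isinstance(o, dict):
        return self._encode_object(o)
    # no float branch here: floats fall through to json.dumps below, which keeps
    # full precision instead of the 6 significant digits produced by format(o, "g")
    return json.dumps(
        o,
        skipkeys=self.skipkeys,
        ensure_ascii=self.ensure_ascii,
        check_circular=self.check_circular,
        allow_nan=self.allow_nan,
        sort_keys=self.sort_keys,
        indent=self.indent,
        separators=(self.item_separator, self.key_separator),
        default=self.default if hasattr(self, "default") else None,
    )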
@svyatoslav-ps What do you mean by
How are you using the encoder?