Skip to content

Instantly share code, notes, and snippets.

@jannismain
Last active October 25, 2024 15:50
Show Gist options
  • Save jannismain/e96666ca4f059c3e5bc28abb711b5c92 to your computer and use it in GitHub Desktop.
Save jannismain/e96666ca4f059c3e5bc28abb711b5c92 to your computer and use it in GitHub Desktop.
A JSON Encoder in Python, that puts small lists on single lines.
#!/usr/bin/env python3
from __future__ import annotations
import json
class CompactJSONEncoder(json.JSONEncoder):
"""A JSON Encoder that puts small containers on single lines."""
CONTAINER_TYPES = (list, tuple, dict)
"""Container datatypes include primitives or other containers."""
MAX_WIDTH = 70
"""Maximum width of a container that might be put on a single line."""
MAX_ITEMS = 10
"""Maximum number of items in container that might be put on single line."""
def __init__(self, *args, **kwargs):
# using this class without indentation is pointless
if kwargs.get("indent") is None:
kwargs["indent"] = 4
super().__init__(*args, **kwargs)
self.indentation_level = 0
def encode(self, o):
"""Encode JSON object *o* with respect to single line lists."""
if isinstance(o, (list, tuple)):
return self._encode_list(o)
if isinstance(o, dict):
return self._encode_object(o)
if isinstance(o, float): # Use scientific notation for floats
return format(o, "g")
return json.dumps(
o,
skipkeys=self.skipkeys,
ensure_ascii=self.ensure_ascii,
check_circular=self.check_circular,
allow_nan=self.allow_nan,
sort_keys=self.sort_keys,
indent=self.indent,
separators=(self.item_separator, self.key_separator),
default=self.default if hasattr(self, "default") else None,
)
def _encode_list(self, o):
if self._put_on_single_line(o):
return "[" + ", ".join(self.encode(el) for el in o) + "]"
self.indentation_level += 1
output = [self.indent_str + self.encode(el) for el in o]
self.indentation_level -= 1
return "[\n" + ",\n".join(output) + "\n" + self.indent_str + "]"
def _encode_object(self, o):
if not o:
return "{}"
# ensure keys are converted to strings
o = {str(k) if k is not None else "null": v for k, v in o.items()}
if self.sort_keys:
o = dict(sorted(o.items(), key=lambda x: x[0]))
if self._put_on_single_line(o):
return (
"{ "
+ ", ".join(
f"{json.dumps(k)}: {self.encode(el)}" for k, el in o.items()
)
+ " }"
)
self.indentation_level += 1
output = [
f"{self.indent_str}{json.dumps(k)}: {self.encode(v)}" for k, v in o.items()
]
self.indentation_level -= 1
return "{\n" + ",\n".join(output) + "\n" + self.indent_str + "}"
def iterencode(self, o, **kwargs):
"""Required to also work with `json.dump`."""
return self.encode(o)
def _put_on_single_line(self, o):
return (
self._primitives_only(o)
and len(o) <= self.MAX_ITEMS
and len(str(o)) - 2 <= self.MAX_WIDTH
)
def _primitives_only(self, o: list | tuple | dict):
if isinstance(o, (list, tuple)):
return not any(isinstance(el, self.CONTAINER_TYPES) for el in o)
elif isinstance(o, dict):
return not any(isinstance(el, self.CONTAINER_TYPES) for el in o.values())
@property
def indent_str(self) -> str:
if isinstance(self.indent, int):
return " " * (self.indentation_level * self.indent)
elif isinstance(self.indent, str):
return self.indentation_level * self.indent
else:
raise ValueError(
f"indent must either be of type int or str (is: {type(self.indent)})"
)
if __name__ == "__main__":
    # Command-line entry point: with ``--example`` print a built-in sample
    # document, otherwise reformat the JSON document read from stdin.
    import sys

    if "--example" in sys.argv:
        data = {
            "compact_object": {"first": "element", "second": 2},
            "compact_list": ["first", "second"],
            "long_list": [
                "this",
                "is",
                "a",
                "rather",
                "long\nlist",
                "and should be broken up because of its width",
            ],
            "non_ascii": "汉语",
            1: 2,
        }
        json.dump(data, sys.stdout, cls=CompactJSONEncoder, ensure_ascii=False)
        # sys.exit() rather than the bare exit() builtin, which is only
        # injected by the ``site`` module and is not always available.
        sys.exit()

    json.dump(json.load(sys.stdin), sys.stdout, cls=CompactJSONEncoder)
@svyatoslav-ps
Copy link

change
def encode(self, o):
to
def iterencode(self, o):
to make it works

@jannismain
Copy link
Author

jannismain commented Feb 19, 2021

@svyatoslav-ps What do you mean by

to make it works

How are you using the encoder?

@pvandegeer
Copy link

I love it, thanks!

One thing though: It fails if you don't specify indent, i.e.:
encoder = CompactJSONEncoder(indent=2)

@jannismain
Copy link
Author

Thanks for pointing that out :-) I have fixed that in the most recent version.

@chrisinmtown
Copy link

chrisinmtown commented Aug 26, 2021

Thank you for posting this! I would like the same thing, slightly fewer newlines in a very large JSON output. I'm actually using indent=0 to keep the whitespace at a minimum. I just don't care for a multi-megabyte JSON file with a single newline waaaaaaaaay at the end, it's too hostile to grep & other command-line friends.

I just learned today that json.dump() and json.dumps() behave slightly differently. The dump method calls encoder method iterencode(); the dumps method calls encoder method encode(). Maybe @svyatoslav-ps was passing this custom class to the json.dump() method?

Also see: https://stackoverflow.com/questions/52939176/json-encoder-different-results-for-json-dump-and-json-dumps

Anyhow, I tried using your CompactJSONEncoder class in Python 3.8 and passing it to the json.dump method. I changed encode() to iterencode(), that method is called with a keyword argument _one_shot=True. I changed the method signature to def iterencode(self, o, **kwargs): which accepts but ignores that _one_shot argument. With that change the code on this page yields a compact format! Unfortunately for me it does not honor the sort_keys parameter.

Preserving most JSON encoder behaviors while making the output compact seems to be quite a challenge!

@jannismain
Copy link
Author

I have recently stumbled upon the same difference in behavior between json.dump and json.dumps.

My solution was to refactor my json logic into json.dumps calls, but of course this is not ideal for a more general solution.

I have updated my version to include your iterencode suggestion. I'm not sure in which cases this might break, but it at least works for the simplest use cases (dumping json to file all at once).

I have also checked the sort_keys option being ignored. There is probably no way around this apart from re-implement sorting in the custom encode function, as we never call the parent's encode method for collection types. For my own use-case, this is not required right now, but if you find yourself needing the sort_keys option I would be interested in your solution :-)

@mikethe1wheelnut
Copy link

Greetings! I -think- this might be exactly what I'm looking for. From the looks of it however, I'm going to spend a day or two, or more, understanding it, learning a bunch more python in the process. A few examples of it's use would go a long way. ..I guess I'll post some when I figure it out :-P ;-)

@keithzg
Copy link

keithzg commented Aug 16, 2022

Allllllmost exactly what I'm looking for! However as a diehard tabs partisan, that I get TypeError: can't multiply sequence by non-int of type 'str' if I try and use indent='\t' is unfortunate, and I might have to dig in and kludge together a way to make that work . . . still, for my uses, falling back to dirty dirty spaces is still an improvement over the stark choice of either newlines everywhere or a single compact blob!

@jannismain
Copy link
Author

@keithzg I'm wondering whether indent=r'\t' would work 🤔

@phschimm
Copy link

phschimm commented Aug 29, 2022

Greetings! I -think- this might be exactly what I'm looking for. From the looks of it however, I'm going to spend a day or two, or more, understanding it, learning a bunch more python in the process. A few examples of it's use would go a long way. ..I guess I'll post some when I figure it out :-P ;-)

@mikethe1wheelnut did you ever find out how to use this? All I'm essentially trying to do is iterate over a list of files, open them with json.load(), and immediately write the data again via json.dump() using this encoder.

@jannismain maybe this would be a good how-to code snippet for all Python noobs out there (me included ;)

@jannismain
Copy link
Author

jannismain commented Aug 29, 2022

@mikethe1wheelnut @phschimm I have added an example of how to dump dictionaries to json using this encoder. Running the file with the --example option generates the example output:

$ python3 CompactJSONEncoder.py --example
{
    "compact_object": { "first": "element", "second": 2 },
    "compact_list": ["first", "second"],
    "long_list": [
        "this",
        "is",
        "a",
        "rather",
        "long",
        "list"
    ]
}

@jannismain
Copy link
Author

jannismain commented Aug 29, 2022

@keithzg Ahh, I see what the problem is: json.dump works with indent parameters given as int and str, while my encoder assumes an int to multiply my indentation level with.

I have reworked it so indent can be provided both as int and str. Providing indent="\t" works as expected now.

@PenutChen
Copy link

PenutChen commented Sep 8, 2022

elif isinstance(o, str):  # escape newlines
    o = o.replace("\n", "\\n")
    return f'"{o}"'

Not only newlines need to be escaped; I think return json.dumps(o) is better.
Beware of the ensure_ascii argument, or CJK characters will be printed as Unicode escapes.

@jannismain
Copy link
Author

@PenutChen I'm not sure why I didn't do that in the first place. 😅 Thanks for the hint!

@oesteban
Copy link

oesteban commented Dec 11, 2023

@jannismain, what is the license of this code? (just making sure I can reuse it)

@oesteban
Copy link

Note: sort_keys=True when calling dumps(..., sort_keys=True, indent=2, cls=CompactJSONEncoder) doesn't seem to work. Indent does work though.

@jannismain
Copy link
Author

@jannismain, what is the license of this code? (just making sure I can reuse it)

Thanks for asking! Feel free to reuse it in any way you want. Consider its license as MIT.

@jannismain
Copy link
Author

Note: sort_keys=True when calling dumps(..., sort_keys=True, indent=2, cls=CompactJSONEncoder) doesn't seem to work. Indent does work though.

I‘ll have a look at sorting when I get a chance. If somebody knows how to preserve the sorting feature with this way of custom encoding in the meantime, let me know 🙂

@PenutChen
Copy link

@jannismain maybe done by this way in _encode_object?

items = [(k, el) for k, el in o.items()]
if self.sort_keys:
    items = sorted(items)

# (... for k, el in items)

@oesteban
Copy link

For me, inserting the following branch in L31 does the trick:

            if self.sort_keys:
                o = dict(sorted(o.items()))

@Royal724
Copy link

Royal724 commented Jan 10, 2024

@jannismain
Excuse me, can you tell me how to use this? I have a small script that writes data from a binary file to json. How should I use it with your compact version?

My script:

def readInt(file, size):
    """Read *size* bytes from *file* and decode them as a little-endian integer."""
    raw = file.read(size)
    return int.from_bytes(raw, byteorder="little")

# Read every club record from the binary file into a list of dicts.
with open("club.dat", 'rb') as datFile:
    datFile.read(8)  # discard the first 8 bytes (presumably a file header; confirm)
    size = readInt(datFile, 4)
    clubs = []
    for i in range(size):
        print(f"#{i + 1}/{size} - {hex(datFile.tell())}")
        # Dict values are evaluated top to bottom, so the fields are read
        # from the file in exactly this order.
        club = {
            'uid': readInt(datFile, 4),
            # a 2-byte count followed by that many 4-byte values
            'player': [readInt(datFile, 4) for _ in range(readInt(datFile, 2))],
            'mainClubId': readInt(datFile, 4),
            'isNational': readInt(datFile, 2),
        }
        clubs.append(club)

# Write the collected records out as indented JSON.
with open("clubs.json", 'wt') as jsonFile:
    json.dump(clubs, jsonFile, ensure_ascii=False, indent=2)

@jannismain
Copy link
Author

jannismain commented Jan 10, 2024

@Royal724 Line 118 is what you are looking for:
json.dump(data, stream, cls=CompactJSONEncoder)

@Royal724
Copy link

Royal724 commented Jan 11, 2024

@jannismain
Thank you for your response. I downloaded your module and crossed it to the same folder where my script is, then made the changes I wrote below, but it didn't work. My json file stays as it is, no compact format. I'm sorry, I'm making a mistake somewhere, but I can't figure out where, as I don't have much experience.
And created a new folder pycache in my folder and in it is the file CompactJSONEncoder.cpython-311.pyc

from CompactJSONEncoder import CompactJSONEncoder

json.dump(clubs, jsonFile, cls=CompactJSONEncoder)

@PenutChen
Copy link

@Royal724 maybe your JSON data is too wide or has too many items; try setting MAX_ITEMS or MAX_WIDTH to a larger number.

@Royal724
Copy link

@PenutChen
Yes, that was the point, thank you.

@jannismain
Excellent work! Thank you so much!

@olin256
Copy link

olin256 commented Jan 25, 2024

There's a small bug: If the keys of a dict are integers of floats, they don't get converted to strings. Perhaps adding inner str() might be a solution…

@jannismain
Copy link
Author

There's a small bug: If the keys of a dict are integers of floats, they don't get converted to strings. Perhaps adding inner str() might be a solution…

You are right, I didn’t even think to treat the keys in any way.. will push a revision soon to address this! 👍

@jannismain
Copy link
Author

jannismain commented Jan 30, 2024

@olin256 I'm now converting keys to string to ensure the output produced is valid JSON.

@jannismain
Copy link
Author

@oesteban @PenutChen sort_keys=True is now supported 👍

@Xonxt
Copy link

Xonxt commented Sep 25, 2024

Little suggestion to also correctly process Numpy types:

    def encode(self, o):
        """Encode JSON object *o* with respect to single line lists."""
        if isinstance(o, (list, tuple)):
            return self._encode_list(o)
        elif isinstance(o, dict):
            return self._encode_object(o)
        if isinstance(o, float):  # Use scientific notation for floats
            return format(o, "g") 
        elif isinstance(o, np.integer):  # process numpy integers
            return self.encode(int(o))
        elif isinstance(o, np.floating): # process numpy floats
            return self.encode(float(o))
        elif isinstance(o, np.ndarray): # flatten numpy arrays as lists
            return self._encode_list(o.tolist())

        return json.dumps(
            o,
            skipkeys=self.skipkeys,
            ensure_ascii=self.ensure_ascii,
            check_circular=self.check_circular,
            allow_nan=self.allow_nan,
            sort_keys=self.sort_keys,
            indent=self.indent,
            separators=(self.item_separator, self.key_separator),
            default=self.default if hasattr(self, "default") else None,
        )

Also, personally, I would remove the format(o, "g") part, as it sometimes results in converting floats into exponential notation.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment