Skip to content

Instantly share code, notes, and snippets.

@Red5d
Created May 15, 2026 15:11
Show Gist options
  • Select an option

  • Save Red5d/79c35dbe989ab70b7e3c759796296f94 to your computer and use it in GitHub Desktop.

Select an option

Save Red5d/79c35dbe989ab70b7e3c759796296f94 to your computer and use it in GitHub Desktop.
Converts from JSON format to TOON format. Implements the spec from: https://github.com/toon-format/spec/blob/main/SPEC.md
"""
json2toon — Convert JSON data to TOON (Token-Oriented Object Notation) format.
TOON is a compact, human-readable encoding of the JSON data model that
minimizes tokens for LLM prompts. It combines YAML-like indentation for
nested objects with CSV-like tabular layout for uniform arrays.
Spec: https://github.com/toon-format/spec/blob/main/SPEC.md (v3.0)
"""
from __future__ import annotations
import json
import math
import re
import sys
from typing import Any
# ── Constants ────────────────────────────────────────────────────────────────
# Keys that may be written without quotes: a letter or underscore followed by
# any number of letters, digits, underscores or dots.
# NOTE: with ``match()``, ``$`` also matches just before a trailing newline.
_UNQUOTED_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_.]*$")
# Strings that read as a number: optional minus sign, integer part without
# leading zeros, optional fraction, optional exponent.
_NUMERIC_RE = re.compile(r"^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$")
# The escape sequences for backslash, double quote, newline, carriage return
# and tab. Not referenced by any code visible in this file.
_VALID_ESCAPES = {"\\\\", '\\"', "\\n", "\\r", "\\t"}
# ── Helpers ──────────────────────────────────────────────────────────────────
def _indent_str(depth: int, indent: int) -> str:
    """Build the run of spaces that prefixes a line nested ``depth`` levels deep.

    Args:
        depth: Nesting level of the line (0 for the root level).
        indent: Number of spaces that make up one nesting level.

    Returns:
        A string of ``depth * indent`` spaces (empty when the product is 0).
    """
    width = depth * indent
    return " " * width
def _delim_sym(delimiter: str) -> str:
    """Return the marker that announces the delimiter inside a ``[N]`` header.

    Tab and pipe are written into the bracket header verbatim. Every other
    value (the comma default in particular) is represented by an empty
    string, i.e. nothing is added to the header.

    Args:
        delimiter: The active array delimiter.

    Returns:
        ``"\\t"`` or ``"|"`` for those delimiters, otherwise ``""``.
    """
    return delimiter if delimiter in ("\t", "|") else ""
def _canon_number(v: int | float) -> str:
"""Format a number in TOON canonical form (no exponent, no trailing zeros)."""
if isinstance(v, bool): # bool is subclass of int in Python
return "true" if v else "false"
if isinstance(v, int):
if v == 0 and math.copysign(1, v) < 0:
return "0" # -0 → 0
return str(v)
# float
if math.isnan(v) or math.isinf(v):
return "null"
if v == 0.0 and math.copysign(1, v) < 0:
return "0" # -0.0 → 0
# Use repr for full precision, then normalize
s = repr(v)
# If repr gave us scientific notation, convert to decimal
if "e" in s.lower():
# Format with enough precision to round-trip
s = f"{v:.17g}"
if "e" in s.lower():
# Still scientific — force decimal
d = abs(v)
if d >= 1:
s = f"{v:.0f}"
else:
# Count leading zeros after decimal
frac = f"{v:.17f}"
# Strip trailing zeros
frac = frac.rstrip("0").rstrip(".")
s = frac
# Ensure no exponent in final output
if "e" in s.lower():
s = f"{float(s):.0f}" if abs(float(s)) >= 1 else f"{float(s):.17f}".rstrip("0").rstrip(".")
# Strip trailing zeros after decimal point
if "." in s:
s = s.rstrip("0").rstrip(".")
# Handle -0
if s == "-0":
s = "0"
return s
def _is_safe_unquoted_key(s: str) -> bool:
    """Check whether a key can appear unquoted (per §7.3).

    Args:
        s: The raw object key.

    Returns:
        True when the whole key matches ``_UNQUOTED_KEY_RE``.
    """
    # fullmatch() rather than match(): the pattern ends in ``$``, which also
    # matches just before a trailing newline, so ``"name\n"`` would otherwise
    # be emitted unquoted and split the output line.
    return _UNQUOTED_KEY_RE.fullmatch(s) is not None
def _needs_quoting_value(s: str, delimiter: str, *, is_object_value: bool = False) -> bool:
    """Determine whether a string value must be emitted in double quotes.

    A string is quoted when writing it bare would change how it is read
    back: it is empty, equals a reserved literal, looks like a number, has
    surrounding whitespace, starts with a hyphen, contains the active
    delimiter, or contains a structural, escape or control character.

    Args:
        s: The string value.
        delimiter: The active delimiter (comma, tab, or pipe).
        is_object_value: Accepted for call-site compatibility. It does not
            change the result, because the colon and delimiter checks are
            applied in every context.

    Returns:
        True if the value has to be quoted, False if it is safe as-is.
    """
    if s == "":
        return True
    # Reserved literals would be read back as bool / null.
    if s in ("true", "false", "null"):
        return True
    # Numeric-looking text, including leading-zero forms such as "05".
    if _NUMERIC_RE.match(s) or re.match(r"^-?0\d+", s):
        return True
    # Leading or trailing whitespace would be lost without quotes.
    if s != s.strip():
        return True
    # Any leading hyphen can be mistaken for a list-item marker, not just
    # "-" or "- ".
    if s.startswith("-"):
        return True
    if delimiter in s:
        return True
    # Control characters, brackets/braces, backslash, colon, double quote.
    return any(ch in s for ch in '\n\r\t[]{}\\:"')
# Maps each character that needs escaping to its two-character escape.
_QUOTE_ESCAPES = str.maketrans(
    {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
)


def _quote_string(s: str) -> str:
    """Wrap a string in double quotes, escaping what cannot appear raw.

    Backslash, double quote, newline, carriage return and tab are replaced
    by their backslash escapes; every other character is copied unchanged.

    Args:
        s: The text to quote.

    Returns:
        The escaped text surrounded by double quotes.
    """
    body = s.translate(_QUOTE_ESCAPES)
    return '"' + body + '"'
def _encode_value(v: Any, delimiter: str, *, is_object_value: bool = False) -> str:
    """Render one primitive value as a TOON token.

    Args:
        v: ``None``, a bool, an int, a float or a string. Any other object
            is converted with ``str()`` and emitted as a quoted string.
        delimiter: The active delimiter, used to decide whether a string
            needs quoting.
        is_object_value: Forwarded to ``_needs_quoting_value``.

    Returns:
        The token text for the value.
    """
    if isinstance(v, str):
        must_quote = _needs_quoting_value(
            v, delimiter, is_object_value=is_object_value
        )
        return _quote_string(v) if must_quote else v
    if v is None:
        return "null"
    # Booleans are tested before the numeric check because bool is an int.
    if v is True:
        return "true"
    if v is False:
        return "false"
    if isinstance(v, (int, float)):
        return _canon_number(v)
    # Unexpected type: fall back to its quoted string form.
    return _quote_string(str(v))
def _encode_key(key: str, delimiter: str) -> str:
    """Return an object key ready for output.

    Args:
        key: The raw key.
        delimiter: The active delimiter. It is accepted but not consulted
            here.

    Returns:
        The key itself when ``_is_safe_unquoted_key`` accepts it, otherwise
        the quoted and escaped key.
    """
    return key if _is_safe_unquoted_key(key) else _quote_string(key)
def _encode_field_name(name: str, delimiter: str) -> str:
    """Return a tabular-header field name ready for output.

    Field names follow exactly the same quoting rules as object keys, so
    this simply defers to ``_encode_key``.

    Args:
        name: The raw field name.
        delimiter: The active delimiter, passed through to ``_encode_key``.

    Returns:
        The encoded field name.
    """
    encoded = _encode_key(name, delimiter)
    return encoded
# ── Core encoder ─────────────────────────────────────────────────────────────
class TOONEncoder:
    """
    Convert JSON-compatible Python objects to TOON format strings.

    Objects become ``key: value`` lines with nested content indented one
    level. Arrays are written in one of three shapes: inline
    (``key[N]: a,b``) when every element is a primitive, tabular
    (``key[N]{f1,f2}:`` plus one row per element) when every element is an
    object with the same primitive-only fields, and otherwise as an
    expanded ``- item`` list.

    Args:
        indent: Number of spaces per indentation level (default 2).
        delimiter: Delimiter for array values — ',', '\\t', or '|' (default ',').

    Raises:
        ValueError: If ``delimiter`` is not one of the three supported
            delimiters or ``indent`` is smaller than 1.
    """

    # Delimiters that can be announced in an array header.
    _DELIMITERS = (",", "\t", "|")

    def __init__(self, indent: int = 2, delimiter: str = ","):
        # Any other delimiter would be joined into rows while the header
        # still announces a comma, which produces unreadable output.
        if delimiter not in self._DELIMITERS:
            raise ValueError(
                f"delimiter must be ',', '\\t' or '|', got {delimiter!r}"
            )
        # Without at least one space per level nesting cannot be told apart.
        if indent < 1:
            raise ValueError(f"indent must be at least 1, got {indent!r}")
        self.indent = indent
        self.delimiter = delimiter

    def encode(self, data: Any) -> str:
        """Encode a JSON-compatible Python value to TOON format.

        Args:
            data: A dict, list or primitive.

        Returns:
            The TOON document, lines joined with ``\\n`` and no trailing
            newline. An empty dict yields an empty string.
        """
        lines: list[str] = []
        self._encode_root(data, lines, delimiter=self.delimiter)
        return "\n".join(lines)

    def _encode_root(self, data: Any, lines: list[str], delimiter: str) -> None:
        """Encode at root level (depth 0), appending to ``lines``."""
        if isinstance(data, dict):
            self._encode_object(data, lines, depth=0, delimiter=delimiter)
        elif isinstance(data, list):
            self._encode_root_array(data, lines, delimiter=delimiter)
        else:
            # Single primitive at root
            lines.append(_encode_value(data, delimiter))

    def _encode_root_array(self, arr: list, lines: list[str], delimiter: str) -> None:
        """Encode a root-level array: a key-less header with children at depth 1."""
        self._emit_array("", arr, lines, 1, delimiter)

    def _encode_object(
        self, obj: dict, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Encode every field of ``obj`` at ``depth``; an empty dict emits nothing."""
        ind = _indent_str(depth, self.indent)
        for key, value in obj.items():
            head = f"{ind}{_encode_key(key, delimiter)}"
            self._emit_field(head, value, lines, depth + 1, delimiter)

    def _encode_array_field(
        self,
        key_str: str,
        arr: list,
        lines: list[str],
        depth: int,
        delimiter: str,
    ) -> None:
        """Encode an array as a field inside an object.

        Args:
            key_str: The already-encoded key.
            arr: The array value.
            lines: Output accumulator.
            depth: Depth of the field line; children go one level deeper.
            delimiter: The active delimiter.
        """
        head = f"{_indent_str(depth, self.indent)}{key_str}"
        self._emit_array(head, arr, lines, depth + 1, delimiter)

    def _encode_nested_array_item(
        self, arr: list, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Encode an array that is itself a list item: ``- [N]: v1,v2``.

        Non-primitive content is expanded as list items one level deeper.
        The tabular form is not used for key-less nested arrays.
        """
        head = f"{_indent_str(depth, self.indent)}- "
        self._emit_array(head, arr, lines, depth + 1, delimiter, tabular=False)

    def _encode_list_item(
        self, item: Any, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Encode a single item of an expanded (list-form) array."""
        if isinstance(item, dict):
            if item:
                self._encode_list_object_item(item, lines, depth, delimiter)
            else:
                # Empty object → bare hyphen
                lines.append(f"{_indent_str(depth, self.indent)}-")
        elif isinstance(item, list):
            self._encode_nested_array_item(item, lines, depth, delimiter)
        else:
            # Primitives, plus any unexpected type via _encode_value's
            # string fallback. Nothing is dropped, so the number of emitted
            # items always matches the announced array length.
            ev = _encode_value(item, delimiter)
            lines.append(f"{_indent_str(depth, self.indent)}- {ev}")

    def _encode_list_object_item(
        self, obj: dict, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """
        Encode an object as a list item.

        Per §10:
        - The first field shares the ``- `` line (``key: value`` or an
          array header); whatever it nests goes to depth+2. That includes
          the rows of a tabular array.
        - Every following field is an ordinary object field at depth+1.
        """
        fields = iter(obj.items())
        try:
            first_key, first_val = next(fields)
        except StopIteration:
            # Callers handle empty objects themselves; stay well-defined.
            lines.append(f"{_indent_str(depth, self.indent)}-")
            return
        ind = _indent_str(depth, self.indent)
        self._emit_first_list_field(first_key, first_val, ind, lines, depth, delimiter)
        for key, value in fields:
            self._emit_list_sibling(key, value, lines, depth, delimiter)

    def _emit_first_list_field(
        self,
        key: str,
        value: Any,
        ind: str,
        lines: list[str],
        depth: int,
        delimiter: str,
    ) -> None:
        """Emit the first field of a list-item object on the ``- `` line.

        Args:
            key: The raw field key.
            value: The field value.
            ind: Leading spaces of the hyphen line.
            lines: Output accumulator.
            depth: Depth of the hyphen line; nested content goes to depth+2.
            delimiter: The active delimiter.
        """
        head = f"{ind}- {_encode_key(key, delimiter)}"
        self._emit_field(head, value, lines, depth + 2, delimiter)

    def _emit_list_array_field(
        self,
        key: str,
        arr: list,
        ind: str,
        lines: list[str],
        depth: int,
        delimiter: str,
    ) -> None:
        """Emit an array field on the ``- `` line of a list item (children at depth+2)."""
        head = f"{ind}- {_encode_key(key, delimiter)}"
        self._emit_array(head, arr, lines, depth + 2, delimiter)

    def _emit_list_sibling(
        self, key: str, value: Any, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Emit a sibling field (not first) in a list-item object at depth+1."""
        head = f"{_indent_str(depth + 1, self.indent)}{_encode_key(key, delimiter)}"
        self._emit_field(head, value, lines, depth + 2, delimiter)

    def _emit_field(
        self,
        head: str,
        value: Any,
        lines: list[str],
        child_depth: int,
        delimiter: str,
    ) -> None:
        """Emit one object field whose line starts with ``head``.

        Args:
            head: Everything that precedes the colon or bracket:
                indentation, an optional ``- `` marker and the encoded key.
            value: The field value.
            lines: Output accumulator.
            child_depth: Depth at which nested content is written.
            delimiter: The active delimiter.
        """
        if isinstance(value, dict):
            lines.append(f"{head}:")
            # An empty dict contributes no further lines.
            self._encode_object(value, lines, child_depth, delimiter)
        elif isinstance(value, list):
            self._emit_array(head, value, lines, child_depth, delimiter)
        else:
            ev = _encode_value(value, delimiter, is_object_value=True)
            lines.append(f"{head}: {ev}")

    def _emit_array(
        self,
        head: str,
        arr: list,
        lines: list[str],
        child_depth: int,
        delimiter: str,
        *,
        tabular: bool = True,
    ) -> None:
        """Emit an array header after ``head`` followed by its content.

        Args:
            head: Text preceding the ``[N]`` bracket (may be empty).
            arr: The array to encode.
            lines: Output accumulator.
            child_depth: Depth for tabular rows and expanded list items.
            delimiter: The active delimiter.
            tabular: Whether the tabular form may be used for uniform
                arrays of objects.
        """
        size = f"[{len(arr)}{_delim_sym(delimiter)}]"
        if not arr:
            lines.append(f"{head}{size}:")
            return
        # All primitives → inline on the header line.
        if all(_is_primitive(x) for x in arr):
            vals = delimiter.join(_encode_value(x, delimiter) for x in arr)
            lines.append(f"{head}{size}: {vals}")
            return
        # Uniform objects → header with field list, one row per object.
        if tabular and all(isinstance(x, dict) for x in arr):
            tab = self._try_tabular(arr, delimiter)
            if tab:
                fields_str, rows = tab
                lines.append(f"{head}{size}{{{fields_str}}}:")
                # Rows follow the configured indent width.
                row_ind = _indent_str(child_depth, self.indent)
                lines.extend(f"{row_ind}{row}" for row in rows)
                return
        # Everything else → expanded list.
        lines.append(f"{head}{size}:")
        for item in arr:
            self._encode_list_item(item, lines, child_depth, delimiter)

    def _try_tabular(
        self, arr: list[dict], delimiter: str
    ) -> tuple[str, list[str]] | None:
        """
        Try to encode an array of dicts as a tabular (CSV-like) block.

        Returns (fields_str, [row_str, ...]) if all objects are uniform
        with the same keys and all values are primitives. Field order is
        taken from the first object. Returns None if the array isn't
        eligible for tabular encoding (empty array, a non-dict element,
        objects without fields, differing key sets, or any nested value).
        """
        if not arr or not all(isinstance(obj, dict) for obj in arr):
            return None
        fields = list(arr[0])
        if not fields:
            return None  # empty objects → list format
        expected = set(fields)
        for obj in arr:
            if set(obj) != expected:
                return None
            if not all(_is_primitive(v) for v in obj.values()):
                return None
        fields_str = delimiter.join(
            _encode_field_name(name, delimiter) for name in fields
        )
        rows = [
            delimiter.join(_encode_value(obj[name], delimiter) for name in fields)
            for obj in arr
        ]
        return fields_str, rows
# ── Module-level helpers ─────────────────────────────────────────────────────
def _is_primitive(v: Any) -> bool:
    """Tell whether ``v`` is a scalar: ``None``, bool, int, float or str.

    Args:
        v: Any value.

    Returns:
        True for the scalar types listed above, False for everything else
        (dicts and lists in particular).
    """
    return isinstance(v, (str, int, float, bool, type(None)))
# ── Public API ───────────────────────────────────────────────────────────────
def toon(data: Any, *, indent: int = 2, delimiter: str = ",") -> str:
    """
    Convert a JSON-compatible Python object to TOON format.

    This is a convenience wrapper that builds a ``TOONEncoder`` with the
    given options and returns the result of its ``encode`` method.

    Args:
        data: A JSON-serializable Python value (dict, list, str, int, float, bool, None).
        indent: Spaces per indentation level (default 2).
        delimiter: Array delimiter — ',', '\\t', or '|' (default ',').

    Returns:
        A TOON-formatted string.

    Examples:
        >>> print(toon({"tags": ["a", "b", "c"]}))
        tags[3]: a,b,c
        >>> print(toon({"id": 1, "name": "Alice"}))
        id: 1
        name: Alice

        An array of uniform objects such as
        ``{"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]}``
        is written as the header line ``users[2]{id,name}:`` followed by
        one delimiter-separated row per object (``1,Alice``, ``2,Bob``).
    """
    return TOONEncoder(indent=indent, delimiter=delimiter).encode(data)
def json_to_toon(json_str: str, *, indent: int = 2, delimiter: str = ",") -> str:
    """
    Parse a JSON document and return its TOON representation.

    The text is decoded with ``json.loads`` and the resulting value is
    handed to ``toon`` together with the formatting options.

    Args:
        json_str: A valid JSON string.
        indent: Spaces per indentation level (default 2).
        delimiter: Array delimiter — ',', '\\t', or '|' (default ',').

    Returns:
        A TOON-formatted string.

    Raises:
        json.JSONDecodeError: If ``json_str`` is not valid JSON.
    """
    return toon(json.loads(json_str), indent=indent, delimiter=delimiter)
# ── CLI ──────────────────────────────────────────────────────────────────────
def _main() -> None:
    """CLI entry point: reads JSON from stdin or a file argument, prints TOON.

    Command line:
        file            Optional path of a JSON file; stdin is read if omitted.
        -i/--indent     Spaces per indent level (default 2).
        -d/--delimiter  One of ``,``, ``tab`` or ``pipe`` (default ``,``).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert JSON to TOON (Token-Oriented Object Notation)."
    )
    parser.add_argument("file", nargs="?", help="JSON file (reads stdin if omitted)")
    parser.add_argument(
        "-i", "--indent", type=int, default=2, help="Spaces per indent level (default: 2)"
    )
    parser.add_argument(
        "-d",
        "--delimiter",
        choices=[",", "tab", "pipe"],
        default=",",
        help="Array delimiter (default: comma)",
    )
    args = parser.parse_args()
    # Translate the shell-friendly names into the actual delimiter characters.
    delim = {",": ",", "tab": "\t", "pipe": "|"}[args.delimiter]
    if args.file:
        # JSON files are UTF-8. Do not depend on the platform's locale
        # encoding, which would mis-decode non-ASCII text on some systems.
        with open(args.file, encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = json.load(sys.stdin)
    print(toon(data, indent=args.indent, delimiter=delim))
# Run the command-line interface only when this file is executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment