Created
May 15, 2026 15:11
-
-
Save Red5d/79c35dbe989ab70b7e3c759796296f94 to your computer and use it in GitHub Desktop.
Converts from JSON format to TOON format. Implements the spec from: https://github.com/toon-format/spec/blob/main/SPEC.md
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| json2toon — Convert JSON data to TOON (Token-Oriented Object Notation) format. | |
| TOON is a compact, human-readable encoding of the JSON data model that | |
| minimizes tokens for LLM prompts. It combines YAML-like indentation for | |
| nested objects with CSV-like tabular layout for uniform arrays. | |
| Spec: https://github.com/toon-format/spec/blob/main/SPEC.md (v3.0) | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import math | |
| import re | |
| import sys | |
| from typing import Any | |
| # ── Constants ──────────────────────────────────────────────────────────────── | |
| _UNQUOTED_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_.]*$") | |
| _NUMERIC_RE = re.compile(r"^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$") | |
| _VALID_ESCAPES = {"\\\\", '\\"', "\\n", "\\r", "\\t"} | |
| # ── Helpers ────────────────────────────────────────────────────────────────── | |
| def _indent_str(depth: int, indent: int) -> str: | |
| """Return leading spaces for a given depth.""" | |
| return " " * (depth * indent) | |
| def _delim_sym(delimiter: str) -> str: | |
| """Return the delimiter symbol for use inside bracket headers.""" | |
| if delimiter == "\t": | |
| return "\t" | |
| if delimiter == "|": | |
| return "|" | |
| return "" # comma is default — omitted | |
| def _canon_number(v: int | float) -> str: | |
| """Format a number in TOON canonical form (no exponent, no trailing zeros).""" | |
| if isinstance(v, bool): # bool is subclass of int in Python | |
| return "true" if v else "false" | |
| if isinstance(v, int): | |
| if v == 0 and math.copysign(1, v) < 0: | |
| return "0" # -0 → 0 | |
| return str(v) | |
| # float | |
| if math.isnan(v) or math.isinf(v): | |
| return "null" | |
| if v == 0.0 and math.copysign(1, v) < 0: | |
| return "0" # -0.0 → 0 | |
| # Use repr for full precision, then normalize | |
| s = repr(v) | |
| # If repr gave us scientific notation, convert to decimal | |
| if "e" in s.lower(): | |
| # Format with enough precision to round-trip | |
| s = f"{v:.17g}" | |
| if "e" in s.lower(): | |
| # Still scientific — force decimal | |
| d = abs(v) | |
| if d >= 1: | |
| s = f"{v:.0f}" | |
| else: | |
| # Count leading zeros after decimal | |
| frac = f"{v:.17f}" | |
| # Strip trailing zeros | |
| frac = frac.rstrip("0").rstrip(".") | |
| s = frac | |
| # Ensure no exponent in final output | |
| if "e" in s.lower(): | |
| s = f"{float(s):.0f}" if abs(float(s)) >= 1 else f"{float(s):.17f}".rstrip("0").rstrip(".") | |
| # Strip trailing zeros after decimal point | |
| if "." in s: | |
| s = s.rstrip("0").rstrip(".") | |
| # Handle -0 | |
| if s == "-0": | |
| s = "0" | |
| return s | |
def _is_safe_unquoted_key(s: str) -> bool:
    """Report whether ``s`` matches the unquoted-key pattern (§7.3)."""
    return _UNQUOTED_KEY_RE.match(s) is not None
| def _needs_quoting_value(s: str, delimiter: str, *, is_object_value: bool = False) -> bool: | |
| """ | |
| Determine if a string value needs quoting. | |
| Args: | |
| s: The string value. | |
| delimiter: The active delimiter (comma, tab, or pipe). | |
| is_object_value: True if this is a value in a key: value pair | |
| (adds colon and comma quoting). | |
| """ | |
| if s == "": | |
| return True | |
| # Reserved literals that would be misinterpreted | |
| if s in ("true", "false", "null"): | |
| return True | |
| # Numeric-looking strings | |
| if _NUMERIC_RE.match(s): | |
| return True | |
| # Leading zeros as string (e.g., "05", "0001") | |
| if re.match(r"^-?0\d+", s): | |
| return True | |
| # Leading or trailing whitespace requires quoting | |
| if s != s.strip(): | |
| return True | |
| # Control characters (newline, tab, carriage return) | |
| if any(c in s for c in "\n\r\t"): | |
| return True | |
| # Structural characters | |
| if any(c in s for c in "[]{}"): | |
| return True | |
| # Backslash (only valid inside quoted strings as escape introducer) | |
| if "\\" in s: | |
| return True | |
| # Starts with hyphen-space (looks like list item) | |
| if s.startswith("- "): | |
| return True | |
| # Single hyphen (ambiguous with list item marker in any context) | |
| if s == "-": | |
| return True | |
| # Active delimiter | |
| if delimiter in s: | |
| return True | |
| # Colon (can be confused with key: value syntax in some parsing contexts) | |
| if ":" in s: | |
| return True | |
| # Double quote (needs escaping) | |
| if '"' in s: | |
| return True | |
| # Comma — only quote if comma is NOT the active delimiter | |
| # (when comma IS the delimiter, commas in values are handled by quoting; | |
| # when a different delimiter is active, commas are just regular characters) | |
| # Actually: commas in values are only special when comma is the delimiter | |
| # No additional rule needed — delimiter check above handles it. | |
| # Object-value-specific rules (kept for clarity, most handled above) | |
| if is_object_value: | |
| pass # colon and comma already handled above | |
| return False | |
| def _quote_string(s: str) -> str: | |
| """Quote and escape a string value.""" | |
| escaped = ( | |
| s.replace("\\", "\\\\") | |
| .replace('"', '\\"') | |
| .replace("\n", "\\n") | |
| .replace("\r", "\\r") | |
| .replace("\t", "\\t") | |
| ) | |
| return f'"{escaped}"' | |
| def _encode_value(v: Any, delimiter: str, *, is_object_value: bool = False) -> str: | |
| """Encode a primitive value as a TOON token string.""" | |
| if v is None: | |
| return "null" | |
| if isinstance(v, bool): | |
| return "true" if v else "false" | |
| if isinstance(v, (int, float)): | |
| return _canon_number(v) | |
| if isinstance(v, str): | |
| if _needs_quoting_value(v, delimiter, is_object_value=is_object_value): | |
| return _quote_string(v) | |
| return v | |
| # Fallback for unexpected types | |
| return _quote_string(str(v)) | |
def _encode_key(key: str, delimiter: str) -> str:
    """Render an object key, bare when it is safe and quoted otherwise.

    ``delimiter`` is accepted but does not influence the result.
    """
    return key if _is_safe_unquoted_key(key) else _quote_string(key)
def _encode_field_name(name: str, delimiter: str) -> str:
    """Render a tabular-header field name; the rules are those of an object key."""
    return _encode_key(key=name, delimiter=delimiter)
| # ── Core encoder ───────────────────────────────────────────────────────────── | |
class TOONEncoder:
    """
    Convert JSON-compatible Python objects to TOON format strings.

    Every array, wherever it appears, is rendered by one helper
    (``_emit_array``).  Child lines (tabular rows, list items) are always
    indented from ``self.indent``, never from a literal run of spaces.

    Args:
        indent: Number of spaces per indentation level (default 2).
        delimiter: Delimiter for array values — ',', '\\t', or '|' (default ',').
    """

    def __init__(self, indent: int = 2, delimiter: str = ","):
        self.indent = indent
        self.delimiter = delimiter

    def encode(self, data: Any) -> str:
        """Encode a JSON-compatible Python value and return the TOON text."""
        lines: list[str] = []
        self._encode_root(data, lines, delimiter=self.delimiter)
        return "\n".join(lines)

    def _encode_root(self, data: Any, lines: list[str], delimiter: str) -> None:
        """Encode the document root (depth 0): object, array or lone primitive."""
        if isinstance(data, dict):
            self._encode_object(data, lines, depth=0, delimiter=delimiter)
        elif isinstance(data, list):
            self._encode_root_array(data, lines, delimiter=delimiter)
        else:
            lines.append(_encode_value(data, delimiter))

    def _emit_array(
        self,
        head: str,
        arr: list,
        lines: list[str],
        child_depth: int,
        delimiter: str,
        *,
        allow_tabular: bool = True,
    ) -> None:
        """Append the lines for one array.

        Args:
            head: Everything that precedes ``[N]`` on the header line —
                indentation plus an optional ``- `` marker and/or key.
            arr: The array to encode.
            lines: Output accumulator.
            child_depth: Depth at which tabular rows or list items go.
            delimiter: Active delimiter.
            allow_tabular: When False, uniform object arrays are still
                written in list form.
        """
        size = f"[{len(arr)}{_delim_sym(delimiter)}]"
        if not arr:
            lines.append(f"{head}{size}:")
            return
        if all(_is_primitive(x) for x in arr):
            vals = delimiter.join(_encode_value(x, delimiter) for x in arr)
            lines.append(f"{head}{size}: {vals}")
            return
        if allow_tabular and all(isinstance(x, dict) for x in arr):
            tab = self._try_tabular(arr, delimiter)
            if tab:
                fields_str, rows = tab
                row_ind = _indent_str(child_depth, self.indent)
                lines.append(f"{head}{size}{{{fields_str}}}:")
                lines.extend(f"{row_ind}{row}" for row in rows)
                return
        # Non-uniform objects, nested arrays or mixed content: expanded list.
        lines.append(f"{head}{size}:")
        for item in arr:
            self._encode_list_item(item, lines, child_depth, delimiter)

    def _encode_root_array(self, arr: list, lines: list[str], delimiter: str) -> None:
        """Encode a root-level array: keyless header, children at depth 1."""
        self._emit_array("", arr, lines, 1, delimiter)

    def _encode_field(
        self, key: str, value: Any, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Encode one ``key: value`` member of an object at ``depth``."""
        ek = _encode_key(key, delimiter)
        if isinstance(value, list):
            self._encode_array_field(ek, value, lines, depth, delimiter)
            return
        ind = _indent_str(depth, self.indent)
        if isinstance(value, dict):
            # An empty object is just "key:"; the recursion then adds nothing.
            lines.append(f"{ind}{ek}:")
            self._encode_object(value, lines, depth + 1, delimiter)
        else:
            ev = _encode_value(value, delimiter, is_object_value=True)
            lines.append(f"{ind}{ek}: {ev}")

    def _encode_object(
        self, obj: dict, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Encode every member of ``obj`` at ``depth``, in insertion order."""
        for key, value in obj.items():
            self._encode_field(key, value, lines, depth, delimiter)

    def _encode_array_field(
        self,
        key_str: str,
        arr: list,
        lines: list[str],
        depth: int,
        delimiter: str,
    ) -> None:
        """Encode an array that is a member of an object.

        ``key_str`` is the key as already encoded by the caller.
        """
        head = f"{_indent_str(depth, self.indent)}{key_str}"
        self._emit_array(head, arr, lines, depth + 1, delimiter)

    def _encode_nested_array_item(
        self, arr: list, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Encode an array that is itself a list item: ``- [N]: v1,v2``."""
        head = f"{_indent_str(depth, self.indent)}- "
        self._emit_array(head, arr, lines, depth + 1, delimiter, allow_tabular=False)

    def _encode_list_item(
        self, item: Any, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Encode a single item of an expanded (list-form) array."""
        if isinstance(item, dict):
            # An empty object becomes a bare hyphen.
            self._encode_list_object_item(item, lines, depth, delimiter)
        elif isinstance(item, list):
            self._encode_nested_array_item(item, lines, depth, delimiter)
        else:
            # Primitives and, via _encode_value's fallback, any unexpected
            # type: always emit a line so the header's [N] stays correct.
            ind = _indent_str(depth, self.indent)
            lines.append(f"{ind}- {_encode_value(item, delimiter)}")

    def _encode_list_object_item(
        self, obj: dict, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """
        Encode an object as a list item (§10).

        The first field shares the ``- `` line (a key/value pair or an array
        header).  The remaining fields follow at depth+1.  When the first
        field is an array, its rows or items go at depth+2.
        """
        ind = _indent_str(depth, self.indent)
        if not obj:
            lines.append(f"{ind}-")
            return
        members = iter(obj.items())
        first_key, first_val = next(members)
        self._emit_first_list_field(first_key, first_val, ind, lines, depth, delimiter)
        for key, value in members:
            self._emit_list_sibling(key, value, lines, depth, delimiter)

    def _emit_first_list_field(
        self,
        key: str,
        value: Any,
        ind: str,
        lines: list[str],
        depth: int,
        delimiter: str,
    ) -> None:
        """Emit the first field of a list-item object on the ``- `` line."""
        if isinstance(value, list):
            self._emit_list_array_field(key, value, ind, lines, depth, delimiter)
            return
        ek = _encode_key(key, delimiter)
        if isinstance(value, dict):
            lines.append(f"{ind}- {ek}:")
            # Nested members sit two levels below the hyphen.
            self._encode_object(value, lines, depth + 2, delimiter)
        else:
            ev = _encode_value(value, delimiter, is_object_value=True)
            lines.append(f"{ind}- {ek}: {ev}")

    def _emit_list_array_field(
        self,
        key: str,
        arr: list,
        ind: str,
        lines: list[str],
        depth: int,
        delimiter: str,
    ) -> None:
        """Emit an array field on the ``- `` line; its children go at depth+2."""
        head = f"{ind}- {_encode_key(key, delimiter)}"
        self._emit_array(head, arr, lines, depth + 2, delimiter)

    def _emit_list_sibling(
        self, key: str, value: Any, lines: list[str], depth: int, delimiter: str
    ) -> None:
        """Emit a non-first field of a list-item object at depth+1.

        Siblings follow the ordinary object-field rules, so a uniform array
        of objects is written in tabular form here as well.
        """
        self._encode_field(key, value, lines, depth + 1, delimiter)

    def _try_tabular(
        self, arr: list[dict], delimiter: str
    ) -> tuple[str, list[str]] | None:
        """
        Try to encode an array of dicts as a tabular (CSV-like) block.

        Returns ``(fields_str, [row_str, ...])`` when every object has the
        same key set as the first one and only primitive values.  Returns
        None when the array is not eligible.  Column order follows the
        first object.
        """
        if not arr:
            return None
        first_keys = list(arr[0].keys())
        if not first_keys:
            return None  # empty objects → list format
        expected = set(first_keys)
        for obj in arr:
            if set(obj.keys()) != expected:
                return None
            if not all(_is_primitive(v) for v in obj.values()):
                return None
        fields_str = delimiter.join(
            _encode_field_name(f, delimiter) for f in first_keys
        )
        rows = [
            delimiter.join(_encode_value(obj[f], delimiter) for f in first_keys)
            for obj in arr
        ]
        return fields_str, rows
| # ── Module-level helpers ───────────────────────────────────────────────────── | |
| def _is_primitive(v: Any) -> bool: | |
| """Check if a value is a JSON primitive.""" | |
| return v is None or isinstance(v, (bool, int, float, str)) | |
| # ── Public API ─────────────────────────────────────────────────────────────── | |
def toon(data: Any, *, indent: int = 2, delimiter: str = ",") -> str:
    """
    Render a JSON-compatible Python object as TOON text.

    Args:
        data: A JSON-serializable Python value (dict, list, str, int, float, bool, None).
        indent: Spaces per indentation level (default 2).
        delimiter: Array delimiter — ',', '\\t', or '|' (default ',').

    Returns:
        A TOON-formatted string.

    Examples:
        >>> print(toon({"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "count": 2}))
        users[2]{id,name}:
          1,Alice
          2,Bob
        count: 2
        >>> print(toon({"tags": ["a", "b", "c"]}))
        tags[3]: a,b,c
    """
    return TOONEncoder(indent=indent, delimiter=delimiter).encode(data)
def json_to_toon(json_str: str, *, indent: int = 2, delimiter: str = ",") -> str:
    """
    Decode a JSON document held in a string and render it as TOON.

    Args:
        json_str: A valid JSON string.
        indent: Spaces per indentation level (default 2).
        delimiter: Array delimiter — ',', '\\t', or '|' (default ',').

    Returns:
        A TOON-formatted string.
    """
    # Parse first so malformed input surfaces as the json module's own error.
    parsed = json.loads(json_str)
    options = {"indent": indent, "delimiter": delimiter}
    return toon(parsed, **options)
| # ── CLI ────────────────────────────────────────────────────────────────────── | |
def _main() -> None:
    """CLI entry point: read JSON from a file argument or stdin, print TOON.

    Options:
        file: Optional path to a JSON file; stdin is read when omitted.
        -i/--indent: Spaces per indent level (default 2).
        -d/--delimiter: One of ``,``, ``tab`` or ``pipe`` (default ``,``).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert JSON to TOON (Token-Oriented Object Notation)."
    )
    parser.add_argument("file", nargs="?", help="JSON file (reads stdin if omitted)")
    parser.add_argument(
        "-i", "--indent", type=int, default=2, help="Spaces per indent level (default: 2)"
    )
    parser.add_argument(
        "-d",
        "--delimiter",
        choices=[",", "tab", "pipe"],
        default=",",
        help="Array delimiter (default: comma)",
    )
    args = parser.parse_args()
    # Map the CLI spelling to the actual delimiter character.
    delim = {",": ",", "tab": "\t", "pipe": "|"}[args.delimiter]
    if args.file:
        # Open in binary: json.load detects UTF-8/16/32 on its own, so the
        # result no longer depends on the platform's default text encoding.
        with open(args.file, "rb") as f:
            data = json.load(f)
    else:
        data = json.load(sys.stdin)
    print(toon(data, indent=args.indent, delimiter=delim))


if __name__ == "__main__":
    _main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment