calraith · March 7, 2021 05:56 · calraith · Jun 1, 2018
diff --git a/json.awk b/json.awk
 #!/usr/bin/gawk -f

 # Example usage: LC_ALL=C ./json.awk datafile.json

 # Main rule: append every input record to "json". Record separators are not
 # re-inserted, so the whole input ends up as one unbroken string.
 {
 	json = json $0
 }

 # After the last record: parse the accumulated text, then print it back out.
 END {
 	# Populate "obj" as a true multidimensional array from the string data.
 	deserialize(json, obj)

 	# Element access example; numerically indexed arrays start at 1, not 0:
 	# print obj[1]["name"]

 	# Emit the array as JSON again, using one tab per indent level.
 	if (isarray(obj)) {
 		print serialize(obj, "\t")
 	}
 }

 # === FUNCTIONS ===

 function join(arr, sep, _p, i) {
 	# syntax: join(array, string separator)
 	# returns the elements of arr concatenated in "for (i in arr)" traversal
 	# order with sep between consecutive elements; "" for an empty array
 	#
 	# _p and i are locals; callers pass only arr and sep.

 	for (i in arr) {
 		# Count the elements seen instead of inspecting the text built so far:
 		# an empty (or non-printable) leading element must still be followed
 		# by a separator rather than being silently overwritten.
 		if (_p["count"]++) {
 			_p["result"] = _p["result"] sep arr[i]
 		} else {
 			_p["result"] = arr[i]
 		}
 	}
 	return _p["result"]
 }

 function quote(str) {
 	# syntax: quote(string)
 	# returns str wrapped in double quotes with backslash, double quote,
 	# carriage return, newline and tab escaped JSON-style

 	# Backslash and double quote are both escaped by prefixing a backslash.
 	# The replacement "\\\\&" reaches gsub() as \\& : a literal backslash
 	# followed by the matched character.
 	gsub(/[\\"]/, "\\\\&", str)
 	gsub(/\r/, "\\r", str)
 	gsub(/\n/, "\\n", str)
 	gsub(/\t/, "\\t", str)
 	return "\"" str "\""
 }

 function serialize(arr, indent_with, depth, _p, i, idx) {
 	# syntax: serialize(array of arrays, indent string)
 	# returns a JSON formatted string; arr itself is not modified
 	#
 	# depth, _p, i and idx are locals / recursion state: outside callers pass
 	# only arr and indent_with.
 	#
 	# Side effect: when PROCINFO["sorted_in"] is unset it is set to
 	# "@ind_num_asc" and left that way after the call.

 	# sort arrays on key, ensures [...] values remain properly ordered
 	if (!PROCINFO["sorted_in"]) PROCINFO["sorted_in"] = "@ind_num_asc"

 	# arr counts as indexed only when it holds the keys 1..n for its n
 	# elements; anything else is written as an object
 	for (i in arr) {
 		if (!(++_p["idx"] in arr)) _p["assoc"] = 1
 	}

 	# objects are indented one level deeper than their parent; "end" is the
 	# indentation used for the closing brace
 	if (_p["assoc"]) {
 		for (i = ++depth; i--;) {
 			_p["end"] = _p["indent"]; _p["indent"] = _p["indent"] indent_with
 		}
 	}

 	for (i in arr) {
 		# a zero-length key is taken to mean an empty object
 		if (!length(i)) return "{}"

 		# quote key if not already quoted
 		_p["key"] = i !~ /^".*"$/ ? quote(i) : i

 		if (isarray(arr[i])) {
 			_p["value"] = serialize(arr[i], indent_with, depth)
 		} else {
 			# Work on a copy so the caller's array keeps its raw values.
 			_p["value"] = arr[i]

 			# Emit bare only for a JSON number (optional sign, fraction and
 			# exponent), for exactly true / false / null, or for text that is
 			# already quoted.  Everything else is quoted, as is anything
 			# longer than 1000 characters.  The alternation is grouped so the
 			# anchors apply to every branch ("falsehood" must be quoted).
 			if (length(_p["value"]) > 1000 ||
 				!(_p["value"] ~ /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?$/ ||
 				_p["value"] ~ /^(true|false|null|".*")$/))
 				_p["value"] = quote(_p["value"])
 		}

 		# indexed arrays print values only; objects print indented key: value
 		_p["json"][++idx] = _p["assoc"] ? _p["indent"] _p["key"] ": " _p["value"] : _p["value"]
 	}

 	# gawk cannot tell a null value from no value at all, so an indexed array
 	# whose whole content is "" is treated as empty.  Known limitation:
 	# [""] also becomes [].
 	if (!_p["assoc"] && join(_p["json"]) == "\"\"") return "[]"

 	# surround with curly braces if object, square brackets if array
 	return _p["assoc"] ? "{\n" join(_p["json"], ",\n") "\n" _p["end"] "}" \
 		: "[" join(_p["json"], ", ") "]"
 }

 function trim(str) {
 	# syntax: trim(string)
 	# returns str with leading and trailing whitespace removed
 	sub(/^\s+/, "", str)
 	sub(/\s+$/, "", str)
 	return str
 }

 function unquote(str) {
 	# syntax: unquote(token text)
 	# returns the token with surrounding whitespace and quotes removed and the
 	# escapes \\ \r \n \t \" \/ decoded.  Tokens without quotes or escapes
 	# (numbers, true, false, null) come back unchanged.

 	# Trim before the quotes come off, so whitespace that belongs to the
 	# string's content (including decoded \n / \t at its edges) survives.
 	str = trim(str)
 	gsub(/^'|'$/, "", str)
 	gsub(/^"|"$/, "", str)

 	# Park escaped backslashes on a placeholder first.  Otherwise the "n" in
 	# \\n (an escaped backslash followed by a plain n) would be decoded as a
 	# newline.  Assumes the input holds no raw \001 byte; JSON requires
 	# control characters inside strings to be escaped.
 	gsub(/\\\\/, "\001", str)
 	gsub(/\\r/, "\r", str)
 	gsub(/\\n/, "\n", str)
 	gsub(/\\t/, "\t", str)
 	gsub(/\\"/, "\"", str)
 	gsub(/\\\//, "/", str)
 	gsub(/\001/, "\\", str)
 	return str
 }

 function tokenize(str, arr, splitchar, _p) {
 	# syntax: tokenize(JSON-formatted string, array to populate, char to split on)
 	# Splits str at every splitchar that is outside double quotes, outside
 	# any [...] / {...} group and not preceded by an escaping backslash.
 	# The trimmed pieces are stored in arr[1], arr[2], ...
 	# returns the number of pieces stored (at least 1, even for "")
 	# Hand-rolled because fpat / patsplit cannot reliably group mated
 	# bracket pairs.

 	_p["len"] = length(str)
 	for (_p["pos"] = 1; _p["pos"] <= _p["len"]; _p["pos"]++) {

 		_p["c"] = substr(str, _p["pos"], 1)

 		if (_p["c"] == "\\") {
 			# a backslash escapes the next character unless it is escaped itself
 			_p["esc"] = !_p["esc"]
 		} else {
 			if (!_p["esc"]) {
 				if (_p["c"] == "\"") {
 					_p["quoted"] = !_p["quoted"]
 				} else if (!_p["quoted"]) {
 					# nesting depth only changes outside of strings
 					if (_p["c"] == "[") {
 						_p["square"]++
 					} else if (_p["c"] == "]") {
 						_p["square"]--
 					} else if (_p["c"] == "{") {
 						_p["curly"]++
 					} else if (_p["c"] == "}") {
 						_p["curly"]--
 					}
 				}
 			}
 			# any character other than a backslash ends a pending escape
 			_p["esc"] = 0
 		}

 		if (_p["c"] == splitchar && !_p["square"] && !_p["curly"] && !_p["quoted"] && !_p["esc"]) {
 			# top-level separator: close the current piece (separator dropped)
 			arr[++_p["n"]] = trim(_p["piece"])
 			_p["piece"] = ""
 		} else {
 			_p["piece"] = _p["piece"] _p["c"]
 		}
 	}

 	# whatever follows the last separator is the final piece
 	arr[++_p["n"]] = trim(_p["piece"])
 	return _p["n"]
 }

 function deserialize(json, arr, _p, _parts, _values, _keyval, i, j) {
 	# syntax: deserialize (JSON-formatted string, array to populate)
 	# Fills arr as a true multidimensional array (arr[idx][idx][etc...]),
 	# not a concatenated-index one (arr[idx,idx,etc...]).
 	# Everything after arr in the parameter list is a local.

 	# Peel off the outer bracket / brace pair with substr(); the original
 	# author notes that match() failed here with very large JSON data.
 	json = trim(json)
 	_parts[1] = substr(json, 1, 1)
 	_parts[2] = substr(json, 2, length(json) - 2)

 	# the opening character decides between object and indexed array
 	_p["is_object"] = (_parts[1] ~ "{")

 	# split the body on unbracketed, unquoted commas
 	_p["outie"] = tokenize(trim(_parts[2]), _values, ",")

 	for (i = 1; i <= _p["outie"]; i++) {

 		if (_p["is_object"]) {

 			# split the member on unbracketed, unquoted colons
 			_p["innie"] = tokenize(trim(_values[i]), _keyval, ":")

 			for (j = 1; j <= _p["innie"]; j += 2) {
 				_p["key"] = unquote(_keyval[j])
 				_p["raw"] = _keyval[j+1]

 				if (trim(_p["raw"]) ~ /^[\[\{]/) {
 					# nested container: force the element to be an array
 					# first (it would default to a scalar), then recurse
 					arr[_p["key"]][0]; delete arr[_p["key"]][0]
 					deserialize(_p["raw"], arr[_p["key"]])
 				} else {
 					arr[_p["key"]] = unquote(_p["raw"])
 				}
 			}
 			continue
 		}

 		# indexed array: take the next index not yet present in arr
 		while (++_p["idx"] in arr) {}

 		_p["raw"] = trim(_values[i])
 		if (_p["raw"] ~ /^[\[\{]/) {
 			# nested container: force the element to be an array first
 			# (it would default to a scalar), then recurse
 			arr[_p["idx"]][0]; delete arr[_p["idx"]][0]
 			deserialize(_p["raw"], arr[_p["idx"]])
 		} else {
 			arr[_p["idx"]] = unquote(_values[i])
 		}
 	}
 }
	#!/usr/bin/gawk -f

	# Example usage: LC_ALL=C ./json.awk datafile.json

	# Main rule: append every input record to "json". Record separators are not
	# re-inserted, so the whole input ends up as one unbroken string.
	{
		json = json $0
	}

	# After the last record: parse the accumulated text, then print it back out.
	END {
		# Populate "obj" as a true multidimensional array from the string data.
		deserialize(json, obj)

		# Element access example; numerically indexed arrays start at 1, not 0:
		# print obj[1]["name"]

		# Emit the array as JSON again, using one tab per indent level.
		if (isarray(obj)) {
			print serialize(obj, "\t")
		}
	}

	# === FUNCTIONS ===

	function join(arr, sep, _p, i) {
		# syntax: join(array, string separator)
		# returns the elements of arr concatenated in "for (i in arr)" traversal
		# order with sep between consecutive elements; "" for an empty array
		#
		# _p and i are locals; callers pass only arr and sep.

		for (i in arr) {
			# Count the elements seen instead of inspecting the text built so
			# far: an empty (or non-printable) leading element must still be
			# followed by a separator rather than being silently overwritten.
			if (_p["count"]++) {
				_p["result"] = _p["result"] sep arr[i]
			} else {
				_p["result"] = arr[i]
			}
		}
		return _p["result"]
	}

	function quote(str) {
		# syntax: quote(string)
		# returns str wrapped in double quotes with backslash, double quote,
		# carriage return, newline and tab escaped JSON-style

		# Backslash and double quote are both escaped by prefixing a backslash.
		# The replacement "\\\\&" reaches gsub() as \\& : a literal backslash
		# followed by the matched character.
		gsub(/[\\"]/, "\\\\&", str)
		gsub(/\r/, "\\r", str)
		gsub(/\n/, "\\n", str)
		gsub(/\t/, "\\t", str)
		return "\"" str "\""
	}

	function serialize(arr, indent_with, depth, _p, i, idx) {
		# syntax: serialize(array of arrays, indent string)
		# returns a JSON formatted string; arr itself is not modified
		#
		# depth, _p, i and idx are locals / recursion state: outside callers
		# pass only arr and indent_with.
		#
		# Side effect: when PROCINFO["sorted_in"] is unset it is set to
		# "@ind_num_asc" and left that way after the call.

		# sort arrays on key, ensures [...] values remain properly ordered
		if (!PROCINFO["sorted_in"]) PROCINFO["sorted_in"] = "@ind_num_asc"

		# arr counts as indexed only when it holds the keys 1..n for its n
		# elements; anything else is written as an object
		for (i in arr) {
			if (!(++_p["idx"] in arr)) _p["assoc"] = 1
		}

		# objects are indented one level deeper than their parent; "end" is
		# the indentation used for the closing brace
		if (_p["assoc"]) {
			for (i = ++depth; i--;) {
				_p["end"] = _p["indent"]; _p["indent"] = _p["indent"] indent_with
			}
		}

		for (i in arr) {
			# a zero-length key is taken to mean an empty object
			if (!length(i)) return "{}"

			# quote key if not already quoted
			_p["key"] = i !~ /^".*"$/ ? quote(i) : i

			if (isarray(arr[i])) {
				_p["value"] = serialize(arr[i], indent_with, depth)
			} else {
				# Work on a copy so the caller's array keeps its raw values.
				_p["value"] = arr[i]

				# Emit bare only for a JSON number (optional sign, fraction
				# and exponent), for exactly true / false / null, or for text
				# that is already quoted.  Everything else is quoted, as is
				# anything longer than 1000 characters.  The alternation is
				# grouped so the anchors apply to every branch ("falsehood"
				# must be quoted).
				if (length(_p["value"]) > 1000 ||
					!(_p["value"] ~ /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?$/ ||
					_p["value"] ~ /^(true|false|null|".*")$/))
					_p["value"] = quote(_p["value"])
			}

			# indexed arrays print values only; objects print indented key: value
			_p["json"][++idx] = _p["assoc"] ? _p["indent"] _p["key"] ": " _p["value"] : _p["value"]
		}

		# gawk cannot tell a null value from no value at all, so an indexed
		# array whose whole content is "" is treated as empty.  Known
		# limitation: [""] also becomes [].
		if (!_p["assoc"] && join(_p["json"]) == "\"\"") return "[]"

		# surround with curly braces if object, square brackets if array
		return _p["assoc"] ? "{\n" join(_p["json"], ",\n") "\n" _p["end"] "}" \
			: "[" join(_p["json"], ", ") "]"
	}

	function trim(str) {
		# syntax: trim(string)
		# returns str with leading and trailing whitespace removed
		#
		# Each end gets its own anchored pattern.  A backslash-escaped pipe
		# between the two halves would match a literal "|" instead of acting
		# as alternation, and the function would never trim anything.
		sub(/^\s+/, "", str)
		sub(/\s+$/, "", str)
		return str
	}

	function unquote(str) {
		# syntax: unquote(token text)
		# returns the token with surrounding whitespace and quotes removed and
		# the escapes \\ \r \n \t \" \/ decoded.  Tokens without quotes or
		# escapes (numbers, true, false, null) come back unchanged.

		# Trim before the quotes come off, so whitespace that belongs to the
		# string's content (including decoded \n / \t at its edges) survives.
		str = trim(str)

		# The pipe must stay unescaped here: it is the alternation between
		# "quote at the start" and "quote at the end".
		gsub(/^'|'$/, "", str)
		gsub(/^"|"$/, "", str)

		# Park escaped backslashes on a placeholder first.  Otherwise the "n"
		# in \\n (an escaped backslash followed by a plain n) would be decoded
		# as a newline.  Assumes the input holds no raw \001 byte; JSON
		# requires control characters inside strings to be escaped.
		gsub(/\\\\/, "\001", str)
		gsub(/\\r/, "\r", str)
		gsub(/\\n/, "\n", str)
		gsub(/\\t/, "\t", str)
		gsub(/\\"/, "\"", str)
		gsub(/\\\//, "/", str)
		gsub(/\001/, "\\", str)
		return str
	}

	function tokenize(str, arr, splitchar, _p) {
		# syntax: tokenize(JSON-formatted string, array to populate, char to split on)
		# Splits str at every splitchar that is outside double quotes, outside
		# any [...] / {...} group and not preceded by an escaping backslash.
		# The trimmed pieces are stored in arr[1], arr[2], ...
		# returns the number of pieces stored (at least 1, even for "")
		# Hand-rolled because fpat / patsplit cannot reliably group mated
		# bracket pairs.

		_p["len"] = length(str)
		for (_p["pos"] = 1; _p["pos"] <= _p["len"]; _p["pos"]++) {

			_p["c"] = substr(str, _p["pos"], 1)

			if (_p["c"] == "\\") {
				# a backslash escapes the next character unless it is escaped itself
				_p["esc"] = !_p["esc"]
			} else {
				if (!_p["esc"]) {
					if (_p["c"] == "\"") {
						_p["quoted"] = !_p["quoted"]
					} else if (!_p["quoted"]) {
						# nesting depth only changes outside of strings
						if (_p["c"] == "[") {
							_p["square"]++
						} else if (_p["c"] == "]") {
							_p["square"]--
						} else if (_p["c"] == "{") {
							_p["curly"]++
						} else if (_p["c"] == "}") {
							_p["curly"]--
						}
					}
				}
				# any character other than a backslash ends a pending escape
				_p["esc"] = 0
			}

			if (_p["c"] == splitchar && !_p["square"] && !_p["curly"] && !_p["quoted"] && !_p["esc"]) {
				# top-level separator: close the current piece (separator dropped)
				arr[++_p["n"]] = trim(_p["piece"])
				_p["piece"] = ""
			} else {
				_p["piece"] = _p["piece"] _p["c"]
			}
		}

		# whatever follows the last separator is the final piece
		arr[++_p["n"]] = trim(_p["piece"])
		return _p["n"]
	}

	function deserialize(json, arr, _p, _parts, _values, _keyval, i, j) {
		# syntax: deserialize (JSON-formatted string, array to populate)
		# Fills arr as a true multidimensional array (arr[idx][idx][etc...]),
		# not a concatenated-index one (arr[idx,idx,etc...]).
		# Everything after arr in the parameter list is a local.

		# Peel off the outer bracket / brace pair with substr(); the original
		# author notes that match() failed here with very large JSON data.
		json = trim(json)
		_parts[1] = substr(json, 1, 1)
		_parts[2] = substr(json, 2, length(json) - 2)

		# the opening character decides between object and indexed array
		_p["is_object"] = (_parts[1] ~ "{")

		# split the body on unbracketed, unquoted commas
		_p["outie"] = tokenize(trim(_parts[2]), _values, ",")

		for (i = 1; i <= _p["outie"]; i++) {

			if (_p["is_object"]) {

				# split the member on unbracketed, unquoted colons
				_p["innie"] = tokenize(trim(_values[i]), _keyval, ":")

				for (j = 1; j <= _p["innie"]; j += 2) {
					_p["key"] = unquote(_keyval[j])
					_p["raw"] = _keyval[j+1]

					if (trim(_p["raw"]) ~ /^[\[\{]/) {
						# nested container: force the element to be an array
						# first (it would default to a scalar), then recurse
						arr[_p["key"]][0]; delete arr[_p["key"]][0]
						deserialize(_p["raw"], arr[_p["key"]])
					} else {
						arr[_p["key"]] = unquote(_p["raw"])
					}
				}
				continue
			}

			# indexed array: take the next index not yet present in arr
			while (++_p["idx"] in arr) {}

			_p["raw"] = trim(_values[i])
			if (_p["raw"] ~ /^[\[\{]/) {
				# nested container: force the element to be an array first
				# (it would default to a scalar), then recurse
				arr[_p["idx"]][0]; delete arr[_p["idx"]][0]
				deserialize(_p["raw"], arr[_p["idx"]])
			} else {
				arr[_p["idx"]] = unquote(_values[i])
			}
		}
	}