gawk JSON serializer / deserializer. See the END {...} section for example usage, as well as the discussion below this script for further examples. Read the syntax comments within the function definitions.
#!/usr/bin/gawk -f
# Example usage: LC_ALL=C ./json.awk datafile.json
{ json = json $0 }
END {
    # Build "obj" as a true multidimensional array from string data
    deserialize(json, obj)
    # Retrieve a value. For numerically indexed arrays, the first element is 1, not 0.
    # print obj[1]["name"]
    # Stringify a multidimensional array. Indent with tabs.
    if (isarray(obj)) print serialize(obj, "\t")
}
# === FUNCTIONS ===
function join(arr, sep, _p, i) {
    # syntax: join(array, string separator)
    # returns a string
    for (i in arr) {
        # append the separator only once the result already has content
        _p["result"] = _p["result"] ~ "[[:print:]]" ? _p["result"] sep arr[i] : arr[i]
    }
    return _p["result"]
}
function quote(str) {
    gsub(/\\/, "\\\\", str)
    # escape embedded double quotes (without this, such values break the JSON output)
    gsub(/"/, "\\\"", str)
    gsub(/\r/, "\\r", str)
    gsub(/\n/, "\\n", str)
    gsub(/\t/, "\\t", str)
    return "\"" str "\""
}
function serialize(arr, indent_with, depth, _p, i, idx) {
    # syntax: serialize(array of arrays, indent string)
    # returns a JSON-formatted string
    # sort arrays on key; this ensures [...] values remain properly ordered
    if (!PROCINFO["sorted_in"]) PROCINFO["sorted_in"] = "@ind_num_asc"
    # determine whether the array is indexed or associative
    for (i in arr) {
        _p["assoc"] = or(_p["assoc"], !(++_p["idx"] in arr))
    }
    # if associative, indent
    if (_p["assoc"]) {
        for (i = ++depth; i--;) {
            _p["end"] = _p["indent"]; _p["indent"] = _p["indent"] indent_with
        }
    }
    for (i in arr) {
        # if the key length is 0, assume it's an empty object
        if (!length(i)) return "{}"
        # quote the key if not already quoted
        _p["key"] = i !~ /^".*"$/ ? quote(i) : i
        if (isarray(arr[i])) {
            if (_p["assoc"]) {
                _p["json"][++idx] = _p["indent"] _p["key"] ": " \
                    serialize(arr[i], indent_with, depth)
            } else {
                # if indexed array, don't print keys
                _p["json"][++idx] = serialize(arr[i], indent_with, depth)
            }
        } else {
            # quote unless numeric, boolean, null, already quoted, or too big for match()
            if (!((arr[i] ~ /^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$/ && arr[i] !~ /^-?0[0-9]/) ||
                    arr[i] ~ /^(true|false|null|".*")$/) || length(arr[i]) > 1000)
                arr[i] = quote(arr[i])
            _p["json"][++idx] = _p["assoc"] ? _p["indent"] _p["key"] ": " arr[i] : arr[i]
        }
    }
    # I trial-and-errored the hell out of this. The problem is that gawk can't
    # distinguish between a value of null and no value. This hack is as close as
    # I can get, although [""] will become [].
    if (!_p["assoc"] && join(_p["json"]) == "\"\"") return "[]"
    # surround with curly braces if an object, square brackets if an array
    return _p["assoc"] ? "{\n" join(_p["json"], ",\n") "\n" _p["end"] "}" \
        : "[" join(_p["json"], ", ") "]"
}
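# Illustrative usage (hypothetical data, not part of the original script): given
#   rec["name"] = "Ada"; rec["tags"][1] = "dev"; rec["tags"][2] = "ops"
# serialize(rec, "\t") returns something like (key order per PROCINFO["sorted_in"]):
#   {
#       "name": "Ada",
#       "tags": ["dev", "ops"]
#   }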
function trim(str) { gsub(/^\s+|\s+$/, "", str); return str }
function unquote(str) {
    gsub(/^'|'$/, "", str)
    gsub(/^"|"$/, "", str)
    # unescape embedded double quotes (mirrors quote() above)
    gsub(/\\"/, "\"", str)
    gsub(/\\r/, "\r", str)
    gsub(/\\n/, "\n", str)
    gsub(/\\t/, "\t", str)
    gsub(/\\{2}/, "\\", str)
    return trim(str)
}
function tokenize(str, arr, splitchar, _p) {
    # syntax: tokenize(JSON-formatted string, array to populate, char to split on)
    # arr populates with matches split on unbracketed, unquoted splitchar
    # returns the length of arr
    # This function supplants fpat / patsplit, since those methods cannot reliably
    # group mated bracket pairs
    while (++_p["pos"] <= length(str)) {
        _p["char"] = substr(str, _p["pos"], 1)
        switch (_p["char"]) {
            case "[": if (!_p["\""] && !_p["\\"]) _p["["]++; _p["\\"] = false; break
            case "{": if (!_p["\""] && !_p["\\"]) _p["{"]++; _p["\\"] = false; break
            case "}": if (!_p["\""] && !_p["\\"]) _p["{"]--; _p["\\"] = false; break
            case "]": if (!_p["\""] && !_p["\\"]) _p["["]--; _p["\\"] = false; break
            case "\"": if (!_p["\\"]) _p["\""] = !_p["\""]; _p["\\"] = false; break
            case "\\": _p["\\"] = !_p["\\"]; break
            default: _p["\\"] = false
        }
        if (_p["char"] == splitchar && !_p["["] && !_p["{"] && !_p["\""] && !_p["\\"]) {
            arr[++_p["idx"]] = trim(_p["segment"])
            delete _p["segment"]
        } else {
            _p["segment"] = _p["segment"] _p["char"]
        }
    }
    arr[++_p["idx"]] = trim(_p["segment"])
    return _p["idx"]
}
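# Illustrative example (input string is mine): tokenize("\"a\": [1, 2], \"b\": {\"c\": 3}", parts, ",")
# returns 2, populating parts[1] = "\"a\": [1, 2]" and parts[2] = "\"b\": {\"c\": 3}".
# The comma nested inside [1, 2] is protected by the bracket counters.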
function deserialize(json, arr, _p, _parts, _values, _keyval, i, j) {
    # syntax: deserialize(JSON-formatted string, array to populate)
    # The resulting array is true multidimensional (arr[idx][idx][etc...]),
    # not concatenated-index (arr[idx,idx,etc...])
    # consume the outer brackets / braces
    # note: match() failed here with very large JSON data
    json = trim(json)
    _parts[1] = substr(json, 1, 1)
    _parts[2] = substr(json, 2, length(json) - 2)
    # split on unbracketed, unquoted commas
    _p["outie"] = tokenize(trim(_parts[2]), _values, ",")
    for (i = 1; i <= _p["outie"]; i++) {
        # build an associative array
        if (_parts[1] ~ "{") {
            # split on unbracketed, unquoted colons
            _p["innie"] = tokenize(trim(_values[i]), _keyval, ":")
            for (j = 1; j <= _p["innie"]; j += 2) {
                # if the value begins with a bracket or brace, recurse
                if (trim(_keyval[j+1]) ~ /^[\[\{]/) {
                    # init array element as explicit array (defaults to scalar without this)
                    arr[unquote(_keyval[j])][0]; delete arr[unquote(_keyval[j])][0]
                    # do recurse
                    deserialize(_keyval[j+1], arr[unquote(_keyval[j])])
                } else {
                    arr[unquote(_keyval[j])] = unquote(_keyval[j+1])
                }
            }
        # build a numerically indexed array
        } else {
            while (++_p["idx"] in arr) {}
            # if the value begins with a bracket or brace, recurse
            if (trim(_values[i]) ~ /^[\[\{]/) {
                # init array element as explicit array (defaults to scalar without this)
                arr[_p["idx"]][0]; delete arr[_p["idx"]][0]
                # do recurse
                deserialize(trim(_values[i]), arr[_p["idx"]])
            } else {
                arr[_p["idx"]] = unquote(_values[i])
            }
        }
    }
}
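To call the functions directly, something like this works (a minimal sketch; the sample JSON and array names are mine):

BEGIN {
    json = "{\"user\": \"ada\", \"langs\": [\"awk\", \"sed\"]}"
    deserialize(json, obj)
    print obj["user"]         # ada
    print obj["langs"][2]     # sed -- numerically indexed arrays start at 1
    obj["langs"][3] = "grep"  # modify in place, then re-serialize
    print serialize(obj, "\t")
}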
Here's a practical example. Do something like this to translate CSV input into JSON output:
BEGIN {
# use field pattern rather than field separator to split input on unquoted commas
FPAT = "([^,]*)|(\"[^\"]+\")"
# read and store the first row as a header row
getline header
# split the header row into column definitions
patsplit(header, cols)
}
# For each line of input....
{
# For each field...
for (i = 1; i <= NF; i++) {
# obj["results"][row][key] = value
obj["results"][NR - 1][cols[i]] = $i
}
}
END { print serialize(obj, "\t") }
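For instance, given a hypothetical datafile.csv like this:

name,role
Ada,"dev,ops"
Bob,qa

the script above (with the functions included) should print roughly:

{
	"results": [{
		"name": "Ada",
		"role": "dev,ops"
	}, {
		"name": "Bob",
		"role": "qa"
	}]
}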
Or you can translate an INI file into JSON:
# split on unquoted equals and whitespace
BEGIN { FPAT = "([^=[:space:]]*)|(\"[^\"]+\")" }
# remove carriage returns
{ gsub(/\r/, "") }
# match non-blank, non-comment lines
$1 ~ /^[^#;]/ {
switch ($0) {
# match [str...]
case /^\[.+\]/:
section = gensub(/[\[\]]/, "", "g")
break
# match / skip lines not containing =
case /^[^=]*$/: break
default:
if (NF > 2) {
# if value contains multiple tokens, create array
for (i = 2; i <= NF; i++) obj[section][$1][i-1] = unquote($i)
} else {
# else flat value
obj[section][$1] = unquote($2)
}
}
}
END {
if (isarray(obj)) print serialize(obj, "\t")
}
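Given a hypothetical settings.ini such as the following (the FPAT splits fields on both = and whitespace, so the unspaced key=value form tokenizes most predictably):

[server]
host=example.com
ports=8080 8081

the output should look roughly like:

{
	"server": {
		"host": "example.com",
		"ports": [8080, 8081]
	}
}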
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
As another option, one could delete everything above # === FUNCTIONS === and use @include "path/to/json.awk" to import the serialize() / deserialize() methods into an existing awk script.
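For example, a minimal wrapper script (path hypothetical; gawk searches AWKPATH for @include files) might look like:

@include "path/to/json.awk"
{ json = json $0 }
END {
    deserialize(json, data)
    # ... read or modify data[...] here ...
    print serialize(data, "\t")
}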