@calraith · Last active March 7, 2021
gawk JSON serializer / deserializer. See the END {...} section for example usage, as well as the discussion below this script for further examples. Read the syntax comments within the function definitions.
#!/usr/bin/gawk -f
# Example usage: LC_ALL=C ./json.awk datafile.json
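# (also works on stdin, e.g.: cat datafile.json | LC_ALL=C gawk -f json.awk)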
{ json = json $0 }

END {
	# Build "obj" as a true multidimensional array from string data
	deserialize(json, obj)

	# Retrieve a value. For numerically indexed arrays, the first element is 1, not 0.
	# print obj[1]["name"]

	# Stringify a multidimensional array. Indent with tabs.
	if (isarray(obj)) print serialize(obj, "\t")
}
# === FUNCTIONS ===
function join(arr, sep, _p, i) {
	# syntax: join(array, string separator)
	# returns a string
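	# Example: given arr[1] = "a" and arr[2] = "b", join(arr, ", ") -> "a, b"
	# (traversal order follows PROCINFO["sorted_in"] when set)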
	for (i in arr) {
		_p["result"] = _p["result"] ~ "[[:print:]]" ? _p["result"] sep arr[i] : arr[i]
	}
	return _p["result"]
}
function quote(str) {
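	# syntax: quote(string)
	# returns the string wrapped in double quotes, with backslashes, carriage
	# returns, newlines, and tabs escaped (embedded double quotes are left as-is)
	# Example: quote("a\tb") -> "\"a\\tb\""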
	gsub(/\\/, "\\\\", str)
	gsub(/\r/, "\\r", str)
	gsub(/\n/, "\\n", str)
	gsub(/\t/, "\\t", str)
	return "\"" str "\""
}
function serialize(arr, indent_with, depth, _p, i, idx) {
	# syntax: serialize(array of arrays, indent string)
	# returns a JSON-formatted string

	# sort arrays on key; this ensures [...] values remain properly ordered
	if (!PROCINFO["sorted_in"]) PROCINFO["sorted_in"] = "@ind_num_asc"

	# determine whether array is indexed or associative
	for (i in arr) {
		_p["assoc"] = or(_p["assoc"], !(++_p["idx"] in arr))
	}

	# if associative, indent
	if (_p["assoc"]) {
		for (i = ++depth; i--;) {
			_p["end"] = _p["indent"]; _p["indent"] = _p["indent"] indent_with
		}
	}

	for (i in arr) {
		# if key length is 0, assume it's an empty object
		if (!length(i)) return "{}"

		# quote key if not already quoted
		_p["key"] = i !~ /^".*"$/ ? quote(i) : i

		if (isarray(arr[i])) {
			if (_p["assoc"]) {
				_p["json"][++idx] = _p["indent"] _p["key"] ": " \
					serialize(arr[i], indent_with, depth)
			} else {
				# if indexed array, don't print keys
				_p["json"][++idx] = serialize(arr[i], indent_with, depth)
			}
		} else {
			# quote if not numeric, boolean, null, already quoted, or too big for match()
			if (!((arr[i] ~ /^[0-9]+([\.e][0-9]+)?$/ && arr[i] !~ /^0[0-9]/) ||
				arr[i] ~ /^true|false|null|".*"$/) || length(arr[i]) > 1000)
				arr[i] = quote(arr[i])
			_p["json"][++idx] = _p["assoc"] ? _p["indent"] _p["key"] ": " arr[i] : arr[i]
		}
	}

	# I trial-and-errored the hell out of this. The problem is, gawk can't distinguish
	# between a value of null and no value. This hack is as close as it gets,
	# although [""] will become [].
	if (!_p["assoc"] && join(_p["json"]) == "\"\"") return "[]"

	# surround with curly braces if object, square brackets if array
	return _p["assoc"] ? "{\n" join(_p["json"], ",\n") "\n" _p["end"] "}" \
		: "[" join(_p["json"], ", ") "]"
}
function trim(str) { gsub(/^\s+|\s+$/, "", str); return str }

function unquote(str) {
	gsub(/^'|'$/, "", str)
	gsub(/^"|"$/, "", str)
	gsub(/\\r/, "\r", str)
	gsub(/\\n/, "\n", str)
	gsub(/\\t/, "\t", str)
	gsub(/\\{2}/, "\\", str)
	return trim(str)
}
function tokenize(str, arr, splitchar, _p) {
	# syntax: tokenize(JSON-formatted string, array to populate, char to split on)
	# arr populates with matches split on unbracketed, unquoted splitchar
	# returns length of arr
	# This function supplants fpat / patsplit, since those methods cannot reliably
	# group mated bracket pairs.
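	# Example: tokenize("\"a\": 1, \"b\": [2, 3]", parts, ",") splits only on the
	#          top-level comma: parts[1] = "\"a\": 1", parts[2] = "\"b\": [2, 3]"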
while (++_p["pos"] <= length(str)) {
_p["char"] = substr(str, _p["pos"], 1)
switch (_p["char"]) {
case "[": if (!_p["\""] && !_p["\\"]) _p["["]++; _p["\\"] = false; break
case "{": if (!_p["\""] && !_p["\\"]) _p["{"]++; _p["\\"] = false; break
case "}": if (!_p["\""] && !_p["\\"]) _p["{"]--; _p["\\"] = false; break
case "]": if (!_p["\""] && !_p["\\"]) _p["["]--; _p["\\"] = false; break
case "\"": if (!_p["\\"]) _p["\""] = !_p["\""]; _p["\\"] = false; break
case "\\": _p["\\"] = !_p["\\"]; break
default: _p["\\"] = false
}
if (_p["char"] == splitchar && !_p["["] && !_p["{"] && !_p["\""] && !_p["\\"]) {
arr[++_p["idx"]] = trim(_p["segment"])
delete _p["segment"]
} else {
_p["segment"] = _p["segment"] _p["char"]
}
}
arr[++_p["idx"]] = trim(_p["segment"])
return _p["idx"]
}
function deserialize(json, arr, _p, _parts, _values, _keyval, i, j) {
	# syntax: deserialize(JSON-formatted string, array to populate)
	# Resulting array is true multidimensional (arr[idx][idx][etc...]),
	# not concatenated-index (arr[idx,idx,etc...]).
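	# Example: deserialize("{\"a\": [1, 2]}", out) yields
	#          out["a"][1] = "1", out["a"][2] = "2"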
	# consume outer brackets / braces
	# note: match() failed here with very large JSON data
	json = trim(json)
	_parts[1] = substr(json, 1, 1)
	_parts[2] = substr(json, 2, length(json) - 2)

	# split on unbracketed, unquoted commas
	_p["outie"] = tokenize(trim(_parts[2]), _values, ",")

	for (i = 1; i <= _p["outie"]; i++) {
		# build associative array
		if (_parts[1] ~ "{") {
			# split on unbracketed, unquoted colons
			_p["innie"] = tokenize(trim(_values[i]), _keyval, ":")
			for (j = 1; j <= _p["innie"]; j += 2) {
				# if value begins with a bracket or brace, recurse
				if (trim(_keyval[j+1]) ~ /^[\[\{]/) {
					# init array element as explicit array (defaults to scalar without this)
					arr[unquote(_keyval[j])][0]; delete arr[unquote(_keyval[j])][0]
					# do recurse
					deserialize(_keyval[j+1], arr[unquote(_keyval[j])])
				} else {
					arr[unquote(_keyval[j])] = unquote(_keyval[j+1])
				}
			}
		# build numerically indexed array
		} else {
			while (++_p["idx"] in arr) {}
			# if value begins with a bracket or brace, recurse
			if (trim(_values[i]) ~ /^[\[\{]/) {
				# init array element as explicit array (defaults to scalar without this)
				arr[_p["idx"]][0]; delete arr[_p["idx"]][0]
				# do recurse
				deserialize(trim(_values[i]), arr[_p["idx"]])
			} else {
				arr[_p["idx"]] = unquote(_values[i])
			}
		}
	}
}
@calraith (Author) commented:

As another option, one could delete everything above # === FUNCTIONS === and use @include "path/to/json.awk" to import the serialize() / deserialize() functions into an existing awk script.
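
A minimal sketch of that approach, assuming the trimmed functions are saved as json.awk; the file name datafile.json and the obj[1]["name"] lookup are placeholders for your own data, and @include is a gawk-only directive:

@include "json.awk"

BEGIN {
    # slurp the whole JSON file into one string
    while ((getline line < "datafile.json") > 0) json = json line
    close("datafile.json")

    # parse it, then read a value back out
    deserialize(json, obj)
    print obj[1]["name"]
}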

@calraith (Author) commented:

Here's a practical example. Do something like this to translate CSV input into JSON output:

BEGIN {
    # use field pattern rather than field separator to split input on unquoted commas
    FPAT = "([^,]*)|(\"[^\"]+\")"

    # read and store the first row as a header row
    getline header

    # split the header row into column definitions
    patsplit(header, cols)
}
# For each line of input....
{
    # For each field...
    for (i = 1; i <= NF; i++) {
        # obj["results"][row][key] = value
        obj["results"][NR - 1][cols[i]] = $i
    }
}
END { print serialize(obj, "\t") }
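
As a hypothetical illustration, given input like this:

name,dept
"Smith, Alice",Engineering
Bob,Sales

appending that snippet to the functions from json.awk and running it over the file should print something like:

{
	"results": [{
		"name": "Smith, Alice",
		"dept": "Engineering"
	}, {
		"name": "Bob",
		"dept": "Sales"
	}]
}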

@calraith (Author) commented on Jun 1, 2018:

Or you can translate an INI file into JSON:

# split on unquoted equals and whitespace
BEGIN { FPAT = "([^=[:space:]]*)|(\"[^\"]+\")" }

# remove carriage returns
{ gsub(/\r/, "") }

# match non-blank, non-comment lines
$1 ~ /^[^#;]/ {
    switch ($0) {
        # match [str...]
        case /^\[.+\]/:
            section = gensub(/[\[\]]/, "", "g")
            break
        # match / skip lines not containing =
        case /^[^=]*$/: break
        default:
            if (NF > 2) {
                # if value contains multiple tokens, create array
                for (i = 2; i <= NF; i++) obj[section][$1][i-1] = unquote($i)
            } else {
                # else flat value
                obj[section][$1] = unquote($2)
            }
    }
}

END {
    if (isarray(obj)) print serialize(obj, "\t")
}
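
As a hypothetical illustration (values are written without spaces around the = here, since the FPAT above splits on both equals signs and whitespace), input like this:

; sample config
[server]
host=example.com
ports=8080 8443

should serialize to something like:

{
	"server": {
		"host": "example.com",
		"ports": [8080, 8443]
	}
}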
