To parse JSON in awk, you can combine tokenization with recursive parsing. The functions below convert a JSON string into a flat associative array whose keys encode each value's position in the JSON hierarchy (e.g., `object.key.array[0]`):
# tokenize(json, tokens) -- split a JSON text into a flat, ordered token list.
#
# Parameters:
#   json    the JSON text to scan
#   tokens  output array; it is cleared and then filled as tokens[1..n]
#   (everything after `tokens` is a local variable, not a caller argument)
#
# Token forms:
#   structural characters  { } [ ] : ,   stored as themselves
#   strings                stored WITHOUT the surrounding quotes, with the
#                          escapes \" \\ \/ \b \f \n \r \t decoded; any other
#                          escape (e.g. \uXXXX) is kept verbatim
#   numbers                stored as the raw matched text
#   true / false / null    stored as those words
#
# Returns 1 when the whole input was tokenized, 0 at the first character
# that does not start a recognised token (tokens then holds what was read
# up to that point).
function tokenize(json, tokens,    pos, len, c, match_str, n, rest, raw, bs, esc, used) {
    pos = 1
    len = length(json)
    n = 0
    delete tokens

    while (pos <= len) {
        rest = substr(json, pos)

        # Skip insignificant whitespace between tokens.
        if (match(rest, /^[ \t\n\r]+/)) {
            pos += RLENGTH
            continue
        }

        c = substr(rest, 1, 1)

        if (index("{}[]:,", c) > 0) {
            # Structural character: a token by itself.
            tokens[++n] = c
            pos += 1
        } else if (c == "\"") {
            # A string runs to the first quote that is not escaped.
            if (!match(rest, /^"(([^"\\])|(\\.))*"/))
                return 0
            used = RLENGTH
            raw = substr(rest, 2, used - 2)

            # Decode escapes in ONE left-to-right pass. Decoding with a
            # series of global substitutions is order-dependent: the JSON
            # text \\n (a backslash followed by the letter n) would first
            # collapse to \n and then wrongly turn into a newline.
            match_str = ""
            while ((bs = index(raw, "\\")) > 0) {
                # The regex above guarantees a character follows each backslash.
                esc = substr(raw, bs + 1, 1)
                match_str = match_str substr(raw, 1, bs - 1)
                if (esc == "\"" || esc == "\\" || esc == "/")
                    match_str = match_str esc
                else if (esc == "b")
                    match_str = match_str "\b"
                else if (esc == "f")
                    match_str = match_str "\f"
                else if (esc == "n")
                    match_str = match_str "\n"
                else if (esc == "r")
                    match_str = match_str "\r"
                else if (esc == "t")
                    match_str = match_str "\t"
                else
                    match_str = match_str "\\" esc   # unknown escape: keep as written
                raw = substr(raw, bs + 2)
            }
            match_str = match_str raw

            tokens[++n] = match_str
            pos += used
        } else if (c ~ /[-0-9]/) {
            # Number: optional sign, integer part, optional fraction/exponent.
            if (!match(rest, /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?/))
                return 0
            tokens[++n] = substr(rest, 1, RLENGTH)
            pos += RLENGTH
        } else if (c == "t" && substr(json, pos, 4) == "true") {
            tokens[++n] = "true"
            pos += 4
        } else if (c == "f" && substr(json, pos, 5) == "false") {
            tokens[++n] = "false"
            pos += 5
        } else if (c == "n" && substr(json, pos, 4) == "null") {
            tokens[++n] = "null"
            pos += 4
        } else {
            # Character that cannot start any JSON token.
            return 0
        }
    }
    return 1
}
# parse_value(tokens, idx, result, path) -- parse the JSON value that starts
# at tokens[idx] and record it in `result`.
#
# Parameters:
#   tokens  token array indexed from 1
#   idx     index of the first token of the value
#   result  output array; scalar values are stored at result[path]
#   path    flattened key under which this value is stored
#   (`tok` is a local variable)
#
# Objects and arrays are delegated to parse_object / parse_array. A token that
# is entirely a number is stored numerically; every other scalar token is
# stored as the token text, so true/false/null end up as those words.
#
# Returns the index of the first token after the value.
function parse_value(tokens, idx, result, path,    tok) {
    # Past the end of the token list: there is no value to store. Testing with
    # `in` also avoids creating an empty tokens[idx] element as a side effect.
    if (!(idx in tokens))
        return idx + 1

    tok = tokens[idx]

    if (tok == "{")
        return parse_object(tokens, idx + 1, result, path)
    if (tok == "[")
        return parse_array(tokens, idx + 1, result, path)

    # The pattern is anchored at BOTH ends. Without the trailing `$`, any
    # string that merely starts with digits (e.g. "30 apples") would be
    # coerced to a number and lose the rest of its text.
    # NOTE: string tokens carry no quotes, so a JSON string whose whole
    # content looks like a number (e.g. "30") is still stored numerically.
    if (tok ~ /^-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?$/)
        result[path] = 0 + tok
    else
        result[path] = tok

    return idx + 1
}
# parse_object(tokens, idx, result, path) -- parse the members of a JSON
# object whose opening "{" has already been consumed.
#
# Parameters:
#   tokens  token array indexed from 1
#   idx     index of the first token after "{"
#   result  output array, filled through parse_value
#   path    flattened key of the object itself ("" for the top level)
#   (`key` and `new_path` are local variables)
#
# Each member value is stored under "<path>.<key>", or just "<key>" when
# path is empty.
#
# Returns the index of the token after the closing "}". If the token list
# ends before a "}" is found, the loop stops and the returned index lies
# beyond the last token.
function parse_object(tokens, idx, result, path,    key, new_path) {
    # `idx in tokens` bounds the loop: on truncated input tokens[idx] is ""
    # forever, which never equals "}" and would otherwise spin endlessly.
    while ((idx in tokens) && tokens[idx] != "}") {
        if (tokens[idx] == ",") {
            idx += 1
            # Re-test the loop condition so a "}" right after the comma is
            # seen as the end of the object rather than taken as a key.
            continue
        }

        key = tokens[idx]
        idx += 2                # skip the key and the ':' that follows it

        if (!(idx in tokens))   # input ended before the member's value
            break

        new_path = (path == "") ? key : (path "." key)
        idx = parse_value(tokens, idx, result, new_path)
    }
    return idx + 1              # step over '}'
}
# parse_array(tokens, idx, result, path) -- parse the elements of a JSON
# array whose opening "[" has already been consumed.
#
# Parameters:
#   tokens  token array indexed from 1
#   idx     index of the first token after "["
#   result  output array, filled through parse_value
#   path    flattened key of the array itself
#   (`i` and `new_path` are local variables)
#
# Element number i (counting from 0) is stored under "<path>[i]".
#
# Returns the index of the token after the closing "]". If the token list
# ends before a "]" is found, the loop stops and the returned index lies
# beyond the last token.
function parse_array(tokens, idx, result, path,    i, new_path) {
    i = 0
    # `idx in tokens` bounds the loop: on truncated input tokens[idx] is ""
    # forever, which never equals "]" and would otherwise spin endlessly.
    while ((idx in tokens) && tokens[idx] != "]") {
        if (tokens[idx] == ",") {
            idx += 1
            # Re-test the loop condition so a "]" right after the comma is
            # seen as the end of the array rather than stored as an element.
            continue
        }
        new_path = path "[" i "]"
        idx = parse_value(tokens, idx, result, new_path)
        i += 1
    }
    return idx + 1              # step over ']'
}
# parse_json(json, result) -- parse a JSON text into a flat associative array.
#
# Parameters:
#   json    the JSON text
#   result  output array; cleared first, then filled with one entry per
#           scalar value, keyed by its flattened path (e.g. "a.b[0]")
#   (`tokens`, `n` and `next_idx` are local variables; declaring `tokens`
#    here keeps the token list from leaking into a global)
#
# Returns 1 when the text tokenizes and exactly one top-level value consumes
# every token; returns 0 otherwise (tokenizer error, empty input, or tokens
# left over after the value).
function parse_json(json, result,    tokens, n, next_idx) {
    # Clear first so a failed parse never leaves entries from an earlier call.
    delete result

    if (!tokenize(json, tokens))
        return 0

    # Take the count BEFORE parsing: reading tokens[idx] past the end would
    # create new elements and inflate length(tokens).
    n = length(tokens)
    if (n == 0)
        return 0

    next_idx = parse_value(tokens, 1, result, "")

    # A complete document ends exactly one past the last token.
    return (next_idx == n + 1) ? 1 : 0
}
Usage Example:
# Usage example: parse a small sample document and print some of the
# flattened entries. The expected lines below assume the default output
# field separator (a single space).
BEGIN {
json = "{\"name\":\"John\", \"age\":30, \"cars\":[\"Ford\", \"BMW\"], \"nested\":{\"a\":true, \"b\":null}}"
# Nothing is printed unless parse_json reports success.
if (parse_json(json, data)) {
print "Name:", data["name"] # prints: Name: John
print "Age:", data["age"] # prints: Age: 30
print "Car 0:", data["cars[0]"] # prints: Car 0: Ford
print "Car 1:", data["cars[1]"] # prints: Car 1: BMW
print "Nested.a:", data["nested.a"] # prints: Nested.a: true
print "Nested.b:", data["nested.b"] # prints: Nested.b: null
}
}
Explanation:
- **Tokenization:** The `tokenize` function converts the JSON string into tokens (strings, numbers, braces, etc.), handling escape sequences and syntax.
- **Parsing functions:**
  - `parse_value` directs parsing based on the current token (object, array, or primitive).
  - `parse_object` processes key-value pairs, building hierarchical keys (e.g., `nested.a`).
  - `parse_array` processes elements, appending indices (e.g., `cars[0]`).
- **Output storage:** Parsed values are stored in an associative array (`result`) with composite keys reflecting the JSON structure.
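This implementation handles basic JSON structures, including nested objects and arrays, strings, numbers, booleans, and nulls. It does not support every JSON feature: for example, `\uXXXX` Unicode escapes are left undecoded, and because every value is flattened into one array, the literals `true`, `false`, and `null` are stored as plain strings and cannot be told apart from string values with the same text. It nevertheless covers common use cases.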