JSON parser in awk by DeepSeek R1

To parse JSON in awk, you can combine a tokenizer with a recursive-descent parser. Below is a set of functions that convert a JSON string into a flat associative array whose keys encode each value's position in the JSON hierarchy (e.g., object.key.array[0]):

function tokenize(json, tokens,    pos, len, c, n, rest, rlen, str) {
    # Split the JSON text `json` into tokens[1..n].
    #   json    JSON text to scan
    #   tokens  output array; emptied first, then filled from index 1
    # Returns 1 on success, 0 as soon as an unrecognised character or a
    # malformed string/number is met (tokens then holds what was read so far).
    #
    # Token forms: one of { } [ ] : , as a single character; a number as its
    # source text; true/false/null as those words; a string as its decoded
    # content WITHOUT the surrounding quotes (so the string "{" yields the same
    # token as the punctuation character).
    pos = 1
    len = length(json)
    n = 0               # explicit counter: length(array) is not available in every awk
    split("", tokens)   # portable way to empty an array

    while (pos <= len) {
        rest = substr(json, pos)

        # Skip whitespace
        if (match(rest, /^[ \t\n\r]+/)) {
            pos += RLENGTH
            continue
        }

        c = substr(rest, 1, 1)

        if (index("{}[]:,", c) > 0) {
            tokens[++n] = c
            pos += 1
        } else if (c == "\"") {
            if (!match(rest, /^"(([^"\\])|(\\.))*"/)) {
                return 0
            }
            rlen = RLENGTH
            # Decode escapes in one pass over the text between the quotes.
            str = _json_unescape(substr(rest, 2, rlen - 2))
            tokens[++n] = str
            pos += rlen
        } else if (c ~ /[-0-9]/) {
            if (!match(rest, /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?/)) {
                return 0
            }
            tokens[++n] = substr(rest, 1, RLENGTH)
            pos += RLENGTH
        } else if (substr(rest, 1, 4) == "true") {
            tokens[++n] = "true"
            pos += 4
        } else if (substr(rest, 1, 5) == "false") {
            tokens[++n] = "false"
            pos += 5
        } else if (substr(rest, 1, 4) == "null") {
            tokens[++n] = "null"
            pos += 4
        } else {
            return 0
        }
    }
    return 1
}

function _json_unescape(s,    out, i, c) {
    # Decode the backslash escapes of a JSON string body in a single
    # left-to-right pass, so already-decoded text is never re-interpreted
    # (the JSON text a\\nb must yield backslash + n, not a newline).
    # Escapes other than \" \\ \/ \b \f \n \r \t (e.g. \uXXXX) are kept verbatim.
    out = ""
    while ((i = index(s, "\\")) > 0) {
        out = out substr(s, 1, i - 1)
        c = substr(s, i + 1, 1)
        if (c == "n") {
            out = out "\n"
        } else if (c == "t") {
            out = out "\t"
        } else if (c == "r") {
            out = out "\r"
        } else if (c == "b") {
            out = out "\b"
        } else if (c == "f") {
            out = out "\f"
        } else if (c == "\"" || c == "\\" || c == "/") {
            out = out c
        } else {
            out = out "\\" c
        }
        s = substr(s, i + 2)
    }
    return out s
}

function parse_value(tokens, idx, result, path,    tok) {
    # Parse the JSON value that starts at tokens[idx] and record it in result.
    #   tokens  token array, indexed from 1
    #   idx     index of the value's first token
    #   result  output array; a scalar is stored as result[path]
    #   path    flattened key for this value (e.g. "nested.a", "cars[0]")
    # Returns the index of the first token after the value.
    #
    # Objects and arrays are delegated to parse_object / parse_array. A token
    # that is, in its entirety, a valid JSON number is stored as an awk number;
    # every other token (true, false, null, strings) is stored as text.

    # Read without creating tokens[idx] when idx is past the end.
    tok = (idx in tokens) ? tokens[idx] : ""

    if (tok == "{") {
        return parse_object(tokens, idx + 1, result, path)
    }
    if (tok == "[") {
        return parse_array(tokens, idx + 1, result, path)
    }

    # The pattern is anchored at both ends: a string such as "2024-01-01" or
    # "30 Main St" merely starts with digits and must not be coerced to 2024 / 30.
    if (tok ~ /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?$/) {
        result[path] = 0 + tok
    } else {
        result[path] = tok
    }
    return idx + 1
}

function parse_object(tokens, idx, result, path,    key, new_path) {
    # Parse the members of a JSON object whose "{" has already been consumed.
    #   tokens  token array, indexed from 1
    #   idx     index of the first token after "{"
    #   result  output array filled by parse_value
    #   path    flattened key of the object itself ("" at top level)
    # Returns the index of the first token after the closing "}". If the
    # tokens run out before "}" is found, the returned index lies beyond the
    # end of the token list plus one, instead of looping forever.
    while ((idx in tokens) && tokens[idx] != "}") {
        if (tokens[idx] == ",") {
            idx += 1  # Skip the separator between members
        }
        # A member needs a key, a colon and at least one value token.
        if (!((idx + 2) in tokens)) {
            return idx + 3  # Truncated input: report a position past the end
        }
        key = tokens[idx]
        idx += 2  # Consume key and colon
        new_path = (path == "") ? key : (path "." key)
        idx = parse_value(tokens, idx, result, new_path)
    }
    return idx + 1  # Consume '}'
}

function parse_array(tokens, idx, result, path,    i, new_path) {
    # Parse the elements of a JSON array whose "[" has already been consumed.
    #   tokens  token array, indexed from 1
    #   idx     index of the first token after "["
    #   result  output array filled by parse_value
    #   path    flattened key of the array itself; element k is stored
    #           under path "[" k "]", counting from 0
    # Returns the index of the first token after the closing "]". If the
    # tokens run out before "]" is found, the returned index lies beyond the
    # end of the token list plus one, instead of looping forever.
    i = 0
    while ((idx in tokens) && tokens[idx] != "]") {
        if (tokens[idx] == ",") {
            idx += 1  # Skip the separator between elements
        }
        if (!(idx in tokens)) {
            break  # Truncated input: nothing left to parse
        }
        new_path = path "[" i "]"
        idx = parse_value(tokens, idx, result, new_path)
        i += 1
    }
    return idx + 1  # Consume ']'
}

function parse_json(json, result,    tokens, n, k, end_idx) {
    # Parse the JSON text `json` into the flat associative array `result`.
    #   json    JSON text
    #   result  output array; keys are flattened paths such as "nested.a"
    #           or "cars[0]"
    # Returns 1 when the text tokenizes and exactly one complete value spans
    # all tokens; returns 0 otherwise. `result` is left untouched when
    # tokenization fails and is emptied when parsing fails afterwards.
    #
    # `tokens` is declared as a local so the token list does not leak into
    # (or clobber) a global of the same name.
    if (!tokenize(json, tokens)) {
        return 0
    }

    # Count tokens before parsing; for-in avoids relying on length(array).
    n = 0
    for (k in tokens) {
        n++
    }
    if (n == 0) {
        return 0  # Empty or whitespace-only input is not a JSON value
    }

    delete result
    end_idx = parse_value(tokens, 1, result, "")

    # A complete parse stops exactly one past the last token. A smaller index
    # means trailing tokens were left over; a larger one means input ran out.
    if (end_idx != n + 1) {
        delete result
        return 0
    }
    return 1
}

Usage Example:

BEGIN {
    # Build the sample document in pieces so each member is easy to spot.
    json = "{\"name\":\"John\", "
    json = json "\"age\":30, "
    json = json "\"cars\":[\"Ford\", \"BMW\"], "
    json = json "\"nested\":{\"a\":true, \"b\":null}}"

    # Values are looked up by their flattened path. The trailing comments show
    # the printed line, assuming the default output field separator.
    if (parse_json(json, data)) {
        print "Name:", data["name"]          # Name: John
        print "Age:", data["age"]            # Age: 30
        print "Car 0:", data["cars[0]"]      # Car 0: Ford
        print "Car 1:", data["cars[1]"]      # Car 1: BMW
        print "Nested.a:", data["nested.a"]  # Nested.a: true
        print "Nested.b:", data["nested.b"]  # Nested.b: null
    }
}

Explanation:

Tokenization: The tokenize function converts the JSON string into tokens (strings, numbers, braces, etc.), handling escape sequences and syntax.
Parsing Functions:
- parse_value directs parsing based on the current token (object, array, or primitive).
- parse_object processes key-value pairs, building hierarchical keys (e.g., nested.a).
- parse_array processes elements, appending indices (e.g., cars[0]).
Output Storage: Parsed values are stored in an associative array (result) with composite keys reflecting the JSON structure.

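This implementation handles basic JSON structures, including nested objects and arrays, strings, numbers, booleans, and nulls. It does not support all JSON features, and it has some known limitations: Unicode escapes (\uXXXX) are not decoded; string tokens are stored without their quotes, so a string value such as "{", "true", or "42" cannot be told apart from the corresponding punctuation, literal, or number; and empty objects and arrays leave no entry in the result. It still covers common use cases.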

mohd-akram/deepseek-json-awk.md