-
-
Save sethhall/386c941a0f778d8b79be03c7fbfd47d0 to your computer and use it in GitHub Desktop.
module JSON; | |
import spicy; | |
# This supports jsonc (json with comments).
# Between fields, skip any mix of whitespace runs and `//` line comments.
# The alternation is repeated as a whole so that several comment lines in a
# row (each separated by a newline) are consumed by a single skip; a pattern
# of the form "ws* comment* ws*" stops after the first comment line because
# the comment itself never includes its terminating newline.
%skip = /([ \t\r\n]+|\/\/[^\n]*)*/;
# Public entry-point unit: parses the input as a sequence of JSON values.
public type File = unit { | |
values: JSONValue[]; | |
}; | |
# A single JSON value. The expression-less switch lists one alternative per
# JSON value kind; exactly one of the fields below ends up set.
type JSONValue = unit { | |
switch { | |
# Each `->` alternative is a candidate sub-unit for the upcoming input.
-> obj : JSONObject; | |
-> arr : JSONArray; | |
-> str : JSONString; | |
-> bol : JSONBool; | |
-> nul : JSONNull; | |
-> num : JSONNumber; | |
}; | |
}; | |
# A JSON object: `{`, a list of key/value pairs, `}`.
# The parsed unit is replaced by the result of json_object_to_map().
type JSONObject = unit { | |
: /\{/; | |
fields: JSONPair[]; | |
: /\}/; | |
} &convert=json_object_to_map($$); | |
# One member of a JSON object: a string key, a `:` separator, and a value.
type JSONPair = unit { | |
key: JSONString; | |
: skip /:/; | |
value: JSONValue; | |
# The separating comma is optional in this pattern, so it matches both
# after a non-final member and (as an empty match) after the final one.
: skip /,?/; | |
}; | |
# One element of a JSON array: a value followed by an optional comma.
# The unit converts to just the contained value.
type JSONArrayElement = unit { | |
value: JSONValue; | |
# Optional so that the last element (which has no trailing comma) matches too.
: skip /,?/; | |
} &convert=self.value; | |
# A JSON array: `[`, a list of elements, `]`.
# The unit converts to the vector of parsed elements.
type JSONArray = unit { | |
: skip b"["; | |
values: JSONArrayElement[]; | |
: skip b"]"; | |
} &convert=self.values; | |
# A double-quoted JSON string. The raw bytes between the quotes are collected
# and then decoded (including escape sequences) by vec_to_str().
type JSONString = unit {
    : /\"/;
    # Read bytes until the closing quote (byte 34). A quote only closes the
    # string when it is not itself escaped, i.e. when it is preceded by an
    # even number of backslashes. Looking at just the previous byte is not
    # enough: in "C:\\" the final quote follows a backslash that is itself
    # escaped, so the quote does terminate the string.
    value: uint8[] &until=($$ == 34 && quote_closes_string(self.value));
} &convert=vec_to_str(self.value);

# Private helper for JSONString: given the string bytes collected so far,
# returns True if a `"` appearing next would be unescaped. That is the case
# when the run of backslashes (byte 92) at the end of `seen` has even length
# (including zero, which also covers the empty string).
function quote_closes_string(seen: vector<uint8>): bool {
    local idx = |seen|;
    local backslashes = 0;

    while ( idx > 0 && seen[idx - 1] == 92 ) {
        ++backslashes;
        --idx;
    }

    return backslashes % 2 == 0;
}
# The JSON literals `false` / `true`, converted to a bool via str_to_bool().
type JSONBool = unit { | |
value: /false|true/; | |
} &convert=str_to_bool(self.value); | |
# The JSON literal `null`, converted to the value returned by get_null().
type JSONNull = unit { | |
value: /null/; | |
} &convert=get_null(); | |
# A JSON number: optional minus sign, integer digits, optional fraction and
# optional exponent. The matched bytes are converted with to_real().
type JSONNumber = unit { | |
value: /-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?/ &nosub; | |
} &convert=self.value.to_real(); | |
####################### | |
## Utility Functions ## | |
####################### | |
# Produces the value used to represent JSON `null`: an optional<bool> that
# is declared but never assigned.
function get_null(): optional<bool> {
    # Deliberately left without a value; there is no direct way to return
    # "nothing" here, so an unassigned optional stands in for it.
    local unset: optional<bool>;
    return unset;
}
# Builds a key -> value map from the pairs of a parsed JSON object.
# Pairs are inserted in parse order, so when a key occurs more than once the
# later pair overwrites the earlier one.
function json_object_to_map(jo: JSONObject): map<string, JSONValue> {
    local result: map<string, JSONValue>;

    for ( pair in jo.fields )
        result[pair.key] = pair.value;

    return result;
}
# Decodes the raw bytes of a JSON string body (everything between the quotes)
# into a `string`.
#
# - `\b \f \n \r \t` are replaced by the control character they denote.
# - Any other escaped byte is passed through unchanged, which covers
#   `\"`, `\\` and `\/`.
# - `\uXXXX` is decoded from its four hex digits and emitted as UTF-8. JSON
#   expresses characters outside the Basic Multilingual Plane as a UTF-16
#   surrogate pair (`\uD83D\uDE00`); such a pair is combined into a single
#   code point. A malformed escape, an unpaired surrogate, or an escape cut
#   off by the end of the string yields U+FFFD (the replacement character).
#
# The accumulated bytes are finally decoded with `bytes.decode()`.
function vec_to_str(vec: vector<uint8>): string {
    local out = b"";
    local escaping = False;
    # Number of hex digits of the current \uXXXX escape still to be read.
    local hex_left = 0;
    # UTF-16 code unit accumulated from the hex digits read so far.
    local code_unit: uint64 = 0;
    # Cleared when a non-hex byte shows up inside a \uXXXX escape.
    local hex_valid = True;
    # High surrogate waiting for its low partner; 0 means "none pending"
    # (a real high surrogate is never 0).
    local pending_high: uint64 = 0;

    for ( i in vec ) {
        local x = i;

        if ( hex_left > 0 ) {
            local digit = hex_digit_value(x);
            if ( digit < 16 ) {
                code_unit = code_unit * 16 + digit;
            } else {
                hex_valid = False;
            }

            --hex_left;
            if ( hex_left > 0 ) {
                continue;
            }

            # All four digits consumed: resolve the code unit.
            if ( pending_high != 0 ) {
                if ( hex_valid && code_unit >= 0xDC00 && code_unit <= 0xDFFF ) {
                    # Valid surrogate pair -> one supplementary code point.
                    out += utf8_encode(0x10000 + ((pending_high - 0xD800) << 10) + (code_unit - 0xDC00));
                    pending_high = 0;
                    continue;
                }

                # The pending high surrogate was not followed by a low one.
                out += utf8_encode(0xFFFD);
                pending_high = 0;
            }

            if ( ! hex_valid ) {
                out += utf8_encode(0xFFFD);
            } else if ( code_unit >= 0xD800 && code_unit <= 0xDBFF ) {
                # High surrogate: hold it until we see what follows.
                pending_high = code_unit;
            } else if ( code_unit >= 0xDC00 && code_unit <= 0xDFFF ) {
                # Low surrogate without a preceding high surrogate.
                out += utf8_encode(0xFFFD);
            } else {
                out += utf8_encode(code_unit);
            }

            continue;
        } else if ( ! escaping && x == 92 ) {
            # Begin handling backslash escaping
            escaping = True;
            continue;
        } else if ( escaping ) {
            escaping = False;
            switch ( x ) {
                case 0x62: {
                    # "b" - backspace
                    x = 0x08;
                }
                case 0x66: {
                    # "f" - formfeed
                    x = 0x0C;
                }
                case 0x6E: {
                    # "n" - newline
                    x = 0x0A;
                }
                case 0x72: {
                    # "r" - carriage return
                    x = 0x0D;
                }
                case 0x74: {
                    # "t" - horizontal tab
                    x = 0x09;
                }
                case 0x75: {
                    # "u" - unicode escape begin: read the next 4 bytes as hex.
                    hex_left = 4;
                    code_unit = 0;
                    hex_valid = True;
                    continue;
                }
                default: {
                    # Pass the escaped byte straight through (\" \\ \/ ...).
                    x = x;
                }
            }
        }

        # An ordinary byte ends any chance of completing a surrogate pair.
        if ( pending_high != 0 ) {
            out += utf8_encode(0xFFFD);
            pending_high = 0;
        }

        out += pack(x, spicy::ByteOrder::Network);
    }

    # Input ended in the middle of a \uXXXX escape or after a lone high surrogate.
    if ( hex_left > 0 || pending_high != 0 ) {
        out += utf8_encode(0xFFFD);
    }

    return out.decode();
}

# Private helper for vec_to_str: numeric value (0-15) of an ASCII hex digit,
# or 16 if the byte is not a hex digit.
function hex_digit_value(c: uint8): uint64 {
    if ( c >= 0x30 && c <= 0x39 ) {
        return cast<uint64>(c) - 0x30;
    }
    if ( c >= 0x41 && c <= 0x46 ) {
        return cast<uint64>(c) - 0x41 + 10;
    }
    if ( c >= 0x61 && c <= 0x66 ) {
        return cast<uint64>(c) - 0x61 + 10;
    }
    return 16;
}

# Private helper for vec_to_str: UTF-8 encoding of a Unicode code point.
# Callers pass only values up to 0x10FFFF that are not surrogates.
function utf8_encode(cp: uint64): bytes {
    local enc = b"";

    if ( cp < 0x80 ) {
        enc += cast<uint8>(cp);
    } else if ( cp < 0x800 ) {
        enc += cast<uint8>(0xC0 | (cp >> 6));
        enc += cast<uint8>(0x80 | (cp & 0x3F));
    } else if ( cp < 0x10000 ) {
        enc += cast<uint8>(0xE0 | (cp >> 12));
        enc += cast<uint8>(0x80 | ((cp >> 6) & 0x3F));
        enc += cast<uint8>(0x80 | (cp & 0x3F));
    } else {
        enc += cast<uint8>(0xF0 | (cp >> 18));
        enc += cast<uint8>(0x80 | ((cp >> 12) & 0x3F));
        enc += cast<uint8>(0x80 | ((cp >> 6) & 0x3F));
        enc += cast<uint8>(0x80 | (cp & 0x3F));
    }

    return enc;
}
# Maps the bytes matched by JSONBool to a bool: b"true" -> True,
# b"false" -> False. Any other input trips the assertion.
function str_to_bool(str: bytes): bool {
    if ( str == b"false" )
        return False;

    if ( str == b"true" )
        return True;

    assert False : "Something neither true nor false was fed to 'to_bool'";
}
I believe the whitespace could be handled through some combination of %skip{,-pre,-post}
properties, see https://docs.zeek.org/projects/spicy/en/latest/programming/parsing.html#meta-data. That would also remove it from look-ahead processing.
I'm working on a change to use %skip instead of all of the explicit whitespace parsing. It works great, but I discovered what seems to be weird behavior and figured out a workaround.
In the parsing of elements that are comma separated I was doing this...
type JSONArrayElement = unit {
value: JSONValue;
: /,?/;
};
My thought was that the regex just wouldn't parse anything if there wasn't a comma. But I kept getting this error...
[fatal error] terminating with uncaught exception of type spicy::rt::ParseError: no expected look-ahead token found (json-parse.spicy:43:5)
So I rewrote it to use a switch...
type JSONArrayElement = unit {
value: JSONValue;
switch {
-> comma : b",";
-> v: void;
};
};
Now it seems to parse things fine. Is that an expected behavior?
FWIW, the lookahead that the array element is looking for is defined here...
type JSONArray = unit {
: /\[/;
values: JSONArrayElement[];
: /\]/;
};
@sethhall, do you have a compact reproducer of parser+input for this? I could imagine that lookahead parsing with possibly empty matches might have weird edge cases (though e.g., it seems your parser should always consume some input).
Sorry. Don't have a minimal reproducer yet but I'll try to make one today.
I just published another set of changes after chatting with Robin. Now the parser fully works correctly but we did identify a bug in the lookahead parsing. I'll still try today to get around to creating a minimal reproducer. (it's something about multiple regex fields in a structure after a look ahead with the %skip option set).
Ok, just published some more changes to this. String escaping is working now (except for the \uXXXX thing they do in json, that's a bit more complicated and I don't feel like working on it).
I also updated it to use spicy with the bug fix that just went in regarding multiple regular expressions in a row. Also using the new builtin to_real function on byte arrays.
And now another change set that makes this support jsonc (json with comments)
Fixed an issue with vector handling during string parsing.
Hmm, JSON is mostly unstructured (at least until data is validated against a schema, after which you could use e.g. a struct), so IMO a `map` seems to fit this pretty well. I also checked, and e.g. Python's `json` parses objects into `dict`s and `serde-json` into `Map<String, Value>`, so this seems to be a common approach.