-
-
Save sethhall/386c941a0f778d8b79be03c7fbfd47d0 to your computer and use it in GitHub Desktop.
| module JSON; | |
| import spicy; | |
| # This supports jsonc (json with comments). | |
| # Skipped between fields: any mix of whitespace runs and "//" line comments, repeated, | |
| # so several comment lines in a row (each followed by a newline) are skipped as one gap. | |
| %skip = /([ \t\r\n]+|\/\/[^\n]*)*/; | |
| # Public entry point of the grammar: the input is parsed as a sequence of JSON values. | |
| public type File = unit { | |
| values: JSONValue[]; | |
| }; | |
| # A single JSON value. The look-ahead switch selects one of the alternatives below | |
| # based on the upcoming input and stores the result in the matching field | |
| # (obj, arr, str, bol, nul or num). | |
| type JSONValue = unit { | |
| switch { | |
| -> obj : JSONObject; | |
| -> arr : JSONArray; | |
| -> str : JSONString; | |
| -> bol : JSONBool; | |
| -> nul : JSONNull; | |
| -> num : JSONNumber; | |
| }; | |
| }; | |
| # A JSON object: "{", a list of key/value pairs, "}". | |
| # The parsed unit is passed to json_object_to_map() through &convert. | |
| type JSONObject = unit { | |
| : /\{/; | |
| fields: JSONPair[]; | |
| : /\}/; | |
| } &convert=json_object_to_map($$); | |
| # One object member: a string key, a ":" separator, a value, then an optional ",". | |
| # The separator pattern is /,?/, so a missing or trailing comma is not rejected here. | |
| type JSONPair = unit { | |
| key: JSONString; | |
| : skip /:/; | |
| value: JSONValue; | |
| : skip /,?/; | |
| }; | |
| # One array element: a value followed by an optional "," (pattern /,?/). | |
| # &convert exposes just the contained value instead of the wrapper unit. | |
| type JSONArrayElement = unit { | |
| value: JSONValue; | |
| : skip /,?/; | |
| } &convert=self.value; | |
| # A JSON array: "[", a list of elements, "]". | |
| # &convert exposes the element vector directly. | |
| type JSONArray = unit { | |
| : skip b"["; | |
| values: JSONArrayElement[]; | |
| : skip b"]"; | |
| } &convert=self.values; | |
| # A double-quoted JSON string. The raw bytes between the quotes are collected with | |
| # their escape sequences still intact and decoded by vec_to_str() through &convert. | |
| type JSONString = unit { | |
| : /\"/; | |
| # Stop at the first quote (byte 34) that is not escaped. A quote is escaped only when | |
| # an odd number of backslashes precedes it, so a string such as "C:\\" ends correctly. | |
| value: uint8[] &until=($$ == 34 && ! ends_with_pending_escape(self.value)); | |
| } &convert=vec_to_str(self.value); | |
| # Returns True if `vec` ends in an odd-length run of backslashes (byte 92), i.e. the | |
| # byte that follows would be consumed as part of an escape sequence. | |
| # An empty vector yields False. | |
| function ends_with_pending_escape(vec: vector<uint8>): bool { | |
| local pending = False; | |
| for ( b in vec ) { | |
| if ( b == 92 ) { | |
| # Each backslash toggles the state: "\" opens an escape, "\\" closes it again. | |
| pending = ! pending; | |
| } else { | |
| # Any other byte either completes an escape or is plain text. | |
| pending = False; | |
| } | |
| } | |
| return pending; | |
| } | |
| # The literals "false" / "true"; the matched bytes are converted by str_to_bool(). | |
| type JSONBool = unit { | |
| value: /false|true/; | |
| } &convert=str_to_bool(self.value); | |
| # The literal "null"; converted to whatever get_null() returns. | |
| type JSONNull = unit { | |
| value: /null/; | |
| } &convert=get_null(); | |
| # A number: optional "-", integer digits, optional fraction, optional exponent. | |
| # The matched bytes are converted with to_real(). | |
| type JSONNumber = unit { | |
| value: /-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?/ &nosub; | |
| } &convert=self.value.to_real(); | |
| ####################### | |
| ## Utility Functions ## | |
| ####################### | |
| # Produces the value used to represent JSON null: an optional<bool> that is | |
| # declared but never assigned. | |
| function get_null(): optional<bool> { | |
| # There is no obvious way to return a void value, so hand back an unassigned optional. | |
| local unset: optional<bool>; | |
| return unset; | |
| } | |
| # Builds a key -> value map from the pairs of a parsed JSON object. | |
| # jo: the parsed object whose `fields` are copied into the map. | |
| # Returns the map; pairs are inserted in order, so a repeated key keeps its last value. | |
| function json_object_to_map(jo: JSONObject): map<string, JSONValue> { | |
| local result: map<string, JSONValue>; | |
| for ( pair in jo.fields ) { | |
| result[pair.key] = pair.value; | |
| } | |
| return result; | |
| } | |
| # Turns the raw bytes of a JSON string body into a string, resolving backslash escapes. | |
| # vec: the bytes found between the quotes, escapes not yet processed. | |
| # Returns the decoded text. Every \uXXXX escape currently yields the replacement | |
| # character U+FFFD instead of the code point it names. | |
| function vec_to_str(vec: vector<uint8>): string { | |
| local decoded = b""; | |
| local in_escape = False; | |
| local hex_digits_left = 0; | |
| local hex_digits: vector<uint8>&; | |
| for ( raw in vec ) { | |
| # Work on a copy so the escape table below can overwrite it. | |
| local ch = raw; | |
| # Inside a \uXXXX escape: swallow the four bytes that follow the "u". | |
| if ( hex_digits_left > 0 ) { | |
| --hex_digits_left; | |
| hex_digits.push_back(ch); | |
| if ( hex_digits_left == 0 ) { | |
| # TODO: turn the collected digits into the real unicode character. | |
| # For now this just emits the UTF-8 bytes of the replacement character "�". | |
| decoded += 0xEF; | |
| decoded += 0xBF; | |
| decoded += 0xBD; | |
| } | |
| continue; | |
| } | |
| # A backslash (92) outside an escape opens one and produces no output itself. | |
| if ( ! in_escape && ch == 92 ) { | |
| in_escape = True; | |
| continue; | |
| } | |
| # The byte right after a backslash selects the escape. | |
| if ( in_escape ) { | |
| in_escape = False; | |
| switch ( ch ) { | |
| case 0x62: ch = 0x08;  # "b" - backspace | |
| case 0x66: ch = 0x0C;  # "f" - formfeed | |
| case 0x6E: ch = 0x0A;  # "n" - newline | |
| case 0x72: ch = 0x0D;  # "r" - carriage return | |
| case 0x74: ch = 0x09;  # "t" - horizontal tab | |
| case 0x75: { | |
| # "u" - start of a unicode escape; the next 4 bytes belong to it. | |
| hex_digits_left = 4; | |
| # NOTE(review): the collected digits are never read yet; also confirm what | |
| # the `(4)` argument does here (size hint vs. initial element). | |
| hex_digits = new vector<uint8>(4); | |
| continue; | |
| } | |
| default: { | |
| # Any other escaped byte (quote, backslash, slash, ...) passes through unchanged. | |
| ch = ch; | |
| } | |
| } | |
| } | |
| decoded += pack(ch, spicy::ByteOrder::Network); | |
| } | |
| return decoded.decode(); | |
| } | |
| # Maps the literal matched by JSONBool onto a bool. | |
| # str: the matched bytes, expected to be exactly b"false" or b"true". | |
| # Returns False / True accordingly; any other input trips the assertion below. | |
| function str_to_bool(str: bytes): bool { | |
| switch ( str ) { | |
| case b"false": return False; | |
| case b"true": return True; | |
| default: assert False : "Something neither true nor false was fed to 'str_to_bool'"; | |
| } | |
| } |
sethhall
commented
Jun 4, 2024
via email
I'm working on a change to use %skip instead of all of the explicit whitespace parsing. It works great, but I discovered what seems to be weird behavior and figured out a workaround.
In the parsing of elements that are comma separated I was doing this...
type JSONArrayElement = unit {
value: JSONValue;
: /,?/;
};
My thought was that the regex just wouldn't parse anything if there wasn't a comma. But I kept getting this error...
[fatal error] terminating with uncaught exception of type spicy::rt::ParseError: no expected look-ahead token found (json-parse.spicy:43:5)
So I rewrote it to use a switch...
type JSONArrayElement = unit {
value: JSONValue;
switch {
-> comma : b",";
-> v: void;
};
};
Now it seems to parse things fine. Is that an expected behavior?
FWIW, the lookahead that the array element is looking for is defined here...
type JSONArray = unit {
: /\[/;
values: JSONArrayElement[];
: /\]/;
};
@sethhall, do you have a compact reproducer of parser+input for this? I could imagine that lookahead parsing with possibly empty matches might have weird edge cases (though e.g., it seems your parser should always consume some input).
Sorry. Don't have a minimal reproducer yet but I'll try to make one today.
I just published another set of changes after chatting with Robin. The parser now works correctly, but we did identify a bug in the lookahead parsing. I'll still try to get around to creating a minimal reproducer today. (It has something to do with multiple regex fields in a structure after a look-ahead when the %skip option is set.)
Ok, just published some more changes to this. String escaping is working now (except for the \uXXXX thing they do in json, that's a bit more complicated and I don't feel like working on it).
I also updated it to use spicy with the bug fix that just went in regarding multiple regular expressions in a row. Also using the new builtin to_real function on byte arrays.
I've now published another change set that makes this support jsonc (JSON with comments).
Fixed an issue with vector handling during string parsing.