Last active
March 14, 2023 19:51
-
-
Save baines/99ef313a57a32f9b3dc4c6d7e31fe37b to your computer and use it in GitHub Desktop.
json tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _GNU_SOURCE | |
#include <stdint.h> | |
#include <stdbool.h> | |
#include <string.h> | |
#include <assert.h> | |
// streaming json tokenizer that operates on fixed sized memory chunks | |
// no dynamic memory allocation necessary | |
enum ij_type { | |
IJ_NULL, | |
IJ_BOOL_FALSE, | |
IJ_BOOL_TRUE, | |
IJ_INT, | |
IJ_DOUBLE, | |
IJ_STR, | |
IJ_OBJ_OPEN, | |
IJ_OBJ_CLOSE, | |
IJ_ARR_OPEN, | |
IJ_ARR_CLOSE, | |
}; | |
enum { | |
IJ_PARTIAL = -1, | |
IJ_MALFORMED = -2, | |
IJ_OOM = -3, | |
}; | |
struct ij_parser { | |
uint_fast32_t state; | |
uint_fast32_t num_state; | |
uint_fast32_t udigits; | |
uint_fast32_t ucode; | |
const char* literal; | |
char* work_mem; | |
char* work_cur; | |
char* work_end; | |
}; | |
typedef bool (*ij_callback)(struct ij_parser* p, enum ij_type, const char* ptr, size_t len); | |
static int ij_parse_num(struct ij_parser*, const char**, size_t, ij_callback, bool eof); | |
void ij_init(struct ij_parser* ij, char* work_mem, size_t work_mem_size){ | |
assert(work_mem_size > 0); | |
memset(ij, 0, sizeof(*ij)); | |
ij->work_mem = work_mem; | |
ij->work_cur = work_mem; | |
ij->work_end = work_mem + work_mem_size; | |
} | |
static inline char ij_lower(char c){ | |
if(c >= 'A' && c <= 'Z') | |
return c + ('a' - 'A'); | |
return c; | |
} | |
int ij_parse(struct ij_parser* ij, const char* json, size_t json_len, ij_callback callback){ | |
const char* p = json; | |
const char* end = json + json_len; | |
const bool eof = json_len == 0; | |
goto *&&resume_start + ij->state; | |
for(;; ++p){ | |
ij->state = 0; | |
resume_start: | |
if(p == end) | |
break; | |
if(strchr("\r\n\t :,", *p)) | |
continue; | |
static const char objarr[] = "{}[]"; | |
const char* c; | |
int rv; | |
ij->literal = | |
*p == 't' ? "true" : | |
*p == 'f' ? "false" : | |
*p == 'n' ? "null" : | |
NULL; | |
if(ij->literal){ | |
ij->state = &&resume_literal - &&resume_start; | |
resume_literal: | |
for(;;){ | |
if(eof || *p != *ij->literal) | |
return IJ_MALFORMED; | |
if(!*++ij->literal){ | |
break; | |
} | |
if(++p == end) | |
return IJ_PARTIAL; | |
} | |
static const char lookup[] = "lsu"; | |
callback(ij, IJ_NULL + (strchr(lookup, ij->literal[-2]) - lookup), NULL, 0); | |
ij->literal = NULL; | |
} | |
else if(*p == '-' || (*p >= '0' && *p <= '9')){ | |
if(*p == '-'){ | |
*ij->work_cur++ = '-'; | |
++p; | |
} | |
ij->state = &&resume_number - &&resume_start; | |
resume_number: | |
rv = ij_parse_num(ij, &p, end - p, callback, eof); | |
if(rv <= 0) | |
return rv; | |
} | |
else if(*p == '"'){ | |
ij->state = &&resume_str - &&resume_start; | |
++p; | |
resume_str: | |
for(;;){ | |
if(p == end) | |
return IJ_PARTIAL; | |
if(*p == '"'){ | |
rv = callback(ij, IJ_STR, ij->work_mem, ij->work_cur - ij->work_mem); | |
ij->work_cur = ij->work_mem; | |
break; | |
} | |
else if(*p == '\\'){ | |
static const char from[] = "\"\\/bfnrt"; | |
static const char to[] = "\"\\/\b\f\n\r\t"; | |
ij->state = &&resume_str_unescape_1 - &&resume_start; | |
++p; | |
resume_str_unescape_1: | |
if(p == end) | |
return IJ_PARTIAL; | |
if(ij->work_cur == ij->work_end) | |
return IJ_OOM; | |
if((c = strchr(from, ij_lower(*p)))){ | |
++p; | |
*ij->work_cur++ = to[c - from]; | |
ij->state = &&resume_str - &&resume_start; | |
} else if(*p == 'u') { | |
++p; | |
ij->state = &&resume_str_unescape_2 - &&resume_start; | |
resume_str_unescape_2: | |
for(; ij->udigits < 4; ++ij->udigits){ | |
if(p == end) | |
return IJ_PARTIAL; | |
static const char hex[] = "0123456789abcdef"; | |
c = strchr(hex, ij_lower(*p)); | |
if(!c) | |
return IJ_MALFORMED; | |
ij->ucode = (ij->ucode << 4) | (c - hex); | |
++p; | |
} | |
// convert to utf-8 | |
static const size_t counts[] = { 0x7f, 0x7ff, 0xffff }; | |
int i; | |
for(i = 0; i < 3; ++i){ | |
if(ij->ucode <= counts[i]) | |
break; | |
} | |
if(i == 3) | |
return IJ_MALFORMED; | |
if(ij->work_end - ij->work_cur < (i+1)) | |
return IJ_OOM; | |
if(i == 0){ | |
*ij->work_cur++ = ij->ucode; | |
} else if(i == 1){ | |
*ij->work_cur++ = (ij->ucode >> 6) | 0xc0; | |
*ij->work_cur++ = (ij->ucode & 0x3f) | 0x80; | |
} else { | |
*ij->work_cur++ = (ij->ucode >> 12) | 0xe0; | |
*ij->work_cur++ = ((ij->ucode >> 6) & 0x3f) | 0x80; | |
*ij->work_cur++ = (ij->ucode & 0x3f) | 0x80; | |
} | |
ij->udigits = ij->ucode = 0; | |
ij->state = &&resume_str - &&resume_start; | |
} else { | |
return IJ_MALFORMED; | |
} | |
} else { | |
if(ij->work_cur == ij->work_end) | |
return IJ_OOM; | |
*ij->work_cur++ = *p++; | |
} | |
} | |
} | |
else if((c = strchr(objarr, *p))){ | |
rv = callback(ij, IJ_OBJ_OPEN + (c - objarr), NULL, 0); | |
if(rv <= 0) | |
return rv; | |
} | |
else { | |
return IJ_MALFORMED; | |
} | |
} | |
return 0; | |
} | |
static inline int ij_parse_num(struct ij_parser* ij, const char** json, size_t json_len, ij_callback callback, bool eof){ | |
const char* start = *json; | |
if(eof) | |
goto eof_check; | |
enum { | |
IJ_NS_INITIAL, | |
IJ_NS_ZERO, | |
IJ_NS_DIGIT, | |
IJ_NS_MANTISSA, | |
IJ_NS_EXP_SIGN, | |
IJ_NS_EXPONENT, | |
}; | |
while(*json - start < json_len){ | |
char c = **json; | |
int new_state = -1; | |
if(strchr(",}]: \t\r\n", c)){ | |
int rv; | |
eof_check: | |
switch(ij->num_state){ | |
case IJ_NS_ZERO: | |
case IJ_NS_DIGIT: { | |
rv = callback(ij, IJ_INT, ij->work_mem, ij->work_cur - ij->work_mem); | |
} break; | |
case IJ_NS_MANTISSA: | |
case IJ_NS_EXPONENT: { | |
rv = callback(ij, IJ_DOUBLE, ij->work_mem, ij->work_cur - ij->work_mem); | |
} break; | |
default: | |
rv = IJ_MALFORMED; | |
} | |
--*json; | |
ij->work_cur = ij->work_mem; | |
ij->num_state = IJ_NS_INITIAL; | |
return rv; | |
} | |
switch(ij->num_state){ | |
case IJ_NS_INITIAL: { | |
if(c == '0') | |
new_state = IJ_NS_ZERO; | |
else if(c >= '1' && c <= '9') | |
new_state = IJ_NS_DIGIT; | |
} break; | |
case IJ_NS_ZERO: { | |
if(c == '.') | |
new_state = IJ_NS_MANTISSA; | |
else if(c == 'e' || c == 'E') | |
new_state = IJ_NS_EXP_SIGN; | |
} break; | |
case IJ_NS_DIGIT: { | |
if(c >= '0' && c <= '9') | |
new_state = IJ_NS_DIGIT; | |
else if(c == '.') | |
new_state = IJ_NS_MANTISSA; | |
else if(c == 'e' || c == 'E') | |
new_state = IJ_NS_EXP_SIGN; | |
} break; | |
case IJ_NS_MANTISSA: { | |
if(c >= '0' && c <= '9') | |
new_state = IJ_NS_MANTISSA; | |
else if(c == 'e' || c == 'E') | |
new_state = IJ_NS_EXP_SIGN; | |
} break; | |
case IJ_NS_EXP_SIGN: { | |
if(c == '+' || c == '-' || (c >= '0' && c <= '9')) | |
new_state = IJ_NS_EXPONENT; | |
} break; | |
case IJ_NS_EXPONENT: { | |
if(c >= '0' && c <= '9') | |
new_state = IJ_NS_EXPONENT; | |
} break; | |
} | |
if(new_state == -1){ | |
ij->num_state = IJ_NS_INITIAL; | |
return IJ_MALFORMED; | |
} | |
if(ij->work_cur == ij->work_end){ | |
ij->num_state = IJ_NS_INITIAL; | |
return IJ_OOM; | |
} | |
ij->num_state = new_state; | |
*ij->work_cur++ = *(*json)++; | |
} | |
return IJ_PARTIAL; | |
} | |
// 0-malloc pretty print example, reformats arbitrarily large files | |
// (with trailing commas...) | |
// gcc -xc -DTEST inso_json.h -o ij | |
// ./ij < stuff.json | |
#ifdef TEST | |
#include <stdlib.h> | |
#include <stdio.h> | |
struct state { | |
struct ij_parser p; | |
int nesting; | |
__uint128_t in_obj; | |
bool obj_key; | |
}; | |
bool ij_cb(struct ij_parser* p, enum ij_type type, const char* data, size_t len){ | |
struct state* state = (struct state*)p; | |
bool in_obj = state->in_obj & (1 << state->nesting); | |
#define indent (in_obj ? state->obj_key ? state->nesting * 2 : 0 : state->nesting * 2), "" | |
switch(type){ | |
case IJ_NULL: | |
printf("%*s\e[0;35mnull\e[0m,\n", indent); | |
break; | |
case IJ_BOOL_FALSE: | |
printf("%*s\e[0;35mfalse\e[0m,\n", indent); | |
break; | |
case IJ_BOOL_TRUE: | |
printf("%*s\e[0;35mtrue\e[0m,\n", indent); | |
break; | |
case IJ_INT: | |
printf("%*s\e[0;33m%lld\e[0m,\n", indent, strtoll(strndupa(data, len), NULL, 10)); | |
break; | |
case IJ_DOUBLE: | |
printf("%*s\e[0;33m%g\e[0m,\n", indent, strtod(data, NULL)); | |
break; | |
case IJ_STR: | |
if(state->obj_key){ | |
printf("%*s\e[1;34m\"%.*s\"\e[0m: ", indent, (int)len, data); | |
} else { | |
printf("%*s\e[0;32m\"%.*s\"\e[0m,\n", indent, (int)len, data); | |
} | |
break; | |
case IJ_OBJ_OPEN: { | |
printf("%*s{\n", indent); | |
++state->nesting; | |
assert(state->nesting < 128); | |
state->in_obj |= (1 << state->nesting); | |
state->obj_key = false; | |
in_obj = true; | |
} break; | |
case IJ_OBJ_CLOSE: { | |
state->in_obj &= ~(1 << state->nesting); | |
--state->nesting; | |
assert(state->nesting >= 0); | |
printf("%*s},\n", indent); | |
state->obj_key = false; | |
in_obj = state->in_obj & (1 << state->nesting); | |
} break; | |
case IJ_ARR_OPEN: { | |
state->obj_key = false; | |
printf("%*s[\n", indent); | |
++state->nesting; | |
assert(state->nesting < 128); | |
in_obj = false; | |
} break; | |
case IJ_ARR_CLOSE: { | |
--state->nesting; | |
assert(state->nesting >= 0); | |
printf("%*s],\n", indent); | |
state->obj_key = false; | |
in_obj = state->in_obj & (1 << state->nesting); | |
} break; | |
} | |
#undef indent | |
if(in_obj){ | |
state->obj_key = !state->obj_key; | |
} | |
return true; | |
} | |
static const char* stati[] = { | |
"Ok", "Completed partial tokenize", "Malformed input", "Out of working memory" | |
}; | |
int main(int argc, char** argv){ | |
char mem[1024]; // working mem size limits the largest parsable string | |
char buf[32]; // intermediary buf can be as small as 2 bytes - you can parse chunks e.g. as they download | |
struct state state = {}; | |
ij_init(&state.p, mem, sizeof(mem)); | |
int status; | |
for(;;){ | |
status = 0; | |
if(!fgets(buf, sizeof(buf), stdin)) | |
break; | |
status = ij_parse(&state.p, buf, strlen(buf), &ij_cb); | |
if(status < IJ_PARTIAL){ | |
break; | |
} | |
} | |
if(status == 0){ | |
// to finish partial numbers | |
status = ij_parse(&state.p, NULL, 0, &ij_cb); | |
} | |
fprintf(stderr, "Status: %s.\n", stati[-status]); | |
return 0; | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment