Skip to content

Instantly share code, notes, and snippets.

@baines
Last active March 14, 2023 19:51
Show Gist options
  • Save baines/99ef313a57a32f9b3dc4c6d7e31fe37b to your computer and use it in GitHub Desktop.
Save baines/99ef313a57a32f9b3dc4c6d7e31fe37b to your computer and use it in GitHub Desktop.
json tokenizer
#define _GNU_SOURCE
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
// streaming json tokenizer that operates on fixed sized memory chunks
// no dynamic memory allocation necessary
enum ij_type {
IJ_NULL,
IJ_BOOL_FALSE,
IJ_BOOL_TRUE,
IJ_INT,
IJ_DOUBLE,
IJ_STR,
IJ_OBJ_OPEN,
IJ_OBJ_CLOSE,
IJ_ARR_OPEN,
IJ_ARR_CLOSE,
};
enum {
IJ_PARTIAL = -1,
IJ_MALFORMED = -2,
IJ_OOM = -3,
};
struct ij_parser {
uint_fast32_t state;
uint_fast32_t num_state;
uint_fast32_t udigits;
uint_fast32_t ucode;
const char* literal;
char* work_mem;
char* work_cur;
char* work_end;
};
typedef bool (*ij_callback)(struct ij_parser* p, enum ij_type, const char* ptr, size_t len);
static int ij_parse_num(struct ij_parser*, const char**, size_t, ij_callback, bool eof);
void ij_init(struct ij_parser* ij, char* work_mem, size_t work_mem_size){
assert(work_mem_size > 0);
memset(ij, 0, sizeof(*ij));
ij->work_mem = work_mem;
ij->work_cur = work_mem;
ij->work_end = work_mem + work_mem_size;
}
static inline char ij_lower(char c){
if(c >= 'A' && c <= 'Z')
return c + ('a' - 'A');
return c;
}
int ij_parse(struct ij_parser* ij, const char* json, size_t json_len, ij_callback callback){
const char* p = json;
const char* end = json + json_len;
const bool eof = json_len == 0;
goto *&&resume_start + ij->state;
for(;; ++p){
ij->state = 0;
resume_start:
if(p == end)
break;
if(strchr("\r\n\t :,", *p))
continue;
static const char objarr[] = "{}[]";
const char* c;
int rv;
ij->literal =
*p == 't' ? "true" :
*p == 'f' ? "false" :
*p == 'n' ? "null" :
NULL;
if(ij->literal){
ij->state = &&resume_literal - &&resume_start;
resume_literal:
for(;;){
if(eof || *p != *ij->literal)
return IJ_MALFORMED;
if(!*++ij->literal){
break;
}
if(++p == end)
return IJ_PARTIAL;
}
static const char lookup[] = "lsu";
callback(ij, IJ_NULL + (strchr(lookup, ij->literal[-2]) - lookup), NULL, 0);
ij->literal = NULL;
}
else if(*p == '-' || (*p >= '0' && *p <= '9')){
if(*p == '-'){
*ij->work_cur++ = '-';
++p;
}
ij->state = &&resume_number - &&resume_start;
resume_number:
rv = ij_parse_num(ij, &p, end - p, callback, eof);
if(rv <= 0)
return rv;
}
else if(*p == '"'){
ij->state = &&resume_str - &&resume_start;
++p;
resume_str:
for(;;){
if(p == end)
return IJ_PARTIAL;
if(*p == '"'){
rv = callback(ij, IJ_STR, ij->work_mem, ij->work_cur - ij->work_mem);
ij->work_cur = ij->work_mem;
break;
}
else if(*p == '\\'){
static const char from[] = "\"\\/bfnrt";
static const char to[] = "\"\\/\b\f\n\r\t";
ij->state = &&resume_str_unescape_1 - &&resume_start;
++p;
resume_str_unescape_1:
if(p == end)
return IJ_PARTIAL;
if(ij->work_cur == ij->work_end)
return IJ_OOM;
if((c = strchr(from, ij_lower(*p)))){
++p;
*ij->work_cur++ = to[c - from];
ij->state = &&resume_str - &&resume_start;
} else if(*p == 'u') {
++p;
ij->state = &&resume_str_unescape_2 - &&resume_start;
resume_str_unescape_2:
for(; ij->udigits < 4; ++ij->udigits){
if(p == end)
return IJ_PARTIAL;
static const char hex[] = "0123456789abcdef";
c = strchr(hex, ij_lower(*p));
if(!c)
return IJ_MALFORMED;
ij->ucode = (ij->ucode << 4) | (c - hex);
++p;
}
// convert to utf-8
static const size_t counts[] = { 0x7f, 0x7ff, 0xffff };
int i;
for(i = 0; i < 3; ++i){
if(ij->ucode <= counts[i])
break;
}
if(i == 3)
return IJ_MALFORMED;
if(ij->work_end - ij->work_cur < (i+1))
return IJ_OOM;
if(i == 0){
*ij->work_cur++ = ij->ucode;
} else if(i == 1){
*ij->work_cur++ = (ij->ucode >> 6) | 0xc0;
*ij->work_cur++ = (ij->ucode & 0x3f) | 0x80;
} else {
*ij->work_cur++ = (ij->ucode >> 12) | 0xe0;
*ij->work_cur++ = ((ij->ucode >> 6) & 0x3f) | 0x80;
*ij->work_cur++ = (ij->ucode & 0x3f) | 0x80;
}
ij->udigits = ij->ucode = 0;
ij->state = &&resume_str - &&resume_start;
} else {
return IJ_MALFORMED;
}
} else {
if(ij->work_cur == ij->work_end)
return IJ_OOM;
*ij->work_cur++ = *p++;
}
}
}
else if((c = strchr(objarr, *p))){
rv = callback(ij, IJ_OBJ_OPEN + (c - objarr), NULL, 0);
if(rv <= 0)
return rv;
}
else {
return IJ_MALFORMED;
}
}
return 0;
}
static inline int ij_parse_num(struct ij_parser* ij, const char** json, size_t json_len, ij_callback callback, bool eof){
const char* start = *json;
if(eof)
goto eof_check;
enum {
IJ_NS_INITIAL,
IJ_NS_ZERO,
IJ_NS_DIGIT,
IJ_NS_MANTISSA,
IJ_NS_EXP_SIGN,
IJ_NS_EXPONENT,
};
while(*json - start < json_len){
char c = **json;
int new_state = -1;
if(strchr(",}]: \t\r\n", c)){
int rv;
eof_check:
switch(ij->num_state){
case IJ_NS_ZERO:
case IJ_NS_DIGIT: {
rv = callback(ij, IJ_INT, ij->work_mem, ij->work_cur - ij->work_mem);
} break;
case IJ_NS_MANTISSA:
case IJ_NS_EXPONENT: {
rv = callback(ij, IJ_DOUBLE, ij->work_mem, ij->work_cur - ij->work_mem);
} break;
default:
rv = IJ_MALFORMED;
}
--*json;
ij->work_cur = ij->work_mem;
ij->num_state = IJ_NS_INITIAL;
return rv;
}
switch(ij->num_state){
case IJ_NS_INITIAL: {
if(c == '0')
new_state = IJ_NS_ZERO;
else if(c >= '1' && c <= '9')
new_state = IJ_NS_DIGIT;
} break;
case IJ_NS_ZERO: {
if(c == '.')
new_state = IJ_NS_MANTISSA;
else if(c == 'e' || c == 'E')
new_state = IJ_NS_EXP_SIGN;
} break;
case IJ_NS_DIGIT: {
if(c >= '0' && c <= '9')
new_state = IJ_NS_DIGIT;
else if(c == '.')
new_state = IJ_NS_MANTISSA;
else if(c == 'e' || c == 'E')
new_state = IJ_NS_EXP_SIGN;
} break;
case IJ_NS_MANTISSA: {
if(c >= '0' && c <= '9')
new_state = IJ_NS_MANTISSA;
else if(c == 'e' || c == 'E')
new_state = IJ_NS_EXP_SIGN;
} break;
case IJ_NS_EXP_SIGN: {
if(c == '+' || c == '-' || (c >= '0' && c <= '9'))
new_state = IJ_NS_EXPONENT;
} break;
case IJ_NS_EXPONENT: {
if(c >= '0' && c <= '9')
new_state = IJ_NS_EXPONENT;
} break;
}
if(new_state == -1){
ij->num_state = IJ_NS_INITIAL;
return IJ_MALFORMED;
}
if(ij->work_cur == ij->work_end){
ij->num_state = IJ_NS_INITIAL;
return IJ_OOM;
}
ij->num_state = new_state;
*ij->work_cur++ = *(*json)++;
}
return IJ_PARTIAL;
}
// 0-malloc pretty print example, reformats arbitrarily large files
// (with trailing commas...)
// gcc -xc -DTEST inso_json.h -o ij
// ./ij < stuff.json
#ifdef TEST
#include <stdlib.h>
#include <stdio.h>
struct state {
struct ij_parser p;
int nesting;
__uint128_t in_obj;
bool obj_key;
};
bool ij_cb(struct ij_parser* p, enum ij_type type, const char* data, size_t len){
struct state* state = (struct state*)p;
bool in_obj = state->in_obj & (1 << state->nesting);
#define indent (in_obj ? state->obj_key ? state->nesting * 2 : 0 : state->nesting * 2), ""
switch(type){
case IJ_NULL:
printf("%*s\e[0;35mnull\e[0m,\n", indent);
break;
case IJ_BOOL_FALSE:
printf("%*s\e[0;35mfalse\e[0m,\n", indent);
break;
case IJ_BOOL_TRUE:
printf("%*s\e[0;35mtrue\e[0m,\n", indent);
break;
case IJ_INT:
printf("%*s\e[0;33m%lld\e[0m,\n", indent, strtoll(strndupa(data, len), NULL, 10));
break;
case IJ_DOUBLE:
printf("%*s\e[0;33m%g\e[0m,\n", indent, strtod(data, NULL));
break;
case IJ_STR:
if(state->obj_key){
printf("%*s\e[1;34m\"%.*s\"\e[0m: ", indent, (int)len, data);
} else {
printf("%*s\e[0;32m\"%.*s\"\e[0m,\n", indent, (int)len, data);
}
break;
case IJ_OBJ_OPEN: {
printf("%*s{\n", indent);
++state->nesting;
assert(state->nesting < 128);
state->in_obj |= (1 << state->nesting);
state->obj_key = false;
in_obj = true;
} break;
case IJ_OBJ_CLOSE: {
state->in_obj &= ~(1 << state->nesting);
--state->nesting;
assert(state->nesting >= 0);
printf("%*s},\n", indent);
state->obj_key = false;
in_obj = state->in_obj & (1 << state->nesting);
} break;
case IJ_ARR_OPEN: {
state->obj_key = false;
printf("%*s[\n", indent);
++state->nesting;
assert(state->nesting < 128);
in_obj = false;
} break;
case IJ_ARR_CLOSE: {
--state->nesting;
assert(state->nesting >= 0);
printf("%*s],\n", indent);
state->obj_key = false;
in_obj = state->in_obj & (1 << state->nesting);
} break;
}
#undef indent
if(in_obj){
state->obj_key = !state->obj_key;
}
return true;
}
static const char* stati[] = {
"Ok", "Completed partial tokenize", "Malformed input", "Out of working memory"
};
int main(int argc, char** argv){
char mem[1024]; // working mem size limits the largest parsable string
char buf[32]; // intermediary buf can be as small as 2 bytes - you can parse chunks e.g. as they download
struct state state = {};
ij_init(&state.p, mem, sizeof(mem));
int status;
for(;;){
status = 0;
if(!fgets(buf, sizeof(buf), stdin))
break;
status = ij_parse(&state.p, buf, strlen(buf), &ij_cb);
if(status < IJ_PARTIAL){
break;
}
}
if(status == 0){
// to finish partial numbers
status = ij_parse(&state.p, NULL, 0, &ij_cb);
}
fprintf(stderr, "Status: %s.\n", stati[-status]);
return 0;
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment