Created
June 8, 2024 18:46
-
-
Save ehaliewicz/f9d0d359fdff64ca4eec4ee723201742 to your computer and use it in GitHub Desktop.
reg thing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// no includes | |
typedef typeof(sizeof(0)) size_t; | |
#define NULL 0 | |
int printf ( const char * format, ... ); | |
void* memcpy(void *dest, const void * src, size_t n); | |
void exit(int); | |
typedef signed long s32; | |
typedef unsigned long u32; | |
typedef signed short s16; | |
typedef unsigned short u16; | |
typedef unsigned char u8; | |
typedef signed char s8; | |
typedef struct { | |
u32 success:1; | |
u32 tgt2_is_cur:1; | |
s32 tgt1:15; | |
s32 tgt2:15; | |
} inst; | |
#define MAX_PROG_LEN 32768 | |
#define EOS 0 | |
#define LANCHOR_CHAR 1 | |
#define RANCHOR_CHAR 2 | |
#define EPSILON_CHAR 3 | |
// program is set up in two streams, instructions, and characters to match | |
// if an instruction being evaluated has the success flag set, we can bail out early with success | |
// if the character matches the current character, we can add tgt1 and tgt2 (relative targets) to the current and next set of states, respectively | |
// however if the tgt2_is_cur flag is set, tgt2 is instead added to the current set (this is used to fork to two locations at once without consuming characters) | |
typedef struct { | |
int length; | |
u8 chars[MAX_PROG_LEN]; | |
inst insts[MAX_PROG_LEN]; | |
} program; | |
typedef enum { | |
OPTION=0, | |
KLEENE=1, | |
PLUS=2, | |
SEQ=3, | |
ALTERNATE=4, | |
LANCHOR=5, | |
RANCHOR=6, | |
MATCH_CHAR=7, | |
MATCH_ANY=8 | |
} node_type; | |
typedef struct node node; | |
struct node { | |
char c; | |
node_type type; | |
node *left, *right; | |
}; | |
int compile_node(node* n, program* comp_output) { | |
char mc; | |
switch(n->type) { | |
case SEQ: do { | |
int ret = compile_node(n->left, comp_output); | |
compile_node(n->right, comp_output); | |
return ret; | |
} while(0); | |
break; | |
case RANCHOR: do { | |
int idx = comp_output->length; | |
comp_output->chars[idx] = RANCHOR_CHAR; | |
comp_output->insts[idx] = ((inst){.tgt1 = 1}); | |
return comp_output->length++; | |
} while(0); | |
break; | |
case MATCH_ANY: | |
mc = EPSILON_CHAR; | |
case MATCH_CHAR: do { | |
mc = n->c; | |
//printf("CHAR OR ANY\n"); | |
int idx = comp_output->length; | |
comp_output->chars[idx] = mc; | |
comp_output->insts[idx] = ((inst){.tgt2 = 1}); | |
return comp_output->length++; | |
} while(0); | |
break; | |
case ALTERNATE: do { | |
int fork_loc = comp_output->length++; | |
int left_expr_loc = compile_node(n->left, comp_output); | |
int jump_loc = comp_output->length++; | |
int right_expr_loc = compile_node(n->right, comp_output); | |
int jump_tgt = comp_output->length; | |
// add fork to start of expression and past expression | |
comp_output->chars[fork_loc] = EPSILON_CHAR; | |
comp_output->insts[fork_loc] = ((inst){.tgt1 = (left_expr_loc-fork_loc), | |
.tgt2 = (right_expr_loc-fork_loc), .tgt2_is_cur = 1}); | |
comp_output->chars[jump_loc] = EPSILON_CHAR; | |
comp_output->insts[jump_loc] = ((inst){.tgt1 = (jump_tgt - jump_loc)}); | |
return fork_loc; | |
} while(0); | |
break; | |
case OPTION: do { | |
int fork_loc = comp_output->length++; | |
compile_node(n->left, comp_output); | |
int target = comp_output->length++; | |
comp_output->insts[fork_loc] = ((inst){.tgt1 = 1,.tgt2 = (fork_loc-target), .tgt2_is_cur=1}); | |
comp_output->chars[fork_loc] = EPSILON_CHAR; | |
return fork_loc; | |
} while(0); | |
break; | |
case KLEENE: do { | |
int fork_loc = comp_output->length++; | |
int start_of_kleene_expr = compile_node(n->left, comp_output); | |
int after_loc = comp_output->length++; | |
int jump_tgt = after_loc+1; | |
// add fork to start of expression and past expression | |
comp_output->chars[fork_loc] = EPSILON_CHAR; | |
comp_output->insts[fork_loc] = ((inst){.tgt1 = (jump_tgt-fork_loc), .tgt2 = 1, .tgt2_is_cur = 1}); | |
comp_output->chars[after_loc] = EPSILON_CHAR; | |
comp_output->insts[after_loc] = ((inst){.tgt1 = (fork_loc-after_loc)}); | |
return fork_loc; | |
} while(0); | |
break; | |
default: | |
printf("Unexpected node type %i\n", n->type); | |
exit(1); | |
} | |
} | |
program* compile_tree(node* n, program* p) { | |
compile_node(n, p); | |
p->insts[p->length] = ((inst){.success=1}); | |
p->chars[p->length++] = EPSILON_CHAR; | |
return p; | |
} | |
/* | |
vector ("SIMD") stuff | |
the idea here is to write this all in normal C, then convert to intrinsics | |
*/ | |
typedef struct { | |
u16 els[8]; | |
} vector_u16; | |
typedef struct { | |
u32 els[8]; | |
} vector_u32; | |
typedef struct { | |
char els[8]; | |
} vector_u8; | |
void print_inst(inst i){ | |
printf("s%i:f%i:t%i:c%i", i.success, i.tgt1, i.tgt2, i.tgt2_is_cur); | |
} | |
void print_inst_vec(vector_u32 v) { | |
printf("<"); | |
for(int i = 0; i < 8; i++) { | |
u32 el = v.els[i]; | |
inst* iptr = (inst*)⪙ | |
print_inst(*iptr); printf(", "); | |
} | |
printf(">\n"); | |
} | |
void print_char_vec(vector_u8 v) { | |
printf("<"); | |
for(int i = 0; i < 8; i++) { | |
printf("'%c' ", v.els[i]); | |
} | |
printf(">\n"); | |
} | |
void print_bool_vec(vector_u8 v) { | |
printf("<"); | |
for(int i = 0; i < 8; i++) { | |
printf("%i", v.els[i]?1:0); | |
} | |
printf(">\n"); | |
} | |
u8 popcnt_u8_vec(vector_u8 v) { | |
char c = 0; | |
for(int i = 0; i < 8; i++) { | |
c += (v.els[i] ? 1 : 0); | |
} | |
return c; | |
} | |
vector_u16 load_u16s_vector_with_offset(u16* array, u16 offset) { | |
vector_u16 res; | |
for(int i = 0; i < 8; i++) { | |
res.els[i] = array[i+offset]; | |
} | |
return res; | |
} | |
vector_u32 gather_u32s_idx_u16s(u32* array, vector_u16 idxs) { | |
vector_u32 u32s; | |
for(int i = 0; i < 8; i++) { | |
u32s.els[i] = array[idxs.els[i]]; | |
} | |
return u32s; | |
} | |
vector_u8 gather_u8s_idx_u16s(u8* array, vector_u16 idxs) { | |
vector_u8 u8s; | |
for(int i = 0; i < 8; i++) { | |
u8s.els[i] = array[idxs.els[i]]; | |
} | |
return u8s; | |
} | |
vector_u8 scalar_to_u8_vec(u8 a) { | |
return ((vector_u8){.els = {a,a,a,a,a,a,a,a}}); | |
} | |
vector_u16 scalar_to_u16_vec(u8 a) { | |
return ((vector_u16){.els = {a,a,a,a,a,a,a,a}}); | |
} | |
vector_u8 u8_vec_eq(vector_u8 a, vector_u8 b) { | |
vector_u8 res; | |
for(int ii = 0; ii < 8; ii++) { | |
res.els[ii] = a.els[ii] == b.els[ii]; | |
} | |
return res; | |
} | |
vector_u8 u16_vec_neq(vector_u16 a, vector_u16 b) { | |
vector_u8 res; | |
for(int ii = 0; ii < 8; ii++) { | |
res.els[ii] = a.els[ii] == b.els[ii]; | |
} | |
return res; | |
} | |
vector_u8 u8_vec_and(vector_u8 a, vector_u8 b) { | |
vector_u8 res; | |
for(int ii = 0; ii < 8; ii++) { | |
res.els[ii] = a.els[ii] & b.els[ii]; | |
} | |
return res; | |
} | |
vector_u8 u8_vec_or(vector_u8 a, vector_u8 b) { | |
vector_u8 res; | |
for(int ii = 0; ii < 8; ii++) { | |
res.els[ii] = a.els[ii] | b.els[ii]; | |
} | |
return res; | |
} | |
vector_u8 u8_vec_to_bits(vector_u8 a) { | |
vector_u8 res; | |
for(int i = 0; i < 8; i++) { | |
res.els[i] = a.els[i] ? 1 : 0; | |
} | |
return res; | |
} | |
#define min(a,b) ((a) < (b) ? (a) : (b)) | |
char getchar(); | |
//#define DEBUG | |
int match(program* p, const char *input) { | |
u8 cur_states_set[MAX_PROG_LEN/8] = { 0 }; | |
u8 next_states_set[MAX_PROG_LEN/8] = { 0 }; | |
u16 num_cur_states = 0; | |
u16 num_next_states = 0; | |
u16 cur_states[MAX_PROG_LEN] = { 0 }; | |
u16 next_states[MAX_PROG_LEN] = { 0 }; | |
cur_states_set[0] |= 1; | |
cur_states[num_cur_states++] = 0; | |
u8* prog_chars = p->chars; | |
inst* prog_insts = p->insts; | |
char* ip = (char*)input; | |
char cur_char = *ip++; | |
int state_iterations = 0; | |
int simd_iterations = 0; | |
int state_char_evaluations = 0; | |
do { | |
if(num_cur_states == 0) { | |
#ifdef DEBUG | |
printf("Out of states\n"); | |
#endif | |
break; | |
} | |
state_iterations++; | |
#ifdef DEBUG | |
printf(" \n\n ITERATION %i\n", state_iterations); | |
printf(" CUR CHAR '%c' (idx: %li)\n\n", cur_char, ip-1-input); | |
#endif | |
int is_lanchor_char = (ip == input+1); | |
int is_ranchor_char = (cur_char == '\0'); | |
int i = 0; | |
while(i < num_cur_states) { | |
int rem_states = num_cur_states - i; | |
int num_process_states = min(rem_states, 8); | |
simd_iterations++; | |
state_char_evaluations += num_process_states; | |
#ifdef DEBUG | |
printf("%i total states\n", num_cur_states); | |
printf("cur position in states: %i\n", i); | |
#endif | |
vector_u16 pcs = load_u16s_vector_with_offset(cur_states, i); | |
#ifdef DEBUG | |
printf("processing %i states\n", num_process_states); | |
printf("pcs: "); | |
printf("<"); | |
print_pcs_vec(pcs); | |
printf(">\n"); | |
#endif | |
vector_u8 is_valid_state = { .els = { 0, 0, 0, 0, 0, 0, 0, 0 } }; | |
for(int ii = 0; ii < num_process_states; ii++) { | |
is_valid_state.els[ii] = 1; | |
} | |
#ifdef DEBUG | |
printf("valid states: "); | |
print_bool_vec(is_valid_state); | |
getchar(); | |
#endif | |
vector_u32 insts = gather_u32s_idx_u16s((u32*)prog_insts, pcs); | |
vector_u8 chars = gather_u8s_idx_u16s(prog_chars, pcs); | |
vector_u8 success_bytes; | |
// there should be a function doing this via shifts rather than bit field access, oh well | |
for(int ii = 0; ii < 8; ii++) { | |
success_bytes = ((inst)(insts.els[ii])).success; | |
} | |
vector_u8 is_success = u8_vec_and(is_valid_state, u8_vec_to_bits(success_bytes)); | |
#ifdef DEBUG | |
print_inst_vec(insts); | |
printf("successes: "); | |
print_bool_vec(is_success); | |
#endif | |
if(popcnt_u8_vec(is_success)) { | |
printf("states evaluated %i, state-char evaluations %i, simd iterations %i\n", state_iterations, state_char_evaluations, simd_iterations); | |
printf("%f ilp via simd\n", state_char_evaluations / ((float)simd_iterations)); | |
return simd_iterations; | |
} | |
#ifdef DEBUG | |
print_char_vec(chars); | |
#endif | |
vector_u8 matches_char = u8_vec_eq(chars, scalar_to_u8_vec(cur_char)); | |
vector_u8 is_epsilon = u8_vec_eq(chars, scalar_to_u8_vec(EPSILON_CHAR)); | |
vector_u8 is_lanchor = u8_vec_eq(chars, scalar_to_u8_vec(LANCHOR_CHAR)); | |
vector_u8 is_ranchor = u8_vec_eq(chars, scalar_to_u8_vec(RANCHOR_CHAR)); | |
vector_u8 is_lit_char = u8_vec_to_bits(u8_vec_or(is_epsilon, u8_vec_or(is_lanchor, is_ranchor))); | |
//for(int ii = 0; ii < 8; ii++) { | |
//is_lit_char.els[ii] = (is_epsilon.els[ii] | is_lanchor.els[ii] | is_ranchor.els[ii]) ? 0 : 1; | |
//} | |
#ifdef DEBUG | |
printf("is epsilon: "); | |
print_bool_vec(is_epsilon); | |
printf("is lanchor: "); | |
print_bool_vec(is_lanchor); | |
printf("is ranchor: "); | |
print_bool_vec(is_ranchor); | |
printf("is lit char: "); | |
print_bool_vec(is_lit_char); | |
#endif | |
vector_u8 is_lanchor_and_matches_lanchor = u8_vec_and(is_lanchor, scalar_to_u8_vec(is_lanchor_char)); | |
vector_u8 is_ranchor_and_matches_ranchor = u8_vec_and(is_ranchor, scalar_to_u8_vec(is_ranchor_char)); | |
vector_u8 is_lit_char_and_matches_lit_char = u8_vec_and(is_lit_char, scalar_to_u8_vec(matches_char)); | |
#ifdef DEBUG | |
printf("is lanchor and matches: "); | |
print_bool_vec(is_lanchor_and_matches_lanchor); | |
printf("is ranchor and matches: "); | |
print_bool_vec(is_ranchor_and_matches_ranchor); | |
printf("is lit char and matches: "); | |
print_bool_vec(is_lit_char_and_matches_lit_char); | |
#endif | |
vector_u8 matches = u8_vec_or(is_epsilon, | |
u8_vec_or(is_lanchor_and_matches_lanchor, | |
u8_vec_or(is_ranchor_and_matches_ranchor, | |
is_lit_char_and_matches_lit_char))); | |
#ifdef DEBUG | |
printf("got matches: "); | |
print_bool_vec(matches); | |
#endif | |
vector_u16 rel_tgt1, rel_tgt2; | |
for(int ii = 0; ii < 8; ii++) { | |
rel_tgt1.els[ii] = ((inst)(insts.els[ii])).tgt1; | |
rel_tgt2.els[ii] = ((inst)(insts.els[ii])).tgt2; | |
} | |
vector_u16 abs_tgt1 = u16_vec_add(rel_tgt1, pcs); | |
vector_u16 abs_tgt2 = u16_vec_add(rel_tgt2, pcs); | |
vector_u8 tgt1_valid = u16_vec_neq(rel_tgt1, scalar_to_u8_vec(0)); | |
vector_u8 tgt2_valid = u16_vec_neq(rel_tgt1, scalar_to_u8_vec(0)); | |
vector_u8 tgt1_cur_exists; | |
vector_u8 tgt2_next_exists; | |
vector_u8 tgt2_cur_exists; | |
for(int ii = 0; ii < 8; ii++) { | |
u16 tgt1 = abs_tgt1.pcs[ii]; | |
u16 tgt2 = abs_tgt2.pcs[ii]; | |
#ifdef DEBUG | |
printf("tgt1 %i, tgt2 %i\n", tgt1, tgt2); | |
#endif | |
u16 tgt1_byte_idx = (tgt1>>3); | |
u16 tgt2_byte_idx = (tgt2>>3); | |
u16 tgt1_bit_idx = (tgt1 & 0b111); | |
u16 tgt2_bit_idx = (tgt2 & 0b111); | |
tgt1_cur_exists.els[ii] = (cur_states_set[tgt1_byte_idx] & (1 << tgt1_bit_idx)); | |
tgt2_next_exists.els[ii] = (next_states_set[tgt2_byte_idx] & (1 << tgt2_bit_idx)); | |
#ifdef DEBUG | |
printf("css: %i\n", cur_states_set[tgt1_byte_idx]); | |
printf("tgt1_cur_exists %i\n", tgt1_cur_exists.els[ii]); | |
printf("nss: %i\n", next_states_set[tgt2_byte_idx]); | |
printf("tgt2_next_exists %i\n", tgt2_cur_exists.els[ii]); | |
#endif | |
tgt2_cur_exists.els[ii] = (cur_states_set[tgt2_byte_idx] & (1<<tgt2_bit_idx)); | |
} | |
// we also need to cross reference WITHIN these bitmaps, so that we only add one of the same state at a time. | |
// but i don't feel like doing that right now | |
#ifdef DEBUG | |
printf("tgt1_cur_exists "); | |
print_bool_vec(tgt1_cur_exists); | |
printf("tgt2_next_exists "); | |
print_bool_vec(tgt2_next_exists); | |
printf("tgt2_cur_exists "); | |
print_bool_vec(tgt2_cur_exists); | |
#endif | |
vector_u8 add_to_next_set; | |
vector_u8 add_to_cur_set; | |
vector_u8 add_to_cur_set_tgt2; | |
for(int ii = 0; ii < 8; ii++) { | |
add_to_cur_set.els[ii] = (matches.els[ii] & tgt1_valid.els[ii] & (tgt1_cur_exists.els[ii] == 0) & is_valid_state.els[ii]); | |
add_to_next_set.els[ii] = (matches.els[ii] & tgt2_valid.els[ii] & (tgt2_next_exists.els[ii] == 0) & (!insts.els[ii].tgt2_is_cur) & is_valid_state.els[ii]); | |
add_to_cur_set_tgt2.els[ii] = (matches.els[ii] & tgt2_valid.els[ii] & (tgt2_cur_exists.els[ii] == 0) & (insts.els[ii].tgt2_is_cur) & is_valid_state.els[ii]); | |
} | |
#ifdef DEBUG | |
printf("add to cur set: "); | |
print_bool_vec(add_to_cur_set); | |
printf("add to cur set tgt2: "); | |
print_bool_vec(add_to_cur_set_tgt2); | |
printf("add to next set: "); | |
print_bool_vec(add_to_next_set); | |
#endif | |
vector_u8 masks[8] = { | |
((vector_u8){.els = { 0, 0, 0, 0, 0, 0, 0, 0}}), | |
((vector_u8){.els = { 1, 0, 0, 0, 0, 0, 0, 0}}), | |
((vector_u8){.els = { 1, 1, 0, 0, 0, 0, 0, 0}}), | |
((vector_u8){.els = { 1, 1, 1, 0, 0, 0, 0, 0}}), | |
((vector_u8){.els = { 1, 1, 1, 1, 0, 0, 0, 0}}), | |
((vector_u8){.els = { 1, 1, 1, 1, 1, 0, 0, 0}}), | |
((vector_u8){.els = { 1, 1, 1, 1, 1, 1, 0, 0}}), | |
((vector_u8){.els = { 1, 1, 1, 1, 1, 1, 1, 0}}), | |
}; | |
vector_u8 rel_cur_state_idxs, rel_cur2_state_idxs; | |
vector_u8 rel_next_state_idxs; | |
for(int ii = 0; ii < 8; ii++) { | |
vector_u8 masked_cur, masked_next; | |
for(int jj = 0; jj < 8; jj++) { | |
masked_cur.els[jj] = masks[ii].els[jj] & add_to_cur_set.els[jj]; | |
masked_next.els[jj] = masks[ii].els[jj] & add_to_next_set.els[jj]; | |
} | |
rel_cur_state_idxs.els[ii] = popcnt_char_vec(masked_cur); | |
rel_next_state_idxs.els[ii] = popcnt_char_vec(masked_next); | |
} | |
#ifdef DEBUG | |
printf("rel cur state idxs: "); | |
print_bool_vec(rel_cur_state_idxs); | |
printf("rel next state idxs: "); | |
print_bool_vec(rel_next_state_idxs); | |
printf("\n\n"); | |
#endif | |
for(int ii = 0; ii < 8; ii++) { | |
if(add_to_cur_set.els[ii]) { | |
s16 tgt1 = insts.els[ii].tgt1 + pcs.pcs[ii]; | |
cur_states_set[tgt1>>3] |= (1 << (tgt1&0b111)); | |
#ifdef DEBUG | |
printf("tgt1 %i css: %i\n", tgt1, cur_states_set[tgt1>>3]); | |
printf("should be zero %i\n", (cur_states_set[tgt1>>3] & (1 << (tgt1&0b111)))); | |
#endif | |
cur_states[num_cur_states + rel_cur_state_idxs.els[ii]] = insts.els[ii].tgt1 + pcs.pcs[ii]; | |
} | |
if(add_to_next_set.els[ii]) { | |
s16 tgt2 = insts.els[ii].tgt2 + pcs.pcs[ii]; | |
next_states_set[tgt2>>3] |= (1 << (tgt2&0b111)); | |
next_states[num_next_states + rel_next_state_idxs.els[ii]] = insts.els[ii].tgt2 + pcs.pcs[ii]; | |
} | |
} | |
num_cur_states += popcnt_char_vec(add_to_cur_set); | |
num_next_states += popcnt_char_vec(add_to_next_set); | |
for(int ii = 0; ii < 8; ii++) { | |
vector_u8 masked_cur; | |
for(int jj = 0; jj < 8; jj++) { | |
masked_cur.els[jj] = masks[ii].els[jj] & add_to_cur_set_tgt2.els[jj]; | |
} | |
rel_cur2_state_idxs.els[ii] = popcnt_char_vec(masked_cur); | |
} | |
#ifdef DEBUG | |
printf("rel cur2 state idxs: "); | |
print_bool_vec(rel_cur2_state_idxs); | |
#endif | |
for(int ii = 0; ii < 8; ii++) { | |
if(add_to_cur_set_tgt2.els[ii]) { | |
s16 tgt2 = insts.els[ii].tgt2 + pcs.pcs[ii]; | |
cur_states_set[tgt2>>3] |= (1 << (tgt2&0b111)); | |
cur_states[num_cur_states + rel_cur2_state_idxs.els[ii]] = insts.els[ii].tgt2 + pcs.pcs[ii]; | |
} | |
} | |
for(int ii = 0; ii < MAX_PROG_LEN/8; ii++) { | |
cur_states_set[ii] = next_states_set[ii]; | |
next_states_set[ii] = 0; | |
} | |
num_cur_states += popcnt_char_vec(add_to_cur_set_tgt2); | |
i += num_process_states; | |
} | |
num_cur_states = num_next_states; | |
memcpy(cur_states, next_states, sizeof(u16)*num_cur_states); | |
num_next_states = 0; | |
if(cur_char == '\0') { | |
break; | |
} | |
cur_char = *ip++; | |
} while(1); | |
printf("states evaluated %i, state-char evaluations %i, simd iterations %i\n", state_iterations, state_char_evaluations, simd_iterations); | |
printf("%f ilp via simd\n", state_char_evaluations / ((float)simd_iterations)); | |
return 0; | |
} | |
#undef DEBUG | |
int node_idx = 0; | |
node free_nodes[16384]; | |
node* alloc_node() { | |
return &free_nodes[node_idx++]; // :^) | |
} | |
void free_node(node* n) { | |
node_idx--; | |
if(n != &free_nodes[node_idx]) { | |
printf("Free node order error!\n"); | |
exit(1); | |
} | |
} | |
int is_letter(char c) { | |
return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')); | |
} | |
int is_digit(char c) { | |
return (c >= '0' && c <= '9'); | |
} | |
int is_meta_char(char c) { | |
return ((c == '(') || (c == ')') || (c == '^') || (c == '$') || (c == '?') || (c == '+') || (c == '*') || (c == '|')); | |
} | |
int is_valid_match_char(char c) { | |
return !is_meta_char(c) && c != '\0'; | |
} | |
/* | |
// recursive descent parser combinators (could be breadth-first like the regex evaluator but that would mean more work) | |
*/ | |
typedef struct { | |
u8 success; // successful parse? | |
node* n; // parsed node | |
char* rest; // string after what we parsed | |
} parse_res; | |
parse_res invalid_parse = {.success = 0}; | |
typedef struct parser parser; | |
typedef parse_res (*parser_func)(parser* p, char* a); | |
struct parser { | |
char* last_start_char; | |
char* name; | |
parser_func pf; | |
char c; | |
parser *left, *right, *middle; | |
u8 node_type; | |
}; | |
int parser_idx = 0; | |
parser free_parsers[16384]; | |
parser* alloc_parser(char* name, parser_func pf, char c, parser* left, parser* right, parser* middle, u8 node_type) { | |
parser* p = &free_parsers[parser_idx++]; | |
p->pf = pf; p->c = c; p->left = left; p->right = right; p->middle = middle; p->node_type = node_type; | |
p->name = name; | |
p->last_start_char = NULL; | |
return p; | |
} | |
int depth = 0; | |
parse_res apply_parser(parser* p, char* a) { | |
#ifdef DEBUG | |
for(int i = 0; i < depth; i++) { | |
printf(i == depth-1 ? "|-" : "| "); | |
} | |
printf("%s\n", p->name); | |
getchar(); | |
#endif | |
if(p->last_start_char == a) { | |
#ifdef DEBUG | |
for(int i = 0; i < depth; i++) { | |
printf(i == depth-1 ? "\\-" : "| "); | |
} | |
printf("Bailing out to prevent infinite recursion char ptr is %p\n", a); | |
#endif | |
p->last_start_char = NULL; | |
return invalid_parse; | |
} | |
p->last_start_char = a; | |
depth++; | |
parse_res pr = p->pf(p, a); | |
depth--; | |
#ifdef DEBUG | |
for(int i = 0; i < depth; i++) { | |
printf(i == depth-1 ? "\\-" : "| "); | |
} | |
#endif | |
// clear cache to prevent alternatives from failing | |
p->last_start_char = NULL; | |
#ifdef DEBUG | |
if(pr.success) { | |
printf("%s SUCCESS\n", p->name); | |
} else { | |
printf("%s FAIL\n", p->name); | |
} | |
#endif | |
return pr; | |
} | |
parse_res inner_parse_syntax_char(parser *p, char* a) { | |
#ifdef DEBUG | |
printf("checking '%c' vs expected '%c'\n", *a, p->c); | |
#endif | |
if(*a != p->c) { | |
return invalid_parse; | |
} | |
return ((parse_res){.success = 1, .rest = a+1}); | |
} | |
parse_res inner_parse_syntax_node(parser *p, char* a) { | |
#ifdef DEBUG | |
printf("checking '%c' vs expected '%c'\n", *a, p->c); | |
#endif | |
if(*a != p->c) { | |
return invalid_parse; | |
} | |
node* n = alloc_node(); | |
n->type = p->node_type; | |
return ((parse_res){.success = 1, .n = n, .rest = a+1}); | |
} | |
parse_res inner_parse_char(parser *p, char* a) { | |
if(!is_valid_match_char(*a)) { | |
return invalid_parse; | |
} | |
char match_char; | |
if(*a == '.') { | |
match_char = EPSILON_CHAR; | |
} else { | |
match_char = *a; | |
} | |
node* n = alloc_node(); | |
n->type = MATCH_CHAR; | |
n->c = match_char; | |
return ((parse_res) {.success = 1, .rest = a+1, .n = n}); | |
} | |
parser* make_parse_char(char* name) { | |
return alloc_parser(name, &inner_parse_char, 0, NULL, NULL, NULL, MATCH_CHAR); | |
} | |
parser* make_parse_syntax_char(char* name, char c) { | |
return alloc_parser(name, &inner_parse_syntax_char, c, NULL, NULL, NULL, 0); | |
} | |
parser* make_parse_syntax_node(char* name, char c, node_type n) { | |
return alloc_parser(name, &inner_parse_syntax_node, c, NULL, NULL, NULL, n); | |
} | |
parse_res inner_parse_wrapped(parser* p, char* a) { | |
parse_res pl = apply_parser(p->left, a); | |
if(!pl.success) { return invalid_parse; } | |
parse_res pm = apply_parser(p->middle, pl.rest); | |
if(!pm.success) { return invalid_parse; } | |
parse_res pr = apply_parser(p->right, pm.rest); | |
if(!pr.success) { return invalid_parse; } | |
return ((parse_res){.success = 1, .rest = pr.rest, .n = pm.n}); | |
} | |
parser* make_parse_wrapped(char* name, parser* left, parser* middle, parser* right) { | |
return alloc_parser(name, inner_parse_wrapped, 0, left, right, middle, NULL); | |
} | |
parse_res inner_parse_or(parser* p, char* a) { | |
parse_res pa = apply_parser(p->left, a); | |
if(!pa.success) { | |
return apply_parser(p->right, a); | |
} | |
return pa; | |
} | |
parser* make_parse_or(char* name, parser* a, parser* b) { | |
return alloc_parser(name, &inner_parse_or, 0, a, b, NULL, 0); | |
} | |
parse_res inner_parse_seq(parser* p, char* a) { | |
parse_res pa = apply_parser(p->left, a); | |
if(!pa.success) { | |
return invalid_parse; | |
} | |
parse_res pb = apply_parser(p->right, pa.rest); | |
if(!pb.success) { | |
return invalid_parse; | |
} | |
node* n = alloc_node(); | |
n->type = SEQ; | |
n->left = pa.n; | |
n->right = pb.n; | |
return ((parse_res){.success = 1, .rest = pb.rest, .n = n}); | |
} | |
parser* make_parse_seq(char* name, parser* a, parser* b) { | |
return alloc_parser(name, &inner_parse_seq, 0, a, b, NULL, SEQ); | |
} | |
parse_res inner_parse_unary_op(parser* p, char* a) { | |
parse_res expr = apply_parser(p->left, a); | |
if(!expr.success) { return invalid_parse; } | |
parse_res op = apply_parser(p->right, expr.rest); | |
if(!op.success) { return invalid_parse; } | |
node* n = alloc_node(); | |
n->type = p->node_type; | |
n->left = expr.n; | |
return ((parse_res){.success = 1, .n = n, .rest = op.rest}); | |
} | |
parser* make_parse_unary_op(char* name, parser* recur, parser* op, u8 node_type) { | |
return alloc_parser(name, &inner_parse_unary_op, 0, recur, op, NULL, node_type); | |
} | |
parse_res inner_parse_binary_op(parser* p, char* a) { | |
parse_res expr1 = apply_parser(p->left, a); | |
if(!expr1.success) { return invalid_parse; } | |
parse_res op = apply_parser(p->middle, expr1.rest); | |
if(!op.success) { return invalid_parse; } | |
parse_res expr2 = apply_parser(p->right, op.rest); | |
if(!expr2.success) { return invalid_parse; } | |
node* n = alloc_node(); | |
n->type = p->node_type; | |
n->left = expr1.n; | |
n->right = expr2.n; | |
return ((parse_res){.success = 1, .n = n, .rest = expr2.rest}); | |
} | |
parser* make_parse_binary_op(char* name, parser* lrecur, parser* op, parser* rrecur, u8 node_type) { | |
return alloc_parser(name, &inner_parse_binary_op, 0, lrecur, rrecur, op, node_type); | |
} | |
node* parse_pattern(char* a) { | |
parser* re; | |
parser* parse_group = make_parse_wrapped("GROUP", make_parse_syntax_char("LPAREN_CHAR", '('), re, make_parse_syntax_char("RPAREN_CHAR", ')')); | |
parser* parse_any = make_parse_syntax_node("ANY_CHAR", '.', MATCH_ANY); | |
parser* parse_eos = make_parse_syntax_node("EOS_CHAR", '$', RANCHOR); | |
parser* parse_char = make_parse_char("CHAR"); | |
parser* elementary_re = make_parse_or("ELEMENTARY_RE", parse_group, | |
make_parse_or("ELEMENTARY_RE2", parse_any, | |
make_parse_or("ELEMENTARY_RE3", parse_eos, parse_char))); | |
parser* parse_star = make_parse_unary_op("STAR", elementary_re, make_parse_syntax_char("STAR_CHAR", '*'), KLEENE); | |
parser* parse_plus = make_parse_unary_op("PLUS", elementary_re, make_parse_syntax_char("PLUS_CHAR", '+'), PLUS); | |
parser* parse_option = make_parse_unary_op("OPTION", elementary_re, make_parse_syntax_char("OPTION_CHAR", '?'), OPTION); | |
parser* basic_re = make_parse_or("BASIC_RE", parse_option, make_parse_or("BASIC_RE2", parse_star, make_parse_or("BASIC_RE3", parse_plus, elementary_re))); | |
// didn't write a function to do this recursion | |
parser* concatenation; | |
parser* simple_re = make_parse_or("SIMPLE_RE", concatenation, basic_re); | |
concatenation = make_parse_seq("CONCATENATION", basic_re, simple_re); | |
simple_re->left = concatenation; | |
// same here | |
parser* alternate = make_parse_binary_op("ALTERNATE", simple_re, make_parse_syntax_char("ALTERNATE_CHAR", '|'), re, ALTERNATE); | |
re = make_parse_or("RE", alternate, simple_re); | |
parse_group->middle = re; | |
alternate->right = re; | |
parser* top = make_parse_seq("TOP", re, make_parse_syntax_char("END_OF_PATTERN_CHAR", '\0')); | |
parse_res parsed = apply_parser(top, a); | |
if(!parsed.success) { | |
printf("Error parsing pattern\n"); | |
exit(1); | |
} | |
return parsed.n->left; | |
} | |
int main(int argc, char** argv) { | |
if(argc != 3) { | |
printf("Usage: ./preg pattern string\n"); | |
return 1; | |
} | |
printf("parsing...\n"); | |
node* parsed_tree = parse_pattern(argv[1]); | |
program p; | |
printf("parsed!\ncompiling...\n"); | |
compile_tree(parsed_tree, &p); | |
printf("compiled!\nevaluating...\n"); | |
#ifdef DEBUG | |
for(int i = 0; i < p.length; i++) { | |
print_inst(p.els[i]); printf(" '%c'\n", p.els[i]); | |
} | |
#endif | |
int matched = match(&p, argv[2]); | |
printf("evaluated.\n"); | |
if(matched) { | |
printf("MATCH (%i iterations)\n", matched); | |
} else { | |
printf("FAIL\n"); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment