Skip to content

Instantly share code, notes, and snippets.

@ehaliewicz
Created June 8, 2024 18:46
Show Gist options
  • Save ehaliewicz/f9d0d359fdff64ca4eec4ee723201742 to your computer and use it in GitHub Desktop.
Save ehaliewicz/f9d0d359fdff64ca4eec4ee723201742 to your computer and use it in GitHub Desktop.
reg thing
// no includes
typedef typeof(sizeof(0)) size_t;
#define NULL 0
int printf ( const char * format, ... );
void* memcpy(void *dest, const void * src, size_t n);
void exit(int);
typedef signed long s32;
typedef unsigned long u32;
typedef signed short s16;
typedef unsigned short u16;
typedef unsigned char u8;
typedef signed char s8;
typedef struct {
u32 success:1;
u32 tgt2_is_cur:1;
s32 tgt1:15;
s32 tgt2:15;
} inst;
#define MAX_PROG_LEN 32768
#define EOS 0
#define LANCHOR_CHAR 1
#define RANCHOR_CHAR 2
#define EPSILON_CHAR 3
// program is set up in two streams, instructions, and characters to match
// if an instruction being evaluated has the success flag set, we can bail out early with success
// if the character matches the current character, we can add tgt1 and tgt2 (relative targets) to the current and next set of states, respectively
// however if the tgt2_is_cur flag is set, tgt2 is instead added to the current set (this is used to fork to two locations at once without consuming characters)
typedef struct {
int length;
u8 chars[MAX_PROG_LEN];
inst insts[MAX_PROG_LEN];
} program;
typedef enum {
OPTION=0,
KLEENE=1,
PLUS=2,
SEQ=3,
ALTERNATE=4,
LANCHOR=5,
RANCHOR=6,
MATCH_CHAR=7,
MATCH_ANY=8
} node_type;
typedef struct node node;
struct node {
char c;
node_type type;
node *left, *right;
};
int compile_node(node* n, program* comp_output) {
char mc;
switch(n->type) {
case SEQ: do {
int ret = compile_node(n->left, comp_output);
compile_node(n->right, comp_output);
return ret;
} while(0);
break;
case RANCHOR: do {
int idx = comp_output->length;
comp_output->chars[idx] = RANCHOR_CHAR;
comp_output->insts[idx] = ((inst){.tgt1 = 1});
return comp_output->length++;
} while(0);
break;
case MATCH_ANY:
mc = EPSILON_CHAR;
case MATCH_CHAR: do {
mc = n->c;
//printf("CHAR OR ANY\n");
int idx = comp_output->length;
comp_output->chars[idx] = mc;
comp_output->insts[idx] = ((inst){.tgt2 = 1});
return comp_output->length++;
} while(0);
break;
case ALTERNATE: do {
int fork_loc = comp_output->length++;
int left_expr_loc = compile_node(n->left, comp_output);
int jump_loc = comp_output->length++;
int right_expr_loc = compile_node(n->right, comp_output);
int jump_tgt = comp_output->length;
// add fork to start of expression and past expression
comp_output->chars[fork_loc] = EPSILON_CHAR;
comp_output->insts[fork_loc] = ((inst){.tgt1 = (left_expr_loc-fork_loc),
.tgt2 = (right_expr_loc-fork_loc), .tgt2_is_cur = 1});
comp_output->chars[jump_loc] = EPSILON_CHAR;
comp_output->insts[jump_loc] = ((inst){.tgt1 = (jump_tgt - jump_loc)});
return fork_loc;
} while(0);
break;
case OPTION: do {
int fork_loc = comp_output->length++;
compile_node(n->left, comp_output);
int target = comp_output->length++;
comp_output->insts[fork_loc] = ((inst){.tgt1 = 1,.tgt2 = (fork_loc-target), .tgt2_is_cur=1});
comp_output->chars[fork_loc] = EPSILON_CHAR;
return fork_loc;
} while(0);
break;
case KLEENE: do {
int fork_loc = comp_output->length++;
int start_of_kleene_expr = compile_node(n->left, comp_output);
int after_loc = comp_output->length++;
int jump_tgt = after_loc+1;
// add fork to start of expression and past expression
comp_output->chars[fork_loc] = EPSILON_CHAR;
comp_output->insts[fork_loc] = ((inst){.tgt1 = (jump_tgt-fork_loc), .tgt2 = 1, .tgt2_is_cur = 1});
comp_output->chars[after_loc] = EPSILON_CHAR;
comp_output->insts[after_loc] = ((inst){.tgt1 = (fork_loc-after_loc)});
return fork_loc;
} while(0);
break;
default:
printf("Unexpected node type %i\n", n->type);
exit(1);
}
}
program* compile_tree(node* n, program* p) {
compile_node(n, p);
p->insts[p->length] = ((inst){.success=1});
p->chars[p->length++] = EPSILON_CHAR;
return p;
}
/*
vector ("SIMD") stuff
the idea here is to write this all in normal C, then convert to intrinsics
*/
typedef struct {
u16 els[8];
} vector_u16;
typedef struct {
u32 els[8];
} vector_u32;
typedef struct {
char els[8];
} vector_u8;
void print_inst(inst i){
printf("s%i:f%i:t%i:c%i", i.success, i.tgt1, i.tgt2, i.tgt2_is_cur);
}
void print_inst_vec(vector_u32 v) {
printf("<");
for(int i = 0; i < 8; i++) {
u32 el = v.els[i];
inst* iptr = (inst*)&el;
print_inst(*iptr); printf(", ");
}
printf(">\n");
}
void print_char_vec(vector_u8 v) {
printf("<");
for(int i = 0; i < 8; i++) {
printf("'%c' ", v.els[i]);
}
printf(">\n");
}
void print_bool_vec(vector_u8 v) {
printf("<");
for(int i = 0; i < 8; i++) {
printf("%i", v.els[i]?1:0);
}
printf(">\n");
}
u8 popcnt_u8_vec(vector_u8 v) {
char c = 0;
for(int i = 0; i < 8; i++) {
c += (v.els[i] ? 1 : 0);
}
return c;
}
vector_u16 load_u16s_vector_with_offset(u16* array, u16 offset) {
vector_u16 res;
for(int i = 0; i < 8; i++) {
res.els[i] = array[i+offset];
}
return res;
}
vector_u32 gather_u32s_idx_u16s(u32* array, vector_u16 idxs) {
vector_u32 u32s;
for(int i = 0; i < 8; i++) {
u32s.els[i] = array[idxs.els[i]];
}
return u32s;
}
vector_u8 gather_u8s_idx_u16s(u8* array, vector_u16 idxs) {
vector_u8 u8s;
for(int i = 0; i < 8; i++) {
u8s.els[i] = array[idxs.els[i]];
}
return u8s;
}
vector_u8 scalar_to_u8_vec(u8 a) {
return ((vector_u8){.els = {a,a,a,a,a,a,a,a}});
}
vector_u16 scalar_to_u16_vec(u8 a) {
return ((vector_u16){.els = {a,a,a,a,a,a,a,a}});
}
vector_u8 u8_vec_eq(vector_u8 a, vector_u8 b) {
vector_u8 res;
for(int ii = 0; ii < 8; ii++) {
res.els[ii] = a.els[ii] == b.els[ii];
}
return res;
}
vector_u8 u16_vec_neq(vector_u16 a, vector_u16 b) {
vector_u8 res;
for(int ii = 0; ii < 8; ii++) {
res.els[ii] = a.els[ii] == b.els[ii];
}
return res;
}
vector_u8 u8_vec_and(vector_u8 a, vector_u8 b) {
vector_u8 res;
for(int ii = 0; ii < 8; ii++) {
res.els[ii] = a.els[ii] & b.els[ii];
}
return res;
}
vector_u8 u8_vec_or(vector_u8 a, vector_u8 b) {
vector_u8 res;
for(int ii = 0; ii < 8; ii++) {
res.els[ii] = a.els[ii] | b.els[ii];
}
return res;
}
vector_u8 u8_vec_to_bits(vector_u8 a) {
vector_u8 res;
for(int i = 0; i < 8; i++) {
res.els[i] = a.els[i] ? 1 : 0;
}
return res;
}
#define min(a,b) ((a) < (b) ? (a) : (b))
char getchar();
//#define DEBUG
int match(program* p, const char *input) {
u8 cur_states_set[MAX_PROG_LEN/8] = { 0 };
u8 next_states_set[MAX_PROG_LEN/8] = { 0 };
u16 num_cur_states = 0;
u16 num_next_states = 0;
u16 cur_states[MAX_PROG_LEN] = { 0 };
u16 next_states[MAX_PROG_LEN] = { 0 };
cur_states_set[0] |= 1;
cur_states[num_cur_states++] = 0;
u8* prog_chars = p->chars;
inst* prog_insts = p->insts;
char* ip = (char*)input;
char cur_char = *ip++;
int state_iterations = 0;
int simd_iterations = 0;
int state_char_evaluations = 0;
do {
if(num_cur_states == 0) {
#ifdef DEBUG
printf("Out of states\n");
#endif
break;
}
state_iterations++;
#ifdef DEBUG
printf(" \n\n ITERATION %i\n", state_iterations);
printf(" CUR CHAR '%c' (idx: %li)\n\n", cur_char, ip-1-input);
#endif
int is_lanchor_char = (ip == input+1);
int is_ranchor_char = (cur_char == '\0');
int i = 0;
while(i < num_cur_states) {
int rem_states = num_cur_states - i;
int num_process_states = min(rem_states, 8);
simd_iterations++;
state_char_evaluations += num_process_states;
#ifdef DEBUG
printf("%i total states\n", num_cur_states);
printf("cur position in states: %i\n", i);
#endif
vector_u16 pcs = load_u16s_vector_with_offset(cur_states, i);
#ifdef DEBUG
printf("processing %i states\n", num_process_states);
printf("pcs: ");
printf("<");
print_pcs_vec(pcs);
printf(">\n");
#endif
vector_u8 is_valid_state = { .els = { 0, 0, 0, 0, 0, 0, 0, 0 } };
for(int ii = 0; ii < num_process_states; ii++) {
is_valid_state.els[ii] = 1;
}
#ifdef DEBUG
printf("valid states: ");
print_bool_vec(is_valid_state);
getchar();
#endif
vector_u32 insts = gather_u32s_idx_u16s((u32*)prog_insts, pcs);
vector_u8 chars = gather_u8s_idx_u16s(prog_chars, pcs);
vector_u8 success_bytes;
// there should be a function doing this via shifts rather than bit field access, oh well
for(int ii = 0; ii < 8; ii++) {
success_bytes = ((inst)(insts.els[ii])).success;
}
vector_u8 is_success = u8_vec_and(is_valid_state, u8_vec_to_bits(success_bytes));
#ifdef DEBUG
print_inst_vec(insts);
printf("successes: ");
print_bool_vec(is_success);
#endif
if(popcnt_u8_vec(is_success)) {
printf("states evaluated %i, state-char evaluations %i, simd iterations %i\n", state_iterations, state_char_evaluations, simd_iterations);
printf("%f ilp via simd\n", state_char_evaluations / ((float)simd_iterations));
return simd_iterations;
}
#ifdef DEBUG
print_char_vec(chars);
#endif
vector_u8 matches_char = u8_vec_eq(chars, scalar_to_u8_vec(cur_char));
vector_u8 is_epsilon = u8_vec_eq(chars, scalar_to_u8_vec(EPSILON_CHAR));
vector_u8 is_lanchor = u8_vec_eq(chars, scalar_to_u8_vec(LANCHOR_CHAR));
vector_u8 is_ranchor = u8_vec_eq(chars, scalar_to_u8_vec(RANCHOR_CHAR));
vector_u8 is_lit_char = u8_vec_to_bits(u8_vec_or(is_epsilon, u8_vec_or(is_lanchor, is_ranchor)));
//for(int ii = 0; ii < 8; ii++) {
//is_lit_char.els[ii] = (is_epsilon.els[ii] | is_lanchor.els[ii] | is_ranchor.els[ii]) ? 0 : 1;
//}
#ifdef DEBUG
printf("is epsilon: ");
print_bool_vec(is_epsilon);
printf("is lanchor: ");
print_bool_vec(is_lanchor);
printf("is ranchor: ");
print_bool_vec(is_ranchor);
printf("is lit char: ");
print_bool_vec(is_lit_char);
#endif
vector_u8 is_lanchor_and_matches_lanchor = u8_vec_and(is_lanchor, scalar_to_u8_vec(is_lanchor_char));
vector_u8 is_ranchor_and_matches_ranchor = u8_vec_and(is_ranchor, scalar_to_u8_vec(is_ranchor_char));
vector_u8 is_lit_char_and_matches_lit_char = u8_vec_and(is_lit_char, scalar_to_u8_vec(matches_char));
#ifdef DEBUG
printf("is lanchor and matches: ");
print_bool_vec(is_lanchor_and_matches_lanchor);
printf("is ranchor and matches: ");
print_bool_vec(is_ranchor_and_matches_ranchor);
printf("is lit char and matches: ");
print_bool_vec(is_lit_char_and_matches_lit_char);
#endif
vector_u8 matches = u8_vec_or(is_epsilon,
u8_vec_or(is_lanchor_and_matches_lanchor,
u8_vec_or(is_ranchor_and_matches_ranchor,
is_lit_char_and_matches_lit_char)));
#ifdef DEBUG
printf("got matches: ");
print_bool_vec(matches);
#endif
vector_u16 rel_tgt1, rel_tgt2;
for(int ii = 0; ii < 8; ii++) {
rel_tgt1.els[ii] = ((inst)(insts.els[ii])).tgt1;
rel_tgt2.els[ii] = ((inst)(insts.els[ii])).tgt2;
}
vector_u16 abs_tgt1 = u16_vec_add(rel_tgt1, pcs);
vector_u16 abs_tgt2 = u16_vec_add(rel_tgt2, pcs);
vector_u8 tgt1_valid = u16_vec_neq(rel_tgt1, scalar_to_u8_vec(0));
vector_u8 tgt2_valid = u16_vec_neq(rel_tgt1, scalar_to_u8_vec(0));
vector_u8 tgt1_cur_exists;
vector_u8 tgt2_next_exists;
vector_u8 tgt2_cur_exists;
for(int ii = 0; ii < 8; ii++) {
u16 tgt1 = abs_tgt1.pcs[ii];
u16 tgt2 = abs_tgt2.pcs[ii];
#ifdef DEBUG
printf("tgt1 %i, tgt2 %i\n", tgt1, tgt2);
#endif
u16 tgt1_byte_idx = (tgt1>>3);
u16 tgt2_byte_idx = (tgt2>>3);
u16 tgt1_bit_idx = (tgt1 & 0b111);
u16 tgt2_bit_idx = (tgt2 & 0b111);
tgt1_cur_exists.els[ii] = (cur_states_set[tgt1_byte_idx] & (1 << tgt1_bit_idx));
tgt2_next_exists.els[ii] = (next_states_set[tgt2_byte_idx] & (1 << tgt2_bit_idx));
#ifdef DEBUG
printf("css: %i\n", cur_states_set[tgt1_byte_idx]);
printf("tgt1_cur_exists %i\n", tgt1_cur_exists.els[ii]);
printf("nss: %i\n", next_states_set[tgt2_byte_idx]);
printf("tgt2_next_exists %i\n", tgt2_cur_exists.els[ii]);
#endif
tgt2_cur_exists.els[ii] = (cur_states_set[tgt2_byte_idx] & (1<<tgt2_bit_idx));
}
// we also need to cross reference WITHIN these bitmaps, so that we only add one of the same state at a time.
// but i don't feel like doing that right now
#ifdef DEBUG
printf("tgt1_cur_exists ");
print_bool_vec(tgt1_cur_exists);
printf("tgt2_next_exists ");
print_bool_vec(tgt2_next_exists);
printf("tgt2_cur_exists ");
print_bool_vec(tgt2_cur_exists);
#endif
vector_u8 add_to_next_set;
vector_u8 add_to_cur_set;
vector_u8 add_to_cur_set_tgt2;
for(int ii = 0; ii < 8; ii++) {
add_to_cur_set.els[ii] = (matches.els[ii] & tgt1_valid.els[ii] & (tgt1_cur_exists.els[ii] == 0) & is_valid_state.els[ii]);
add_to_next_set.els[ii] = (matches.els[ii] & tgt2_valid.els[ii] & (tgt2_next_exists.els[ii] == 0) & (!insts.els[ii].tgt2_is_cur) & is_valid_state.els[ii]);
add_to_cur_set_tgt2.els[ii] = (matches.els[ii] & tgt2_valid.els[ii] & (tgt2_cur_exists.els[ii] == 0) & (insts.els[ii].tgt2_is_cur) & is_valid_state.els[ii]);
}
#ifdef DEBUG
printf("add to cur set: ");
print_bool_vec(add_to_cur_set);
printf("add to cur set tgt2: ");
print_bool_vec(add_to_cur_set_tgt2);
printf("add to next set: ");
print_bool_vec(add_to_next_set);
#endif
vector_u8 masks[8] = {
((vector_u8){.els = { 0, 0, 0, 0, 0, 0, 0, 0}}),
((vector_u8){.els = { 1, 0, 0, 0, 0, 0, 0, 0}}),
((vector_u8){.els = { 1, 1, 0, 0, 0, 0, 0, 0}}),
((vector_u8){.els = { 1, 1, 1, 0, 0, 0, 0, 0}}),
((vector_u8){.els = { 1, 1, 1, 1, 0, 0, 0, 0}}),
((vector_u8){.els = { 1, 1, 1, 1, 1, 0, 0, 0}}),
((vector_u8){.els = { 1, 1, 1, 1, 1, 1, 0, 0}}),
((vector_u8){.els = { 1, 1, 1, 1, 1, 1, 1, 0}}),
};
vector_u8 rel_cur_state_idxs, rel_cur2_state_idxs;
vector_u8 rel_next_state_idxs;
for(int ii = 0; ii < 8; ii++) {
vector_u8 masked_cur, masked_next;
for(int jj = 0; jj < 8; jj++) {
masked_cur.els[jj] = masks[ii].els[jj] & add_to_cur_set.els[jj];
masked_next.els[jj] = masks[ii].els[jj] & add_to_next_set.els[jj];
}
rel_cur_state_idxs.els[ii] = popcnt_char_vec(masked_cur);
rel_next_state_idxs.els[ii] = popcnt_char_vec(masked_next);
}
#ifdef DEBUG
printf("rel cur state idxs: ");
print_bool_vec(rel_cur_state_idxs);
printf("rel next state idxs: ");
print_bool_vec(rel_next_state_idxs);
printf("\n\n");
#endif
for(int ii = 0; ii < 8; ii++) {
if(add_to_cur_set.els[ii]) {
s16 tgt1 = insts.els[ii].tgt1 + pcs.pcs[ii];
cur_states_set[tgt1>>3] |= (1 << (tgt1&0b111));
#ifdef DEBUG
printf("tgt1 %i css: %i\n", tgt1, cur_states_set[tgt1>>3]);
printf("should be zero %i\n", (cur_states_set[tgt1>>3] & (1 << (tgt1&0b111))));
#endif
cur_states[num_cur_states + rel_cur_state_idxs.els[ii]] = insts.els[ii].tgt1 + pcs.pcs[ii];
}
if(add_to_next_set.els[ii]) {
s16 tgt2 = insts.els[ii].tgt2 + pcs.pcs[ii];
next_states_set[tgt2>>3] |= (1 << (tgt2&0b111));
next_states[num_next_states + rel_next_state_idxs.els[ii]] = insts.els[ii].tgt2 + pcs.pcs[ii];
}
}
num_cur_states += popcnt_char_vec(add_to_cur_set);
num_next_states += popcnt_char_vec(add_to_next_set);
for(int ii = 0; ii < 8; ii++) {
vector_u8 masked_cur;
for(int jj = 0; jj < 8; jj++) {
masked_cur.els[jj] = masks[ii].els[jj] & add_to_cur_set_tgt2.els[jj];
}
rel_cur2_state_idxs.els[ii] = popcnt_char_vec(masked_cur);
}
#ifdef DEBUG
printf("rel cur2 state idxs: ");
print_bool_vec(rel_cur2_state_idxs);
#endif
for(int ii = 0; ii < 8; ii++) {
if(add_to_cur_set_tgt2.els[ii]) {
s16 tgt2 = insts.els[ii].tgt2 + pcs.pcs[ii];
cur_states_set[tgt2>>3] |= (1 << (tgt2&0b111));
cur_states[num_cur_states + rel_cur2_state_idxs.els[ii]] = insts.els[ii].tgt2 + pcs.pcs[ii];
}
}
for(int ii = 0; ii < MAX_PROG_LEN/8; ii++) {
cur_states_set[ii] = next_states_set[ii];
next_states_set[ii] = 0;
}
num_cur_states += popcnt_char_vec(add_to_cur_set_tgt2);
i += num_process_states;
}
num_cur_states = num_next_states;
memcpy(cur_states, next_states, sizeof(u16)*num_cur_states);
num_next_states = 0;
if(cur_char == '\0') {
break;
}
cur_char = *ip++;
} while(1);
printf("states evaluated %i, state-char evaluations %i, simd iterations %i\n", state_iterations, state_char_evaluations, simd_iterations);
printf("%f ilp via simd\n", state_char_evaluations / ((float)simd_iterations));
return 0;
}
#undef DEBUG
int node_idx = 0;
node free_nodes[16384];
node* alloc_node() {
return &free_nodes[node_idx++]; // :^)
}
void free_node(node* n) {
node_idx--;
if(n != &free_nodes[node_idx]) {
printf("Free node order error!\n");
exit(1);
}
}
int is_letter(char c) {
return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'));
}
int is_digit(char c) {
return (c >= '0' && c <= '9');
}
int is_meta_char(char c) {
return ((c == '(') || (c == ')') || (c == '^') || (c == '$') || (c == '?') || (c == '+') || (c == '*') || (c == '|'));
}
int is_valid_match_char(char c) {
return !is_meta_char(c) && c != '\0';
}
/*
// recursive descent parser combinators (could be breadth-first like the regex evaluator but that would mean more work)
*/
typedef struct {
u8 success; // successful parse?
node* n; // parsed node
char* rest; // string after what we parsed
} parse_res;
parse_res invalid_parse = {.success = 0};
typedef struct parser parser;
typedef parse_res (*parser_func)(parser* p, char* a);
struct parser {
char* last_start_char;
char* name;
parser_func pf;
char c;
parser *left, *right, *middle;
u8 node_type;
};
int parser_idx = 0;
parser free_parsers[16384];
parser* alloc_parser(char* name, parser_func pf, char c, parser* left, parser* right, parser* middle, u8 node_type) {
parser* p = &free_parsers[parser_idx++];
p->pf = pf; p->c = c; p->left = left; p->right = right; p->middle = middle; p->node_type = node_type;
p->name = name;
p->last_start_char = NULL;
return p;
}
int depth = 0;
parse_res apply_parser(parser* p, char* a) {
#ifdef DEBUG
for(int i = 0; i < depth; i++) {
printf(i == depth-1 ? "|-" : "| ");
}
printf("%s\n", p->name);
getchar();
#endif
if(p->last_start_char == a) {
#ifdef DEBUG
for(int i = 0; i < depth; i++) {
printf(i == depth-1 ? "\\-" : "| ");
}
printf("Bailing out to prevent infinite recursion char ptr is %p\n", a);
#endif
p->last_start_char = NULL;
return invalid_parse;
}
p->last_start_char = a;
depth++;
parse_res pr = p->pf(p, a);
depth--;
#ifdef DEBUG
for(int i = 0; i < depth; i++) {
printf(i == depth-1 ? "\\-" : "| ");
}
#endif
// clear cache to prevent alternatives from failing
p->last_start_char = NULL;
#ifdef DEBUG
if(pr.success) {
printf("%s SUCCESS\n", p->name);
} else {
printf("%s FAIL\n", p->name);
}
#endif
return pr;
}
parse_res inner_parse_syntax_char(parser *p, char* a) {
#ifdef DEBUG
printf("checking '%c' vs expected '%c'\n", *a, p->c);
#endif
if(*a != p->c) {
return invalid_parse;
}
return ((parse_res){.success = 1, .rest = a+1});
}
parse_res inner_parse_syntax_node(parser *p, char* a) {
#ifdef DEBUG
printf("checking '%c' vs expected '%c'\n", *a, p->c);
#endif
if(*a != p->c) {
return invalid_parse;
}
node* n = alloc_node();
n->type = p->node_type;
return ((parse_res){.success = 1, .n = n, .rest = a+1});
}
parse_res inner_parse_char(parser *p, char* a) {
if(!is_valid_match_char(*a)) {
return invalid_parse;
}
char match_char;
if(*a == '.') {
match_char = EPSILON_CHAR;
} else {
match_char = *a;
}
node* n = alloc_node();
n->type = MATCH_CHAR;
n->c = match_char;
return ((parse_res) {.success = 1, .rest = a+1, .n = n});
}
parser* make_parse_char(char* name) {
return alloc_parser(name, &inner_parse_char, 0, NULL, NULL, NULL, MATCH_CHAR);
}
parser* make_parse_syntax_char(char* name, char c) {
return alloc_parser(name, &inner_parse_syntax_char, c, NULL, NULL, NULL, 0);
}
parser* make_parse_syntax_node(char* name, char c, node_type n) {
return alloc_parser(name, &inner_parse_syntax_node, c, NULL, NULL, NULL, n);
}
parse_res inner_parse_wrapped(parser* p, char* a) {
parse_res pl = apply_parser(p->left, a);
if(!pl.success) { return invalid_parse; }
parse_res pm = apply_parser(p->middle, pl.rest);
if(!pm.success) { return invalid_parse; }
parse_res pr = apply_parser(p->right, pm.rest);
if(!pr.success) { return invalid_parse; }
return ((parse_res){.success = 1, .rest = pr.rest, .n = pm.n});
}
parser* make_parse_wrapped(char* name, parser* left, parser* middle, parser* right) {
return alloc_parser(name, inner_parse_wrapped, 0, left, right, middle, NULL);
}
parse_res inner_parse_or(parser* p, char* a) {
parse_res pa = apply_parser(p->left, a);
if(!pa.success) {
return apply_parser(p->right, a);
}
return pa;
}
parser* make_parse_or(char* name, parser* a, parser* b) {
return alloc_parser(name, &inner_parse_or, 0, a, b, NULL, 0);
}
parse_res inner_parse_seq(parser* p, char* a) {
parse_res pa = apply_parser(p->left, a);
if(!pa.success) {
return invalid_parse;
}
parse_res pb = apply_parser(p->right, pa.rest);
if(!pb.success) {
return invalid_parse;
}
node* n = alloc_node();
n->type = SEQ;
n->left = pa.n;
n->right = pb.n;
return ((parse_res){.success = 1, .rest = pb.rest, .n = n});
}
parser* make_parse_seq(char* name, parser* a, parser* b) {
return alloc_parser(name, &inner_parse_seq, 0, a, b, NULL, SEQ);
}
parse_res inner_parse_unary_op(parser* p, char* a) {
parse_res expr = apply_parser(p->left, a);
if(!expr.success) { return invalid_parse; }
parse_res op = apply_parser(p->right, expr.rest);
if(!op.success) { return invalid_parse; }
node* n = alloc_node();
n->type = p->node_type;
n->left = expr.n;
return ((parse_res){.success = 1, .n = n, .rest = op.rest});
}
parser* make_parse_unary_op(char* name, parser* recur, parser* op, u8 node_type) {
return alloc_parser(name, &inner_parse_unary_op, 0, recur, op, NULL, node_type);
}
parse_res inner_parse_binary_op(parser* p, char* a) {
parse_res expr1 = apply_parser(p->left, a);
if(!expr1.success) { return invalid_parse; }
parse_res op = apply_parser(p->middle, expr1.rest);
if(!op.success) { return invalid_parse; }
parse_res expr2 = apply_parser(p->right, op.rest);
if(!expr2.success) { return invalid_parse; }
node* n = alloc_node();
n->type = p->node_type;
n->left = expr1.n;
n->right = expr2.n;
return ((parse_res){.success = 1, .n = n, .rest = expr2.rest});
}
parser* make_parse_binary_op(char* name, parser* lrecur, parser* op, parser* rrecur, u8 node_type) {
return alloc_parser(name, &inner_parse_binary_op, 0, lrecur, rrecur, op, node_type);
}
node* parse_pattern(char* a) {
parser* re;
parser* parse_group = make_parse_wrapped("GROUP", make_parse_syntax_char("LPAREN_CHAR", '('), re, make_parse_syntax_char("RPAREN_CHAR", ')'));
parser* parse_any = make_parse_syntax_node("ANY_CHAR", '.', MATCH_ANY);
parser* parse_eos = make_parse_syntax_node("EOS_CHAR", '$', RANCHOR);
parser* parse_char = make_parse_char("CHAR");
parser* elementary_re = make_parse_or("ELEMENTARY_RE", parse_group,
make_parse_or("ELEMENTARY_RE2", parse_any,
make_parse_or("ELEMENTARY_RE3", parse_eos, parse_char)));
parser* parse_star = make_parse_unary_op("STAR", elementary_re, make_parse_syntax_char("STAR_CHAR", '*'), KLEENE);
parser* parse_plus = make_parse_unary_op("PLUS", elementary_re, make_parse_syntax_char("PLUS_CHAR", '+'), PLUS);
parser* parse_option = make_parse_unary_op("OPTION", elementary_re, make_parse_syntax_char("OPTION_CHAR", '?'), OPTION);
parser* basic_re = make_parse_or("BASIC_RE", parse_option, make_parse_or("BASIC_RE2", parse_star, make_parse_or("BASIC_RE3", parse_plus, elementary_re)));
// didn't write a function to do this recursion
parser* concatenation;
parser* simple_re = make_parse_or("SIMPLE_RE", concatenation, basic_re);
concatenation = make_parse_seq("CONCATENATION", basic_re, simple_re);
simple_re->left = concatenation;
// same here
parser* alternate = make_parse_binary_op("ALTERNATE", simple_re, make_parse_syntax_char("ALTERNATE_CHAR", '|'), re, ALTERNATE);
re = make_parse_or("RE", alternate, simple_re);
parse_group->middle = re;
alternate->right = re;
parser* top = make_parse_seq("TOP", re, make_parse_syntax_char("END_OF_PATTERN_CHAR", '\0'));
parse_res parsed = apply_parser(top, a);
if(!parsed.success) {
printf("Error parsing pattern\n");
exit(1);
}
return parsed.n->left;
}
int main(int argc, char** argv) {
if(argc != 3) {
printf("Usage: ./preg pattern string\n");
return 1;
}
printf("parsing...\n");
node* parsed_tree = parse_pattern(argv[1]);
program p;
printf("parsed!\ncompiling...\n");
compile_tree(parsed_tree, &p);
printf("compiled!\nevaluating...\n");
#ifdef DEBUG
for(int i = 0; i < p.length; i++) {
print_inst(p.els[i]); printf(" '%c'\n", p.els[i]);
}
#endif
int matched = match(&p, argv[2]);
printf("evaluated.\n");
if(matched) {
printf("MATCH (%i iterations)\n", matched);
} else {
printf("FAIL\n");
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment