Skip to content

Instantly share code, notes, and snippets.

@bl0ckeduser
Created August 25, 2012 20:36
Show Gist options
  • Save bl0ckeduser/3470729 to your computer and use it in GitHub Desktop.
Save bl0ckeduser/3470729 to your computer and use it in GitHub Desktop.
NFA wannabe regex
/*
* Wannabe regex (egrep-style)
* Wannabe automatic regex tokenizer
* Wannabe egrep command (./a.out --wgr 'pattern')
*
* The wannabe egrep can make a picture of the compiled NFA
* this way:
* ./a.out --wgr 'pattern' --pic | pic | troff -ms
*
* Supported 'special' characters:
* ? + * . ^ $ \
* the backslash is to escape special status.
* For their exact meaning, see the compiler
* routine, add_token().
*
* (troff output has to be converted to some
* useful format: e.g. on my computer I pipe troff
* to "grops" [provided by free package groff]
* to get postscript)
*
* Now with support for multiple epsilon edges
* as well as lazy/greedy/minimalist choice-mode
* choice. (Yeah, you get to choose how the program
* chooses). Please note that it uses NFAs, which
* are inferior to DFAs. Nasty backtracking involved.
* There's some algorithm with matrices and stuff to
* convert, but that'll be for another time. I'm not
* even a CS student or professional or teacher or
* anything, I'm a kid, give me a little time.
*
* One day I should try to rewrite this in Lisp.
*
* To respect a certain idiom, greedy is the default
* choice-mode. Minimalist mode tries to get as few
* characters as possible, while lazy just gives you
* the first thing it finds.
* (Are they actually the same ?)
*
* Bl0ckeduser
* August 2012
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int VERBOSE = 0;
/*
* Some literature
*
* "Each state of a nondeterministic automaton contains a list of `Accept'
* values, a list of epsilon transitions (an epsilon transition represents a
* transition to another state that can be made without reading a character)
* and a list of transitions qualified with a character predicate (the
* transition can only be made to the given state on input of a character
* permitted by the predicate). Although a list of `Accept' values is provided
* for, in actual fact each state will have zero or one of them (the `Maybe'
* type is not used because the flexibility offered by the list representation
* is useful)."
* Source: NFA.hs, part of Alex - (c) Chris Dornan 1995-2000, Simon Marlow 2003
* http://hackage.haskell.org/packages/archive/alex-meta/latest/doc/html/src/NFA.html
*/
/*
* Programming notes / to-do's
*
* - What does '.+?' mean ? GNU egrep seems to do it
* differently. Is it BS ? Or am I mis-compiling ?
* Should I check for stuff that makes no sense ?
* Is there a systematic, clean way to do this ?
*
* - NFAs / backtracking are said to be a poor way to
* do regex. DFAs are said to be better. DFA next
* time, I guess. (As restated in header above).
*/
/*
* This is what gets returned when
* a string is found to match a
* regex pattern. The data therein
* is mostly useful for a tokenizer
* wanting the length of the match,
* since the wannabe egrep only needs to
* know whether match of its single loaded
* pattern succeded. If token == 0,
* no match was made.
*/
typedef struct match_struct {
/* Accept number.*/
int token;
/* Last character in the match. */
char* pos;
} match_t;
/*
* This is the NFA node data structure.
* It includes outgoing edges as pointers
* to other structures, and an accept
* number.
*/
typedef struct trie_struct {
/*
* The evaluator will try all these edges:
*
* - map[c] : standard character edge,
* where c is the current
* character
*
* - link[0], link[1], ..., link[links - 1]:
* these are in fact epsilon edges
*
* - caret (see below)
*
* When several choices exist, the evaluator
* will go through all possible paths and then
* choose the "best" successful match according
* to the chosen choice-mode (greed/lazy/mini).
*/
struct trie_struct* map[256];
struct trie_struct** link;
int links;
int link_alloc;
/*
* Special "character" edge for the
* beggining of the string, created
* by the '^' meta-character.
*/
struct trie_struct* caret;
/*
* Evaluating a matching pattern eventually
* leads to a node with valid_token set
* to the a number associated with the originating
* regex. Other nodes have this set to 0.
*/
int valid_token;
} trie;
void traverse(trie *);
void printout(void);
void pic_printout(void);
void clear_trie_list(void);
/* choice-modes */
enum {
LAZY = 0,
MINIMALIST,
GREEDY
};
void smartass_putchar(int c)
{
if (!isprint(c))
printf("[0x%x]", c);
else
putchar(c);
}
void fail(char* mesg)
{
fprintf(stderr, "error: %s\n", mesg);
exit(1);
}
void sanity_requires(int exp)
{
if(!exp)
fail("that doesn't make any sense");
}
void add_link(trie* from, trie* to)
{
int i;
/* check for redundant link */
for (i = 0; i < from->links; i++)
if (from->link[i] == to)
return;
if (++from->links > from->link_alloc)
from->link = realloc(from->link, (from->link_alloc += 5) * sizeof(trie *));
if (!from->link)
fail("link array allocation");
from->link[from->links - 1] = to;
}
trie* new_trie()
{
trie* t = malloc(sizeof(trie));
int i;
if (!t)
fail("trie allocation");
t->link = malloc(10 * sizeof(trie*));
if (!t->link)
fail("trie links");
t->links = 0;
t->link_alloc = 10;
for (i = 0; i < 10; i++)
t->link[i] = NULL;
t->valid_token = 0;
t->caret = NULL;
for(i = 0; i < 256; i++)
t->map[i] = NULL;
return t;
}
/*
* Compiler routine.
*
* Creates and adds the edges and nodes
* for regex pattern 'tok' to the
* starting node 't', as well as
* an accept state identified by
* number 'key'.
*/
int add_token(trie* t, char* tok, int key)
{
int i, j;
int k;
int c;
trie *hist[1024];
trie *next;
int len = strlen(tok) + 1;
char* new_tok;
int dollar = 0;
int n;
new_tok = malloc(len + 32);
if (!new_tok)
fail("buffer allocation");
strcpy(new_tok, tok);
/* Rewrite $ as terminating null */
for (i = 0; i < len; ++i) {
switch (new_tok[i]) {
case '\\':
if (!new_tok[++i])
fail("backslash expected char");
break;
case '$':
new_tok[i] = 0;
len = i + 1;
dollar = 1;
}
}
/* Otherwise, make the terminating null
* optional to allow for sub-matches
*/
if (!dollar)
new_tok[len++] = '?';
for (i = 0, n = 0; c = new_tok[i], i < len && t; ++i)
{
/*
* 'n' is node index, which is distinct
* from source regex character index ('i') because
* meta-characters do not create new nodes.
*
* Although matching traverses the tree in a
* messy, complicated fashion due to backtracking,
* which would make a node index nonsensical for it,
* compilation creates the nodes in a simple, linear
* fashion (this loop) and has to refer to previous nodes
* when compiling meta-characters.
*/
hist[n] = t;
switch (c) {
case '?': /* optional character */
sanity_requires(n > 0);
add_link(hist[n - 1], t);
break;
case '+': /* character can repeat
any number of times */
sanity_requires(n > 0);
add_link(t, hist[n - 1]);
break;
case '*': /* optional character
with repetition allowed */
sanity_requires(n > 0);
add_link(t, hist[n - 1]);
add_link(hist[n- 1], t);
break;
case '.': /* any normal character */
next = new_trie();
for(j = 0; j < 256; j++)
t->map[j] = next;
t = next;
++n;
break;
case '^': /* beggining of line */
if (!t->caret)
t->caret = new_trie();
t = t->caret;
++n;
break;
case '\\': /* escape to normal text */
if (++i == len)
fail("backslash expected char");
c = new_tok[i];
default: /* normal text */
if (!t->map[c])
t->map[c] = new_trie();
t = t->map[c];
++n;
}
}
free(new_tok);
/*
* We have reached the accept-state node;
* mark it as such.
*/
t->valid_token = key;
}
match_t match_setup(trie* t)
{
clear_trie_list();
traverse(t);
}
/*
* NFA trie evaluator, with backtracking,
* implementation-time hacks, parameters and all
* (aka bloat)
*/
match_t match(trie* t, char* full_tok, char* tok, int btm, int vd, int mode, trie* led)
{
int i = 0;
int j;
char c;
/* choice according to mode */
int record;
match_t m;
match_t ml[10];
int mc = 0;
match_t* choice;
extern int trie_index(trie* t);
int len = strlen(tok) + 1;
int init = btm;
i = 0; do
{
c = tok[i];
/*
* Give some info on the current
* character and node for debugging
* purposes
*/
if (VERBOSE) {
if (init) {
printf("[fork %d:%d - c=", vd, init);
smartass_putchar(c);
printf(",\tn=%d] ", trie_index(t));
} else {
printf("At node %d; ", trie_index(t), t);
printf("have ");
smartass_putchar(c);
printf(" (abs. index %d)\n", &tok[i] - full_tok);
}
}
/* Follow caret edge */
if (&tok[i] == full_tok && t->caret)
t = t->caret;
if (t->links && !btm)
{
/*
* Ambiguous. Try all possibilites by "forking"
* (informal sense).
*/
if (VERBOSE)
printf("Ambiguous fork; trying all edges\n");
/* Try character edge if it exists */
if (t->map[c])
if ((m = match(t, full_tok, tok + i, 1, vd + 1, mode, led)).token)
ml[mc++] = m;
/* Try all the epsilon nodes */
for (j = 0; j < t->links; j++)
if ((m = match(t, full_tok, tok + i, j + 2, vd + 1, mode, led)).token)
ml[mc++] = m;
/*
* If there are multiple matches, we
* choose based on the mode.
*/
if (mc > 1) {
switch (mode) {
/* Just give the first match */
case LAZY:
return *ml;
/* Give the shortest match */
case MINIMALIST:
record = strlen(full_tok) + 1;
choice = NULL;
for (j = 0; j < mc; j++)
if (ml[j].pos - full_tok <= record) {
choice = &ml[j];
record = ml[j].pos - full_tok;
}
if (!choice) fail("lazy choice");
return *choice;
/* Give the longest match */
case GREEDY:
record = 0;
choice = NULL;
for (j = 0; j < mc; j++)
if (ml[j].pos - full_tok >= record) {
choice = &ml[j];
record = ml[j].pos - full_tok;
}
if (!choice) fail("greedy choice");
return *choice;
default:
fail("match() call mode number");
}
}
else if (mc == 1) /* No ambiguity, no time wasted choosing */
return *ml;
else /* For non-matches (they are never pushed
to the decision list) */
return m;
}
if (btm <= 1 && t->map[c]) {
if (VERBOSE) {
printf("Following ");
smartass_putchar(c);
printf(" edge\n");
}
t = t->map[c];
/*
* Does this have to fork ???
*/
if (t->valid_token) break;
++i;
goto valid;
}
else if (btm > 1) {
if (VERBOSE)
printf("Trying to follow epsilon edge %d\n", btm - 2 + 1);
/* prevents infinite loops */
if (!led || t->link[btm - 2] != led) {
led = t; /* Last Epsilon Departure
node */
t = t->link[btm - 2];
goto valid;
} else {
if (VERBOSE)
printf("... won't, I just was there.\n");
}
}
break;
/* The fork path choice is only
* relevant for the first ambiguous
* choice after the fork
*/
valid: if (btm)
btm = 0;
} while (i < len && t);
if (t && t->valid_token) {
m.token = t->valid_token;
m.pos = tok + i;
if (VERBOSE)
printf("Matched !\n");
} else {
m.token = 0;
if (VERBOSE)
printf("No good\n");
}
return m;
}
char* theregex;
int main(int argc, char** argv)
{
trie* t = new_trie();
match_t m;
/* tokenizer shell */
char buf[1024];
char cmd[1024];
char nam[1024];
char expnames[1024][1024];
int i = 0, j =0;
int mod = GREEDY; /* a Un*x idiom, innit ? */
/* wannabe-egrep */
int wgr = 0;
char *p;
if (argc > 1 && !strcmp(argv[1], "--wgr")) {
/* wannabe-egrep mode */
wgr = 1;
if (argc == 2)
add_token(t, theregex = "", 1);
else
add_token(t, theregex = argv[2], 1);
}
for (i = 0; i < argc; i++)
if (!strcmp(argv[i], "--show"))
{
traverse(t);
printout();
return;
} else if (!strcmp(argv[i], "--pic"))
{
traverse(t);
pic_printout();
return;
}
strcpy(*expnames, "(failure)");
while (1) {
if(!wgr) {
printf("%c ", '%');
fflush(stdout);
}
fgets(buf, 1024, stdin);
if (feof(stdin)) break;
/* the meat of the wannabe-egrep */
if (wgr) {
/* truncate newline */
for (p = buf; *p; ++p)
if (*p == '\n')
*p = 0;
/*
* Traverse the tree and assign pretty
* numbers to all encountered node pointers
* to make debugging less painful.
* (You get to hear about "node 2" instead of
* "node pointed to by 0x129235". This scheme
* is also used by the picture mode, where
* you would e.g. see a circle with the number 2).
*/
match_setup(t);
/*
* The loop pushes forward the
* substring index after every
* failed match attempt.
*/
p = buf;
do {
if (VERBOSE && p != buf)
printf("\n--------\n"
"trying submatch - index %d\n",
(int)(p - buf));
if (match(t, buf, p, 0, 0, GREEDY, NULL).token) {
/* The line matches, so print it */
puts(buf);
break;
}
} while (*p++);
} else {
/* regex tokenizer shell mode */
*nam = *cmd = 0;
if(sscanf(buf, "%s %s", cmd, nam) <= 0) {
puts("sorry, I need a command and an argument");
continue;
}
/* add a new regex token */
if(!strcmp(cmd, "add")) {
add_token(t, nam, ++i);
strcpy(expnames[i], nam);
printf("ok\n", i);
}
/* set choice-mode */
else if(!strcmp(cmd, "mode")) {
if (!strcmp(nam, "lazy"))
mod = LAZY;
else if (!strcmp(nam, "min"))
mod = MINIMALIST;
else if (!strcmp(nam, "greedy"))
mod = GREEDY;
else if (!*nam || *nam=='?')
puts(mod == LAZY ? "LAZY" :
mod == GREEDY ? "GREEDY":
"META_LAZY");
else puts("???");
}
/* flip verbosity */
else if(!strcmp(cmd, "verbose")) {
VERBOSE = !!!VERBOSE;
}
/* try to match a string */
else if(!strcmp(cmd, "match")) {
if (i == 1) {
printf("no matching patterns installed\n");
continue;
}
match_setup(t);
m = match (t, nam, nam, 0, 0, mod, NULL);
if (m.token == 0) {
puts("no match");
continue;
}
printf("pattern '%s': ", expnames[m.token]);
putchar('\'');
for (j = 0; j < (int)(m.pos - nam); j++)
putchar(nam[j]);
putchar('\''); putchar('\n');
}
else printf(" ??? \n");
}
}
if(!wgr)
putchar('\n');
return 0;
}
/* Down here we have the
* pretty-numbers-for-pointers code
* and routines that display either
* simple text representation of a compiled
* tree or a generate pic source for
* a nice picture of the tree (with arrows,
* circles, and all).
*/
trie* tries[1024] = {NULL};
int tc = 1;
int trie_index(trie* t)
{
int i;
for (i = 1; i < tc; i++)
if (tries[i] == t)
return i;
return 0;
}
int trie_in_list(trie* t)
{
int i;
for (i = 1; i < tc; i++)
if (tries[i] == t)
return 1;
return 0;
}
void clear_trie_list(void)
{
tc = 1;
}
/* recursively look for all existing
* tree-nodes and assign unique pretty
* natural numbers to them, for
* human reader / picture viewer
* convenience.
*/
void traverse(trie* t)
{
int i, j;
int e;
if (!trie_in_list(t))
tries[tc++] = t;
if (t->caret && !trie_in_list(t->caret))
traverse(t->caret);
for (i = 0; i < 256; i++)
if (t->map[i] && !trie_in_list(t->map[i])) {
tries[tc++] = t->map[i];
traverse(t->map[i]);
}
for (i = 0; i < t->links; i++)
if (!trie_in_list(t->link[i]))
traverse(t->link[i]);
}
/* pointer -> pretty number */
int index_in_list(trie* t)
{
int i;
for (i = 1; i < tc; i++)
if (tries[i] == t)
return i;
fail("trie list lookup");
}
void printout() /* text representation of trie */
{
int i, j;
int chk;
trie* tt;
trie* t;
for (i = 1; i < tc; i++)
{
printf("%d. ", i);
t = tries[i];
if (!t) break;
if (t->links) /* [epsilon edge destination node number] */
for (j = 0; j < t->links; j++)
printf (" [%d] ", index_in_list(t->link[j]));
if (t->valid_token) /* ::accept-state number */
printf(" ::%d ", t->valid_token);
if (t->caret) /* caret alphabet edge -> dest. node num. */
printf(" caret->%d", index_in_list(t->caret));
chk = 1;
tt = t->map[0];
for (j = 0; j < 256; j++)
if (t->map[j] != tt || !t->map[j])
{
chk = 0;
break;
}
if (chk) { /* all alphabet edges -> dest node num. */
printf(" .-> %d ", index_in_list(t->map[0]));
} else /* specific characters -> dest nonde nums. */
for (j = 0; j < 256; j++)
if (t->map[j])
printf(" %c->%d ", j, index_in_list(t->map[j]));
putchar('\n');
}
}
/* makes pic source for a picture of the tree */
void pic_printout()
{
/* registers, buffers, herp derp derp */
int i, j;
int chk;
trie* tt;
trie* t;
int cc;
int no;
char buf[1024], buf2[1024];
struct {
char links[1024]; /* caption listing
the characters that
share this edge */
int lc; /* (count of above) */
} refs[1024]; /* list of alphabet edges for this node,
indexed by destination node. */
puts(".PS");
/* Write out the originating regex pattern
and then move away to make space for the
actual picture */
printf("\"%s\"\n", theregex);
puts("down; move 1.0; right");
/* Draw circles for all of the
nodes */
for (i = 1; i < tc; i++)
printf("circle \"%d\"; move 0.75\n", i);
/* Find the accept node and draw a
nested concentric circle there */
for (i = 1; i < tc; i++)
if (tries[i]->valid_token)
printf("circle rad 0.2 at %dth circle\n", i);
/* Mark the starting node with a short arrow */
printf("arrow from 1th circle - (0.5, 0) to 1th circle .w\n");
/* For each node ... */
for (i = 1; i < tc; i++)
{
t = tries[i];
if (!t) break;
if (t->links) {
/* ... draw all its epsilon nodes as
dashed, arrow-tailed arcs, or as
straight dashed lines
if we are going backwards
to the previous node
(arbitrary graphics decision) */
for (j = 0; j < t->links; j++) {
no = index_in_list(t->link[j]);
if (i < no) {
printf("arc -> dashed ");
printf("ccw ");
printf("from %dth circle .s to %dth circle .s chop\n",
i, no);
}
else if (i - no == 1)
printf("arrow dashed from %dth circle to %dth circle chop\n",
i, no);
else {
printf("arc -> dashed ");
printf("cw ");
printf("from %dth circle .s to %dth circle .s chop\n",
i, no);
}
}
}
/* Clear up the alphabet edges list */
for (j = 0; j < 1024; j++) {
refs[j].lc = 0;
strcpy(refs[j].links, "");
}
/* Add caret edges to the list.
They get the caption "start". */
if (t->caret) {
no = index_in_list(t->caret);
if (refs[no].lc++)
strcat(refs[no].links, ", ");
strcat(refs[no].links, "start");
}
/* Here, it gets a bit ugly. I check to
see if all the characters edges exist
and all point to the same node.
If so, we're dealing with a '.', and
it's wise to draw that specially.
*/
chk = 1;
tt = t->map[0];
for (j = 0; j < 256; j++)
if (t->map[j] != tt || !t->map[j])
{
chk = 0;
break;
}
if (chk) {
/* yes, it's the '.' special case */
no = index_in_list(t->map[0]);
if (refs[no].lc++)
strcat(refs[no].links, ", ");
strcat(refs[no].links, ".");
} else /* no, add all possible characters to their
destination-nodes' character lists */
for (j = 0; j < 256; j++)
if (t->map[j]) {
no = index_in_list(t->map[j]);
if (!isprint(j))
sprintf(buf2, "0x%x", j);
else
sprintf(buf2, "%c", j);
if (refs[no].lc++)
strcat(refs[no].links, ", ");
strcat(refs[no].links, buf2);
}
/* Draw the alphabet edges with their
character-list captions */
for (j = 1; j < 1024; j++) {
if (refs[j].lc)
{
printf("arc -> ");
if (i > j)
printf("ccw");
else
printf("cw");
printf(" from %dth circle .n to %dth circle .n chop\n",
i, j);
printf("\"%s\" at last arc + (0, 0.4)\n",
refs[j].links);
}
}
}
printf(".PE\n");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment