bl0ckeduser · August 26, 2012 21:36 · bl0ckeduser · Aug 27, 2012
diff --git a/wannabe-regex.c b/wannabe-regex.c
 /*
 *  August 26, 2012 error fixes:
 *
 *  Fixed * compilation to work properly with
 *  proper epsilon nodes. Fixed evaluator infinite
 *  loop check to work well with this.
 *  Removed large stack allocation in main().
 */

 /*
 * Wannabe regex (egrep-style)
 * Wannabe automatic regex tokenizer
 * Wannabe egrep command (./a.out --wgr 'pattern')
 * 
 * The wannabe egrep can make a picture of the compiled NFA
 * this way:
 *   ./a.out --wgr 'pattern' --pic | pic | troff -ms
 *
 * Supported 'special' characters:
 *	? + * . ^ $ \
 * the backslash is to escape special status.
 * For their exact meaning, see the compiler
 * routine, add_token().
 * 
 * (troff output has to be converted to some
 *  useful format: e.g. on my computer I pipe troff
 *  to "grops" [provided by free package groff]
 *  to get postscript)
 *
 * Now with support for multiple epsilon edges
 * as well as lazy/greedy/minimalist choice-mode
 * choice. (Yeah, you get to choose how the program
 * chooses). Please note that it uses NFAs, which
 * are inferior to DFAs. Nasty backtracking involved.
 * There's some algorithm with matrices and stuff to
 * convert, but that'll be for another time. I'm not
 * even a CS student or professional or teacher or
 * anything, I'm a kid, give me a little time.
 *
 * One day I should try to rewrite this in Lisp.
 *
 * To respect a certain idiom, greedy is the default
 * choice-mode. Minimalist mode tries to get as few
 * characters as possible, while lazy just gives you
 * the first thing it finds. 
 * (Are they actually the same ?)
 * 
 * Bl0ckeduser
 * August 2012
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 int VERBOSE = 0;

 /*
 *	Some literature
 *
 * "Each state of a nondeterministic automaton contains a list of `Accept'
 * values, a list of epsilon transitions (an epsilon transition represents a
 * transition to another state that can be made without reading a character)
 * and a list of transitions qualified with a character predicate (the
 * transition can only be made to the given state on input of a character
 * permitted by the predicate).  Although a list of `Accept' values is provided
 * for, in actual fact each state will have zero or one of them (the `Maybe'
 * type is not used because the flexibility offered by the list representation
 * is useful)."
 *  Source: NFA.hs, part of Alex - (c) Chris Dornan 1995-2000, Simon Marlow 2003
 *  http://hackage.haskell.org/packages/archive/alex-meta/latest/doc/html/src/NFA.html
 */

 /*
 *	Programming notes / to-do's
 *
 * - What does '.+?' mean ? GNU egrep seems to do it
 *   differently. Is it BS ? Or am I mis-compiling ?
 *   Should I check for stuff that makes no sense ?
 *   Is there a systematic, clean way to do this ?
 *
 * - NFAs / backtracking are said to be a poor way to
 *   do regex. DFAs are said to be better. DFA next
 *   time, I guess. (As restated in header above).
 */

 /* 
 * This is what gets returned when
 * a string is found to match a 
 * regex pattern. The data therein
 * is mostly useful for a tokenizer
 * wanting the length of the match,
 * since the wannabe egrep only needs to
 * know whether match of its single loaded
 * pattern succeded. If token == 0,
 * no match was made.
 */
 typedef struct match_struct {
 	/* Accept number.*/
 	int token;
 	
 	/* Last character in the match. */
 	char* pos;
 } match_t;

 /*
 * This is the NFA node data structure.
 * It includes outgoing edges as pointers
 * to other structures, and an accept
 * number.
 */
 typedef struct trie_struct {
 	/*
 	 * The evaluator will try all these edges:
 	 *
 	 * 	- map[c] : standard character edge,
 	 *		   where c is the current 
 	 *		   character
 	 *
 	 *	- link[0], link[1], ..., link[links - 1]:
 	 *	  these are in fact epsilon edges
 	 *
 	 *	- caret (see below)
 	 * 
 	 * When several choices exist, the evaluator
 	 * will go through all possible paths and then
 	 * choose the "best" successful match according 
 	 * to the chosen choice-mode (greed/lazy/mini).
 	 */
 	struct trie_struct* map[256];
 	struct trie_struct** link;
 	int links;
 	int link_alloc;

 	/*
 	 * Special "character" edge for the
 	 * beggining of the string, created
 	 * by the '^' meta-character.
 	 */
 	struct trie_struct* caret;

 	/*
 	 * Evaluating a matching pattern eventually
 	 * leads to a node with valid_token set
 	 * to the a number associated with the originating
 	 * regex. Other nodes have this set to 0.
 	 */
 	int valid_token;
 } trie;

 void traverse(trie *);
 void printout(void);
 void pic_printout(void);
 void clear_trie_list(void);

 /* choice-modes */
 enum {
 	LAZY = 0,
 	MINIMALIST,
 	GREEDY
 };

 void smartass_putchar(int c)
 {
 	if (!isprint(c))
 		printf("[0x%x]", c);
 	else
 		putchar(c);
 }

 void fail(char* mesg)
 {
 	fprintf(stderr, "error: %s\n", mesg);
 	exit(1);
 }

 void sanity_requires(int exp)
 {
 	if(!exp)
 		fail("that doesn't make any sense");
 }

 void add_link(trie* from, trie* to)
 {
 	int i;

 	/* check for redundant link */
 	for (i = 0; i < from->links; i++)
 		if (from->link[i] == to)
 			return;

 	if (++from->links > from->link_alloc)
 		from->link = realloc(from->link, (from->link_alloc += 5) * sizeof(trie *));

 	if (!from->link)
 		fail("link array allocation");

 	from->link[from->links - 1] = to;
 }

 trie* new_trie()
 {
 	trie* t = malloc(sizeof(trie));
 	int i;

 	if (!t)
 		fail("trie allocation");

 	t->link = malloc(10 * sizeof(trie*));
 	if (!t->link)
 		fail("trie links");
 	t->links = 0;
 	t->link_alloc = 10;
 	for (i = 0; i < 10; i++)
 		t->link[i] = NULL;

 	t->valid_token = 0;
 	t->caret = NULL;

 	for(i = 0; i < 256; i++)
 		t->map[i] = NULL;

 	return t;
 }

 /*
 * Compiler routine.
 * 
 * Creates and adds the edges and nodes
 * for regex pattern 'tok' to the
 * starting node 't', as well as
 * an accept state identified by
 * number 'key'.
 */
 int add_token(trie* t, char* tok, int key)
 {
 	int i, j;
 	int k;
 	int c;
 	trie *hist[1024];
 	trie *next;
 	int len = strlen(tok) + 1;
 	char* new_tok;
 	int dollar = 0;
 	int n;

 	new_tok = malloc(len + 32);
 	if (!new_tok)
 		fail("buffer allocation");
 	strcpy(new_tok, tok);

 	/* Rewrite $ as terminating null */
 	for (i = 0; i < len; ++i) {
 		switch (new_tok[i]) {
 			case '\\':
 				if (!new_tok[++i])
 					fail("backslash expected char");
 				break;
 			case '$':
 				new_tok[i] = 0;
 				len = i + 1;
 				dollar = 1;
 		}
 	}

 	/* Otherwise, make the terminating null
 	 * optional to allow for sub-matches
 	 */
 	if (!dollar)
 		new_tok[len++] = '?';

 	for (i = 0, n = 0; c = new_tok[i], i < len && t; ++i)
 	{
 		/*
 		 * 'n' is node index, which is distinct
 		 * from source regex character index ('i') because
 		 * meta-characters do not create new nodes.
 		 *
 		 * Although matching traverses the tree in a 
 		 * messy, complicated fashion due to backtracking,
 		 * which would make a node index nonsensical for it,
 		 * compilation creates the nodes in a simple, linear
 		 * fashion (this loop) and has to refer to previous nodes
 		 * when compiling meta-characters.
 		 */
 		hist[n] = t;

 		switch (c) {
 			case '?':		/* optional character */
 				sanity_requires(n > 0);
 				add_link(hist[n - 1], t);
 				break;
 			case '+':		/* character can repeat
 						   any number of times */
 				sanity_requires(n > 0);
 				add_link(t, hist[n - 1]);
 				break;
 			case '*':		/* optional character
 						   with repetition allowed */
 				sanity_requires(n > 0);

 				add_link(t, hist[++n] = new_trie());
 				t = hist[n];
 				add_link(hist[n - 1], t);
 				
 				add_link(t, hist[n - 2]);
 				add_link(hist[n - 2], t);

 				add_link(t, hist[++n] = new_trie());
 				t = hist[n];
 				add_link(hist[n - 1], t);

 				break;
 			case '.':		/* any normal character */
 				next = new_trie();
 				for(j = 0; j < 256; j++)
 					t->map[j] = next;
 				t = next;
 				++n;
 				break;
 			case '^':		/* beggining of line */
 				if (!t->caret)
 					t->caret = new_trie();
 				t = t->caret;
 				++n;
 				break;
 			case '\\':		/* escape to normal text */
 				if (++i == len)
 					fail("backslash expected char");
 				c = new_tok[i];
 			default:		/* normal text */
 				if (!t->map[c])
 					t->map[c] = new_trie();
 				t = t->map[c];
 				++n;
 		}
 	}

 	free(new_tok);

 	/* 
 	 * We have reached the accept-state node;
 	 * mark it as such.
 	 */
 	t->valid_token = key;
 }


 match_t match_setup(trie* t)
 {
 	clear_trie_list();
 	traverse(t);
 }

 /*
 * NFA trie evaluator, with backtracking, 
 * implementation-time hacks, parameters and all
 * (aka bloat)
 */
 match_t match(trie* t, char* full_tok, char* tok, int btm, int vd, int mode, trie* led, int ledi)
 {
 	int i = 0;
 	int j;
 	char c;

 	/* choice according to mode */
 	int record;
 	match_t m;
 	match_t ml[10];
 	int mc = 0;
 	match_t* choice;

 	extern int trie_index(trie* t);

 	int len = strlen(tok) + 1;
 	int init = btm;

 	i = 0; do
 	{
 		c = tok[i];

 		/* 
 		 * Give some info on the current
 		 * character and node for debugging
 		 * purposes
 		 */
 		if (VERBOSE) {
 			if (init) {
 				printf("[fork %d:%d - c=", vd, init);
 				smartass_putchar(c);
 				printf(",\tn=%d] ", trie_index(t));
 			} else {
 				printf("At node %d; ", trie_index(t), t);	
 				printf("have ");
 				smartass_putchar(c);
 				printf(" (abs. index %d)\n", &tok[i] - full_tok);
 			}
 		}

 		/* Follow caret edge */
 		if (&tok[i] == full_tok && t->caret)
 			t = t->caret;

 		if (t->links && !btm)
 		{
 			/*
 			 * Ambiguous. Try all possibilites by "forking"
 			 * (informal sense).
 			 */

 			if (VERBOSE)
 				printf("Ambiguous fork; trying all edges\n");

 			/* Try character edge if it exists */
 			if (t->map[c])
 				if ((m = match(t, full_tok, tok + i, 1, vd + 1, mode, led, ledi)).token)
 					ml[mc++] = m;

 			/* Try all the epsilon nodes */
 			for (j = 0; j < t->links; j++)
 				if ((m = match(t, full_tok, tok + i, j + 2, vd + 1, mode, led, ledi)).token)
 					ml[mc++] = m;

 			/*
 			 * If there are multiple matches, we
 			 * choose based on the mode.
 			 */
 			if (mc > 1) {
 				switch (mode) {
 					/* Just give the first match */
 					case LAZY:
 						return *ml;

 					/* Give the shortest match */
 					case MINIMALIST:
 						record = strlen(full_tok) + 1;
 						choice = NULL;
 						for (j = 0; j < mc; j++)
 							if (ml[j].pos - full_tok <= record) {
 								choice = &ml[j];
 								record = ml[j].pos - full_tok;
 							}
 						if (!choice) fail("lazy choice");
 						return *choice;

 					/* Give the longest match */
 					case GREEDY:
 						record = 0;
 						choice = NULL;
 						for (j = 0; j < mc; j++)
 							if (ml[j].pos - full_tok >= record) {
 								choice = &ml[j];
 								record = ml[j].pos - full_tok;
 							}
 						if (!choice) fail("greedy choice");
 						return *choice;

 					default:
 						fail("match() call mode number");
 				}
 			}
 			else if (mc == 1) /* No ambiguity, no time wasted choosing */
 				return *ml;
 			else		/* For non-matches (they are never pushed
 					   to the decision list) */
 				return m;
 		}

 		if (btm <= 1 && t->map[c]) {
 			if (VERBOSE) {
 				printf("Following ");
 				smartass_putchar(c);
 				printf(" edge\n");
 			}

 			t = t->map[c];

 			/* 
 			 * Does this have to fork ???
 			 */
 			if (t->valid_token) break;

 			++i;
 			goto valid;
 		}
 		else if (btm > 1) {

 			if (VERBOSE)
 				printf("Trying to follow epsilon edge %d\n", btm - 2 + 1);

 			/* prevents infinite loops */
 			if (!(led && t->link[btm - 2] == led
 			    && (int)(&tok[i] - full_tok) == ledi)) {
 				led = t;	/* Last Epsilon Departure
 						   node */
 				ledi = (int)(&tok[i] - full_tok);
 				t = t->link[btm - 2];
 				goto valid;
 			} else {
 				if (VERBOSE)
 					printf("... won't, I just was there.\n");
 			}
 		}

 		break;

 		/* The fork path choice is only
 		 * relevant for the first ambiguous
 		 * choice after the fork
 		 */
 valid:		if (btm)
 			btm = 0;

 	} while (i < len && t);

 	if (t && t->valid_token) {
 		m.token = t->valid_token;
 		m.pos = tok + i;
 		if (VERBOSE)
 			printf("Matched !\n");
 	} else {
 		m.token = 0;
 		if (VERBOSE)
 			printf("No good\n");
 	}

 	return m;
 }

 char* theregex;

 int main(int argc, char** argv)
 {
 	trie* t = new_trie();
 	match_t m;

 	/* tokenizer shell */
 	char buf[1024];
 	char cmd[1024];
 	char nam[1024];
 	char** expnames = NULL;
 	int i = 0, j =0;
 	int mod = GREEDY;	/* a Un*x idiom, innit ? */

 	/* wannabe-egrep */
 	int wgr = 0;
 	char *p;

 	if (!(expnames = malloc(1024 * sizeof(char *))))
 		fail("malloc");

 	for (i = 0; i < 1024; i++)
 		if (!(expnames[i] = malloc(1024)))
 			fail("malloc");


 	if (argc > 1 && !strcmp(argv[1], "--wgr")) {
 		/* wannabe-egrep mode */
 		wgr = 1;
 		if (argc == 2)
 			add_token(t, theregex = "", 1);
 		else
 			add_token(t, theregex = argv[2], 1);
 	}

 	for (i = 0; i < argc; i++)
 		if (!strcmp(argv[i], "--show"))
 		{
 			traverse(t);
 			printout();
 			return;
 		} else if (!strcmp(argv[i], "--pic"))
 		{
 			traverse(t);
 			pic_printout();
 			return;
 		}

 	strcpy(*expnames, "(failure)");

 	while (1) {
 		if(!wgr) {
 			printf("%c ", '%');
 			fflush(stdout);
 		}

 		fgets(buf, 1024, stdin);
 		if (feof(stdin)) break;

 		/* the meat of the wannabe-egrep */
 		if (wgr) {

 			/* truncate newline */
 			for (p = buf; *p; ++p)
 				if (*p == '\n')
 					*p = 0;

 			/* 
 			 * Traverse the tree and assign pretty
 			 * numbers to all encountered node pointers
 			 * to make debugging less painful.
 			 * (You get to hear about "node 2" instead of
 			 * "node pointed to by 0x129235". This scheme
 			 * is also used by the picture mode, where
 			 * you would e.g. see a circle with the number 2).
 			 */
 			match_setup(t);

 			/* 
 			 * The loop pushes forward the
 			 * substring index after every
 			 * failed match attempt.
 			 */
 			p = buf;
 			do {
 				if (VERBOSE && p != buf)
 					printf("\n--------\n"
 					       "trying submatch - index %d\n", 
 					       (int)(p - buf));

 				if (match(t, buf, p, 0, 0, GREEDY, NULL, 0).token) {
 					/* The line matches, so print it */
 					puts(buf);
 					break;
 				}

 			} while (*p++);

 		} else {
 			/* regex tokenizer shell mode */

 			*nam = *cmd = 0;
 			if(sscanf(buf, "%s %s", cmd, nam) <= 0) {
 				puts("sorry, I need a command and an argument");
 				continue;
 			}
 			/* add a new regex token */
 			if(!strcmp(cmd, "add")) {
 				add_token(t, nam, ++i);
 				strcpy(expnames[i], nam);
 				printf("ok\n", i);
 			}
 			/* set choice-mode */
 			else if(!strcmp(cmd, "mode")) {
 				if (!strcmp(nam, "lazy"))
 					mod = LAZY;
 				else if (!strcmp(nam, "min"))
 					mod = MINIMALIST;
 				else if (!strcmp(nam, "greedy"))
 					mod = GREEDY;
 				else if (!*nam || *nam=='?')
 					puts(mod == LAZY ? "LAZY" :
 						mod == GREEDY ? "GREEDY":
 							"META_LAZY");
 				else puts("???");
 			}
 			/* flip verbosity */
 			else if(!strcmp(cmd, "verbose")) {
 				VERBOSE = !!!VERBOSE;
 			}
 			/* try to match a string */
 			else if(!strcmp(cmd, "match")) {
 				if (i == 1) {
 					printf("no matching patterns installed\n");
 					continue;
 				}
 				match_setup(t);
 				m = match(t, nam, nam, 0, 0, mod, NULL, 0);
 				if (m.token == 0) {
 					puts("no match");
 					continue;
 				}
 				printf("pattern '%s': ", expnames[m.token]);
 				putchar('\'');
 				for (j = 0; j < (int)(m.pos - nam); j++)
 					putchar(nam[j]); 
 				putchar('\''); putchar('\n');
 			}
 			else printf(" ??? \n");
 		}
 	}

 	if(!wgr)
 		putchar('\n');

 	return 0;
 }

 /* Down here we have the
 * pretty-numbers-for-pointers code
 * and routines that display either
 * simple text representation of a compiled
 * tree or a generate pic source for 
 * a nice picture of the tree (with arrows,
 * circles, and all).
 */

 trie* tries[1024] = {NULL};
 int tc = 1;

 int trie_index(trie* t)
 {
 	int i;
 	for (i = 1; i < tc; i++)
 		if (tries[i] == t)
 			return i;
 	return 0;
 }

 int trie_in_list(trie* t)
 {
 	int i;
 	for (i = 1; i < tc; i++)
 		if (tries[i] == t)
 			return 1;
 	return 0;
 }

 void clear_trie_list(void)
 {
 	tc = 1;
 }

 /* recursively look for all existing
 * tree-nodes and assign unique pretty
 * natural numbers to them, for
 * human reader / picture viewer 
 * convenience.
 */
 void traverse(trie* t)
 {
 	int i, j;
 	int e;

 	if (!trie_in_list(t))
 		tries[tc++] = t;

 	if (t->caret && !trie_in_list(t->caret))
 		traverse(t->caret);

 	for (i = 0; i < 256; i++)
 		if (t->map[i] && !trie_in_list(t->map[i])) {
 			tries[tc++] = t->map[i];
 			traverse(t->map[i]);
 		}

 	for (i = 0; i < t->links; i++)
 		if (!trie_in_list(t->link[i]))
 			traverse(t->link[i]);
 }

 /* pointer -> pretty number */
 int index_in_list(trie* t)
 {
 	int i;
 	for (i = 1; i < tc; i++)
 		if (tries[i] == t)
 			return i;
 	fail("trie list lookup");
 }

 void printout()		/* text representation of trie */
 {
 	int i, j;
 	int chk;
 	trie* tt;
 	trie* t;

 	for (i = 1; i < tc; i++)
 	{
 		printf("%d. ", i);
 		t = tries[i];
 		if (!t) break;
 		if (t->links)	/* [epsilon edge destination node number] */
 			for (j = 0; j < t->links; j++)
 				printf (" [%d] ", index_in_list(t->link[j]));
 		if (t->valid_token)	/* ::accept-state number */
 			printf(" ::%d ", t->valid_token);
 		if (t->caret)		/* caret alphabet edge -> dest. node num. */
 			printf(" caret->%d", index_in_list(t->caret));

 		chk = 1;
 		tt = t->map[0];
 		for (j = 0; j < 256; j++)
 			if (t->map[j] != tt || !t->map[j])
 			{
 				chk = 0;
 				break;
 			}

 		if (chk) {	/* all alphabet edges -> dest node num. */
 			printf(" .-> %d ", index_in_list(t->map[0]));
 		} else		/* specific characters -> dest nonde nums. */
 			for (j = 0; j < 256; j++)
 				if (t->map[j])
 					printf(" %c->%d ", j, index_in_list(t->map[j]));
 		putchar('\n');
 	}
 }

 /* makes pic source for a picture of the tree */
 void pic_printout()
 {
 	/* registers, buffers, herp derp derp */
 	int i, j;
 	int chk;
 	trie* tt;
 	trie* t;
 	int cc;
 	int no;
 	char buf[1024], buf2[1024];

 	struct {
 		char links[1024];	/* caption listing
 					   the characters that
 					   share this edge */

 		int lc;			/* (count of above) */

 	} refs[1024];	/* list of alphabet edges for this node,
 			   indexed by destination node. */

 	puts(".PS");

 	/* Write out the originating regex pattern
 	   and then move away to make space for the
 	   actual picture */
 	printf("\"%s\"\n", theregex);
 	puts("down; move 1.0; right");

 	/* Draw circles for all of the
 	   nodes */
 	for (i = 1; i < tc; i++)
 		printf("circle \"%d\"; move 0.75\n", i);

 	/* Find the accept node and draw a
 	   nested concentric circle there */
 	for (i = 1; i < tc; i++)
 		if (tries[i]->valid_token)
 			printf("circle rad 0.2 at %dth circle\n", i);

 	/* Mark the starting node with a short arrow */
 	printf("arrow from 1th circle - (0.5, 0) to 1th circle .w\n");

 	/* For each node ... */
 	for (i = 1; i < tc; i++)
 	{
 		t = tries[i];
 		if (!t) break;
 		if (t->links) {
 			/* ... draw all its epsilon nodes as
 			   dashed, arrow-tailed arcs, or as
 			   straight dashed lines 
 			   if we are going backwards
 			   to the previous node 
 			   (arbitrary graphics decision) */
 			for (j = 0; j < t->links; j++) {
 				no = index_in_list(t->link[j]);
 				if (i < no) {
 					printf("arc -> dashed ");
 					printf("ccw ");
 					printf("from %dth circle .s to %dth circle .s chop\n",
 						i, no);
 				}
 				else if (i - no == 1)
 					printf("arrow dashed from %dth circle to %dth circle chop\n",
 						i, no);
 				else {
 					printf("arc -> dashed ");
 					printf("cw ");
 					printf("from %dth circle .s to %dth circle .s chop\n",
 						i, no);
 				}
 			}
 		}

 		/* Clear up the alphabet edges list */
 		for (j = 0; j < 1024; j++) {
 			refs[j].lc = 0;
 			strcpy(refs[j].links, "");
 		}

 		/* Add caret edges to the list.
 		   They get the caption "start". */
 		if (t->caret) {
 			no = index_in_list(t->caret);
 			if (refs[no].lc++)
 				strcat(refs[no].links, ", ");
 			strcat(refs[no].links, "start");
 		}

 		/* Here, it gets a bit ugly. I check to
 		   see if all the characters edges exist
 		   and all point to the same node.
 		   If so, we're dealing with a '.', and
 		   it's wise to draw that specially.
 		*/
 		chk = 1;
 		tt = t->map[0];
 		for (j = 0; j < 256; j++)
 			if (t->map[j] != tt || !t->map[j])
 			{
 				chk = 0;
 				break;
 			}

 		if (chk) {
 			/* yes, it's the '.' special case */
 			no = index_in_list(t->map[0]);
 			if (refs[no].lc++)
 				strcat(refs[no].links, ", ");
 			strcat(refs[no].links, ".");

 		} else	/* no, add all possible characters to their
 			   destination-nodes' character lists */
 			for (j = 0; j < 256; j++)
 				if (t->map[j]) {
 					no = index_in_list(t->map[j]);
 					if (!isprint(j))
 						sprintf(buf2, "0x%x", j);
 					else
 						sprintf(buf2, "%c", j);
 					if (refs[no].lc++)
 						strcat(refs[no].links, ", ");
 					strcat(refs[no].links, buf2);
 				}

 		/* Draw the alphabet edges with their
 		   character-list captions */
 		for (j = 1; j < 1024; j++) {
 			if (refs[j].lc)
 			{
 				printf("arc -> ");
 				if (i > j)
 					printf("ccw");
 				else
 					printf("cw");
 				printf(" from %dth circle .n to %dth circle .n chop\n",
 					i, j);
 				printf("\"%s\" at last arc + (0, 0.4)\n",
 					refs[j].links);
 			}
 		}
 	}

 	printf(".PE\n");
 }
	/*
	* August 26, 2012 error fixes:
	*
	* Fixed * compilation to work properly with
	* proper epsilon nodes. Fixed evaluator infinite
	* loop check to work well with this.
	* Removed large stack allocation in main().
	*/

	/*
	* Wannabe regex (egrep-style)
	* Wannabe automatic regex tokenizer
	* Wannabe egrep command (./a.out --wgr 'pattern')
	*
	* The wannabe egrep can make a picture of the compiled NFA
	* this way:
	* ./a.out --wgr 'pattern' --pic \| pic \| troff -ms
	*
	* Supported 'special' characters:
	* ? + * . ^ $ \
	* the backslash is to escape special status.
	* For their exact meaning, see the compiler
	* routine, add_token().
	*
	* (troff output has to be converted to some
	* useful format: e.g. on my computer I pipe troff
	* to "grops" [provided by free package groff]
	* to get postscript)
	*
	* Now with support for multiple epsilon edges
	* as well as lazy/greedy/minimalist choice-mode
	* choice. (Yeah, you get to choose how the program
	* chooses). Please note that it uses NFAs, which
	* are inferior to DFAs. Nasty backtracking involved.
	* There's some algorithm with matrices and stuff to
	* convert, but that'll be for another time. I'm not
	* even a CS student or professional or teacher or
	* anything, I'm a kid, give me a little time.
	*
	* One day I should try to rewrite this in Lisp.
	*
	* To respect a certain idiom, greedy is the default
	* choice-mode. Minimalist mode tries to get as few
	* characters as possible, while lazy just gives you
	* the first thing it finds.
	* (Are they actually the same ?)
	*
	* Bl0ckeduser
	* August 2012
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int VERBOSE = 0;

	/*
	* Some literature
	*
	* "Each state of a nondeterministic automaton contains a list of `Accept'
	* values, a list of epsilon transitions (an epsilon transition represents a
	* transition to another state that can be made without reading a character)
	* and a list of transitions qualified with a character predicate (the
	* transition can only be made to the given state on input of a character
	* permitted by the predicate). Although a list of `Accept' values is provided
	* for, in actual fact each state will have zero or one of them (the `Maybe'
	* type is not used because the flexibility offered by the list representation
	* is useful)."
	* Source: NFA.hs, part of Alex - (c) Chris Dornan 1995-2000, Simon Marlow 2003
	* http://hackage.haskell.org/packages/archive/alex-meta/latest/doc/html/src/NFA.html
	*/

	/*
	* Programming notes / to-do's
	*
	* - What does '.+?' mean ? GNU egrep seems to do it
	* differently. Is it BS ? Or am I mis-compiling ?
	* Should I check for stuff that makes no sense ?
	* Is there a systematic, clean way to do this ?
	*
	* - NFAs / backtracking are said to be a poor way to
	* do regex. DFAs are said to be better. DFA next
	* time, I guess. (As restated in header above).
	*/

	/*
	* This is what gets returned when
	* a string is found to match a
	* regex pattern. The data therein
	* is mostly useful for a tokenizer
	* wanting the length of the match,
	* since the wannabe egrep only needs to
	* know whether match of its single loaded
	* pattern succeded. If token == 0,
	* no match was made.
	*/
	typedef struct match_struct {
	/* Accept number.*/
	int token;

	/* Last character in the match. */
	char* pos;
	} match_t;

	/*
	* This is the NFA node data structure.
	* It includes outgoing edges as pointers
	* to other structures, and an accept
	* number.
	*/
	typedef struct trie_struct {
	/*
	* The evaluator will try all these edges:
	*
	* - map[c] : standard character edge,
	* where c is the current
	* character
	*
	* - link[0], link[1], ..., link[links - 1]:
	* these are in fact epsilon edges
	*
	* - caret (see below)
	*
	* When several choices exist, the evaluator
	* will go through all possible paths and then
	* choose the "best" successful match according
	* to the chosen choice-mode (greed/lazy/mini).
	*/
	struct trie_struct* map[256];
	struct trie_struct** link;
	int links;
	int link_alloc;

	/*
	* Special "character" edge for the
	* beggining of the string, created
	* by the '^' meta-character.
	*/
	struct trie_struct* caret;

	/*
	* Evaluating a matching pattern eventually
	* leads to a node with valid_token set
	* to the a number associated with the originating
	* regex. Other nodes have this set to 0.
	*/
	int valid_token;
	} trie;

	void traverse(trie *);
	void printout(void);
	void pic_printout(void);
	void clear_trie_list(void);

	/* choice-modes */
	enum {
	LAZY = 0,
	MINIMALIST,
	GREEDY
	};

	void smartass_putchar(int c)
	{
	if (!isprint(c))
	printf("[0x%x]", c);
	else
	putchar(c);
	}

	void fail(char* mesg)
	{
	fprintf(stderr, "error: %s\n", mesg);
	exit(1);
	}

	void sanity_requires(int exp)
	{
	if(!exp)
	fail("that doesn't make any sense");
	}

	void add_link(trie* from, trie* to)
	{
	int i;

	/* check for redundant link */
	for (i = 0; i < from->links; i++)
	if (from->link[i] == to)
	return;

	if (++from->links > from->link_alloc)
	from->link = realloc(from->link, (from->link_alloc += 5) * sizeof(trie *));

	if (!from->link)
	fail("link array allocation");

	from->link[from->links - 1] = to;
	}

	trie* new_trie()
	{
	trie* t = malloc(sizeof(trie));
	int i;

	if (!t)
	fail("trie allocation");

	t->link = malloc(10 * sizeof(trie*));
	if (!t->link)
	fail("trie links");
	t->links = 0;
	t->link_alloc = 10;
	for (i = 0; i < 10; i++)
	t->link[i] = NULL;

	t->valid_token = 0;
	t->caret = NULL;

	for(i = 0; i < 256; i++)
	t->map[i] = NULL;

	return t;
	}

	/*
	* Compiler routine.
	*
	* Creates and adds the edges and nodes
	* for regex pattern 'tok' to the
	* starting node 't', as well as
	* an accept state identified by
	* number 'key'.
	*/
	int add_token(trie* t, char* tok, int key)
	{
	int i, j;
	int k;
	int c;
	trie *hist[1024];
	trie *next;
	int len = strlen(tok) + 1;
	char* new_tok;
	int dollar = 0;
	int n;

	new_tok = malloc(len + 32);
	if (!new_tok)
	fail("buffer allocation");
	strcpy(new_tok, tok);

	/* Rewrite $ as terminating null */
	for (i = 0; i < len; ++i) {
	switch (new_tok[i]) {
	case '\\':
	if (!new_tok[++i])
	fail("backslash expected char");
	break;
	case '$':
	new_tok[i] = 0;
	len = i + 1;
	dollar = 1;
	}
	}

	/* Otherwise, make the terminating null
	* optional to allow for sub-matches
	*/
	if (!dollar)
	new_tok[len++] = '?';

	for (i = 0, n = 0; c = new_tok[i], i < len && t; ++i)
	{
	/*
	* 'n' is node index, which is distinct
	* from source regex character index ('i') because
	* meta-characters do not create new nodes.
	*
	* Although matching traverses the tree in a
	* messy, complicated fashion due to backtracking,
	* which would make a node index nonsensical for it,
	* compilation creates the nodes in a simple, linear
	* fashion (this loop) and has to refer to previous nodes
	* when compiling meta-characters.
	*/
	hist[n] = t;

	switch (c) {
	case '?': /* optional character */
	sanity_requires(n > 0);
	add_link(hist[n - 1], t);
	break;
	case '+': /* character can repeat
	any number of times */
	sanity_requires(n > 0);
	add_link(t, hist[n - 1]);
	break;
	case '': / optional character
	with repetition allowed */
	sanity_requires(n > 0);

	add_link(t, hist[++n] = new_trie());
	t = hist[n];
	add_link(hist[n - 1], t);

	add_link(t, hist[n - 2]);
	add_link(hist[n - 2], t);

	add_link(t, hist[++n] = new_trie());
	t = hist[n];
	add_link(hist[n - 1], t);

	break;
	case '.': /* any normal character */
	next = new_trie();
	for(j = 0; j < 256; j++)
	t->map[j] = next;
	t = next;
	++n;
	break;
	case '^': /* beggining of line */
	if (!t->caret)
	t->caret = new_trie();
	t = t->caret;
	++n;
	break;
	case '\\': /* escape to normal text */
	if (++i == len)
	fail("backslash expected char");
	c = new_tok[i];
	default: /* normal text */
	if (!t->map[c])
	t->map[c] = new_trie();
	t = t->map[c];
	++n;
	}
	}

	free(new_tok);

	/*
	* We have reached the accept-state node;
	* mark it as such.
	*/
	t->valid_token = key;
	}


	match_t match_setup(trie* t)
	{
	clear_trie_list();
	traverse(t);
	}

	/*
	* NFA trie evaluator, with backtracking,
	* implementation-time hacks, parameters and all
	* (aka bloat)
	*/
	match_t match(trie* t, char* full_tok, char* tok, int btm, int vd, int mode, trie* led, int ledi)
	{
	int i = 0;
	int j;
	char c;

	/* choice according to mode */
	int record;
	match_t m;
	match_t ml[10];
	int mc = 0;
	match_t* choice;

	extern int trie_index(trie* t);

	int len = strlen(tok) + 1;
	int init = btm;

	i = 0; do
	{
	c = tok[i];

	/*
	* Give some info on the current
	* character and node for debugging
	* purposes
	*/
	if (VERBOSE) {
	if (init) {
	printf("[fork %d:%d - c=", vd, init);
	smartass_putchar(c);
	printf(",\tn=%d] ", trie_index(t));
	} else {
	printf("At node %d; ", trie_index(t), t);
	printf("have ");
	smartass_putchar(c);
	printf(" (abs. index %d)\n", &tok[i] - full_tok);
	}
	}

	/* Follow caret edge */
	if (&tok[i] == full_tok && t->caret)
	t = t->caret;

	if (t->links && !btm)
	{
	/*
	* Ambiguous. Try all possibilites by "forking"
	* (informal sense).
	*/

	if (VERBOSE)
	printf("Ambiguous fork; trying all edges\n");

	/* Try character edge if it exists */
	if (t->map[c])
	if ((m = match(t, full_tok, tok + i, 1, vd + 1, mode, led, ledi)).token)
	ml[mc++] = m;

	/* Try all the epsilon nodes */
	for (j = 0; j < t->links; j++)
	if ((m = match(t, full_tok, tok + i, j + 2, vd + 1, mode, led, ledi)).token)
	ml[mc++] = m;

	/*
	* If there are multiple matches, we
	* choose based on the mode.
	*/
	if (mc > 1) {
	switch (mode) {
	/* Just give the first match */
	case LAZY:
	return *ml;

	/* Give the shortest match */
	case MINIMALIST:
	record = strlen(full_tok) + 1;
	choice = NULL;
	for (j = 0; j < mc; j++)
	if (ml[j].pos - full_tok <= record) {
	choice = &ml[j];
	record = ml[j].pos - full_tok;
	}
	if (!choice) fail("lazy choice");
	return *choice;

	/* Give the longest match */
	case GREEDY:
	record = 0;
	choice = NULL;
	for (j = 0; j < mc; j++)
	if (ml[j].pos - full_tok >= record) {
	choice = &ml[j];
	record = ml[j].pos - full_tok;
	}
	if (!choice) fail("greedy choice");
	return *choice;

	default:
	fail("match() call mode number");
	}
	}
	else if (mc == 1) /* No ambiguity, no time wasted choosing */
	return *ml;
	else /* For non-matches (they are never pushed
	to the decision list) */
	return m;
	}

	if (btm <= 1 && t->map[c]) {
	if (VERBOSE) {
	printf("Following ");
	smartass_putchar(c);
	printf(" edge\n");
	}

	t = t->map[c];

	/*
	* Does this have to fork ???
	*/
	if (t->valid_token) break;

	++i;
	goto valid;
	}
	else if (btm > 1) {

	if (VERBOSE)
	printf("Trying to follow epsilon edge %d\n", btm - 2 + 1);

	/* prevents infinite loops */
	if (!(led && t->link[btm - 2] == led
	&& (int)(&tok[i] - full_tok) == ledi)) {
	led = t; /* Last Epsilon Departure
	node */
	ledi = (int)(&tok[i] - full_tok);
	t = t->link[btm - 2];
	goto valid;
	} else {
	if (VERBOSE)
	printf("... won't, I just was there.\n");
	}
	}

	break;

	/* The fork path choice is only
	* relevant for the first ambiguous
	* choice after the fork
	*/
	valid: if (btm)
	btm = 0;

	} while (i < len && t);

	if (t && t->valid_token) {
	m.token = t->valid_token;
	m.pos = tok + i;
	if (VERBOSE)
	printf("Matched !\n");
	} else {
	m.token = 0;
	if (VERBOSE)
	printf("No good\n");
	}

	return m;
	}

	char* theregex;

	int main(int argc, char** argv)
	{
	trie* t = new_trie();
	match_t m;

	/* tokenizer shell */
	char buf[1024];
	char cmd[1024];
	char nam[1024];
	char** expnames = NULL;
	int i = 0, j =0;
	int mod = GREEDY; /* a Unx idiom, innit ? /

	/* wannabe-egrep */
	int wgr = 0;
	char *p;

	if (!(expnames = malloc(1024 * sizeof(char *))))
	fail("malloc");

	for (i = 0; i < 1024; i++)
	if (!(expnames[i] = malloc(1024)))
	fail("malloc");


	if (argc > 1 && !strcmp(argv[1], "--wgr")) {
	/* wannabe-egrep mode */
	wgr = 1;
	if (argc == 2)
	add_token(t, theregex = "", 1);
	else
	add_token(t, theregex = argv[2], 1);
	}

	for (i = 0; i < argc; i++)
	if (!strcmp(argv[i], "--show"))
	{
	traverse(t);
	printout();
	return;
	} else if (!strcmp(argv[i], "--pic"))
	{
	traverse(t);
	pic_printout();
	return;
	}

	strcpy(*expnames, "(failure)");

	while (1) {
	if(!wgr) {
	printf("%c ", '%');
	fflush(stdout);
	}

	fgets(buf, 1024, stdin);
	if (feof(stdin)) break;

	/* the meat of the wannabe-egrep */
	if (wgr) {

	/* truncate newline */
	for (p = buf; *p; ++p)
	if (*p == '\n')
	*p = 0;

	/*
	* Traverse the tree and assign pretty
	* numbers to all encountered node pointers
	* to make debugging less painful.
	* (You get to hear about "node 2" instead of
	* "node pointed to by 0x129235". This scheme
	* is also used by the picture mode, where
	* you would e.g. see a circle with the number 2).
	*/
	match_setup(t);

	/*
	* The loop pushes forward the
	* substring index after every
	* failed match attempt.
	*/
	p = buf;
	do {
	if (VERBOSE && p != buf)
	printf("\n--------\n"
	"trying submatch - index %d\n",
	(int)(p - buf));

	if (match(t, buf, p, 0, 0, GREEDY, NULL, 0).token) {
	/* The line matches, so print it */
	puts(buf);
	break;
	}

	} while (*p++);

	} else {
	/* regex tokenizer shell mode */

	nam = cmd = 0;
	if(sscanf(buf, "%s %s", cmd, nam) <= 0) {
	puts("sorry, I need a command and an argument");
	continue;
	}
	/* add a new regex token */
	if(!strcmp(cmd, "add")) {
	add_token(t, nam, ++i);
	strcpy(expnames[i], nam);
	printf("ok\n", i);
	}
	/* set choice-mode */
	else if(!strcmp(cmd, "mode")) {
	if (!strcmp(nam, "lazy"))
	mod = LAZY;
	else if (!strcmp(nam, "min"))
	mod = MINIMALIST;
	else if (!strcmp(nam, "greedy"))
	mod = GREEDY;
	else if (!nam \|\| nam=='?')
	puts(mod == LAZY ? "LAZY" :
	mod == GREEDY ? "GREEDY":
	"META_LAZY");
	else puts("???");
	}
	/* flip verbosity */
	else if(!strcmp(cmd, "verbose")) {
	VERBOSE = !!!VERBOSE;
	}
	/* try to match a string */
	else if(!strcmp(cmd, "match")) {
	if (i == 1) {
	printf("no matching patterns installed\n");
	continue;
	}
	match_setup(t);
	m = match(t, nam, nam, 0, 0, mod, NULL, 0);
	if (m.token == 0) {
	puts("no match");
	continue;
	}
	printf("pattern '%s': ", expnames[m.token]);
	putchar('\'');
	for (j = 0; j < (int)(m.pos - nam); j++)
	putchar(nam[j]);
	putchar('\''); putchar('\n');
	}
	else printf(" ??? \n");
	}
	}

	if(!wgr)
	putchar('\n');

	return 0;
	}

	/* Down here we have the
	* pretty-numbers-for-pointers code
	* and routines that display either
	* simple text representation of a compiled
	* tree or a generate pic source for
	* a nice picture of the tree (with arrows,
	* circles, and all).
	*/

	trie* tries[1024] = {NULL};
	int tc = 1;

	int trie_index(trie* t)
	{
	int i;
	for (i = 1; i < tc; i++)
	if (tries[i] == t)
	return i;
	return 0;
	}

	int trie_in_list(trie* t)
	{
	int i;
	for (i = 1; i < tc; i++)
	if (tries[i] == t)
	return 1;
	return 0;
	}

	void clear_trie_list(void)
	{
	tc = 1;
	}

	/* recursively look for all existing
	* tree-nodes and assign unique pretty
	* natural numbers to them, for
	* human reader / picture viewer
	* convenience.
	*/
	void traverse(trie* t)
	{
	int i, j;
	int e;

	if (!trie_in_list(t))
	tries[tc++] = t;

	if (t->caret && !trie_in_list(t->caret))
	traverse(t->caret);

	for (i = 0; i < 256; i++)
	if (t->map[i] && !trie_in_list(t->map[i])) {
	tries[tc++] = t->map[i];
	traverse(t->map[i]);
	}

	for (i = 0; i < t->links; i++)
	if (!trie_in_list(t->link[i]))
	traverse(t->link[i]);
	}

	/* pointer -> pretty number */
	int index_in_list(trie* t)
	{
	int i;
	for (i = 1; i < tc; i++)
	if (tries[i] == t)
	return i;
	fail("trie list lookup");
	}

	void printout() /* text representation of trie */
	{
	int i, j;
	int chk;
	trie* tt;
	trie* t;

	for (i = 1; i < tc; i++)
	{
	printf("%d. ", i);
	t = tries[i];
	if (!t) break;
	if (t->links) /* [epsilon edge destination node number] */
	for (j = 0; j < t->links; j++)
	printf (" [%d] ", index_in_list(t->link[j]));
	if (t->valid_token) /* ::accept-state number */
	printf(" ::%d ", t->valid_token);
	if (t->caret) /* caret alphabet edge -> dest. node num. */
	printf(" caret->%d", index_in_list(t->caret));

	chk = 1;
	tt = t->map[0];
	for (j = 0; j < 256; j++)
	if (t->map[j] != tt \|\| !t->map[j])
	{
	chk = 0;
	break;
	}

	if (chk) { /* all alphabet edges -> dest node num. */
	printf(" .-> %d ", index_in_list(t->map[0]));
	} else /* specific characters -> dest nonde nums. */
	for (j = 0; j < 256; j++)
	if (t->map[j])
	printf(" %c->%d ", j, index_in_list(t->map[j]));
	putchar('\n');
	}
	}

	/* makes pic source for a picture of the tree */
	void pic_printout()
	{
	/* registers, buffers, herp derp derp */
	int i, j;
	int chk;
	trie* tt;
	trie* t;
	int cc;
	int no;
	char buf[1024], buf2[1024];

	struct {
	char links[1024]; /* caption listing
	the characters that
	share this edge */

	int lc; /* (count of above) */

	} refs[1024]; /* list of alphabet edges for this node,
	indexed by destination node. */

	puts(".PS");

	/* Write out the originating regex pattern
	and then move away to make space for the
	actual picture */
	printf("\"%s\"\n", theregex);
	puts("down; move 1.0; right");

	/* Draw circles for all of the
	nodes */
	for (i = 1; i < tc; i++)
	printf("circle \"%d\"; move 0.75\n", i);

	/* Find the accept node and draw a
	nested concentric circle there */
	for (i = 1; i < tc; i++)
	if (tries[i]->valid_token)
	printf("circle rad 0.2 at %dth circle\n", i);

	/* Mark the starting node with a short arrow */
	printf("arrow from 1th circle - (0.5, 0) to 1th circle .w\n");

	/* For each node ... */
	for (i = 1; i < tc; i++)
	{
	t = tries[i];
	if (!t) break;
	if (t->links) {
	/* ... draw all its epsilon nodes as
	dashed, arrow-tailed arcs, or as
	straight dashed lines
	if we are going backwards
	to the previous node
	(arbitrary graphics decision) */
	for (j = 0; j < t->links; j++) {
	no = index_in_list(t->link[j]);
	if (i < no) {
	printf("arc -> dashed ");
	printf("ccw ");
	printf("from %dth circle .s to %dth circle .s chop\n",
	i, no);
	}
	else if (i - no == 1)
	printf("arrow dashed from %dth circle to %dth circle chop\n",
	i, no);
	else {
	printf("arc -> dashed ");
	printf("cw ");
	printf("from %dth circle .s to %dth circle .s chop\n",
	i, no);
	}
	}
	}

	/* Clear up the alphabet edges list */
	for (j = 0; j < 1024; j++) {
	refs[j].lc = 0;
	strcpy(refs[j].links, "");
	}

	/* Add caret edges to the list.
	They get the caption "start". */
	if (t->caret) {
	no = index_in_list(t->caret);
	if (refs[no].lc++)
	strcat(refs[no].links, ", ");
	strcat(refs[no].links, "start");
	}

	/* Here, it gets a bit ugly. I check to
	see if all the characters edges exist
	and all point to the same node.
	If so, we're dealing with a '.', and
	it's wise to draw that specially.
	*/
	chk = 1;
	tt = t->map[0];
	for (j = 0; j < 256; j++)
	if (t->map[j] != tt \|\| !t->map[j])
	{
	chk = 0;
	break;
	}

	if (chk) {
	/* yes, it's the '.' special case */
	no = index_in_list(t->map[0]);
	if (refs[no].lc++)
	strcat(refs[no].links, ", ");
	strcat(refs[no].links, ".");

	} else /* no, add all possible characters to their
	destination-nodes' character lists */
	for (j = 0; j < 256; j++)
	if (t->map[j]) {
	no = index_in_list(t->map[j]);
	if (!isprint(j))
	sprintf(buf2, "0x%x", j);
	else
	sprintf(buf2, "%c", j);
	if (refs[no].lc++)
	strcat(refs[no].links, ", ");
	strcat(refs[no].links, buf2);
	}

	/* Draw the alphabet edges with their
	character-list captions */
	for (j = 1; j < 1024; j++) {
	if (refs[j].lc)
	{
	printf("arc -> ");
	if (i > j)
	printf("ccw");
	else
	printf("cw");
	printf(" from %dth circle .n to %dth circle .n chop\n",
	i, j);
	printf("\"%s\" at last arc + (0, 0.4)\n",
	refs[j].links);
	}
	}
	}

	printf(".PE\n");
	}