Skip to content

Instantly share code, notes, and snippets.

@tj
Created September 21, 2010 05:11
Show Gist options
  • Save tj/589243 to your computer and use it in GitHub Desktop.
Save tj/589243 to your computer and use it in GitHub Desktop.
if foo
if bar
something
else
something_else
else
baz
if foo
if bar
something
else
something_else
else
baz
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Macros.
 *
 * Both macros expand in place and depend on names from the enclosing
 * function: `self` must be the Lexer pointer, and `undo` additionally
 * needs an int `c` holding the character to push back.
 *
 *   next - read one character from self->stream with fgetc().
 *   undo - push `c` back onto self->stream with ungetc().
 */
#define next fgetc(self->stream)
#define undo ungetc(c, self->stream)
/*
 * Token kinds produced by the lexer.
 *
 * The numeric values double as indices into the token name table, so the
 * order here must stay in step with it.
 */
typedef enum {
  tEOS = 0,   // end of the input stream
  tSEP,       // new line at the current indentation level
  tIllegal,   // input the lexer rejected
  tIndent,    // new line one level deeper
  tOutdent,   // one indentation level closed
  tId         // identifier
} Token;
/*
 * Token names, indexed by Token value.
 *
 * The entries are string literals, so both the characters and the table
 * itself are const: nothing may write through these pointers.
 */
static const char *const tokenNames[] = {
  "EOS",
  "SEP",
  "Illegal",
  "Indent",
  "Outdent",
  "Id"
};
/*
 * Lexer state.
 */
typedef struct {
// Current indentation depth in levels (indent() counts two spaces per level).
short indents;
// Outdent tokens still owed to the caller; Lexer_next() emits one per call
// before it reads any further input.
short outdents;
// Stream the characters are read from.
FILE *stream;
// Name of the input, printed by Lexer_dump().
char *filename;
// Error message set by indent() when it returns tIllegal; NULL after
// Lexer_init().
char *err;
// Value attached to the most recently returned token.
union {
// Identifier text for a tId token; heap-allocated by id().
char *asString;
} val;
} Lexer;
/*
 * Prepare `self` to read tokens from `stream`.
 *
 * `filename` is stored as given (not copied). The lexer starts at
 * indentation depth zero with no pending outdents and no error message.
 */
void
Lexer_init(Lexer *self, FILE *stream, char *filename) {
  // Nothing has been lexed yet, so there is no error and no nesting.
  self->err = NULL;
  self->outdents = 0;
  self->indents = 0;

  // Where the input comes from.
  self->filename = filename;
  self->stream = stream;
}
/*
 * Handle end of input.
 *
 *   EOF
 * | Outdent
 *
 * While indentation levels are still open, close one per call by returning
 * tOutdent. Once the depth is zero, return tEOS and leave the depth
 * untouched, so further calls keep returning tEOS instead of driving the
 * counter negative and producing spurious outdents.
 */
static Token
eof(Lexer *self) {
  if (self->indents > 0) {
    --self->indents;
    return tOutdent;
  }
  return tEOS;
}
/*
 * Measure the indentation of the line that follows a newline.
 *
 *   SEP
 * | Indent
 * | Outdent
 * | Illegal
 *
 * Indentation is counted in spaces, two per level. Lines holding nothing
 * but spaces are skipped: they carry no nesting information and must not
 * close (and then fail to reopen) the current levels.
 *
 * Returns tIndent when the line is exactly one level deeper, tSEP when the
 * depth is unchanged, and tOutdent when it is shallower. If a line closes
 * several levels at once, the first tOutdent is returned here and the
 * remainder is queued in self->outdents. On tIllegal, self->err holds the
 * reason and the depth is left unchanged.
 */
static Token
indent(Lexer *self) {
  int c, n;

  // Count leading spaces; if the line ends right after them it is blank,
  // so drop it and measure the next line instead.
  do {
    n = 0;
    while (' ' == (c = next)) ++n;
  } while ('\n' == c);

  // Hand the first content character back to the stream. When c is EOF,
  // ungetc() fails and leaves the stream unchanged, which is what we want.
  undo;

  if (n % 2) {
    self->err = "Invalid indentation, must be multiple of 2.";
    return tIllegal;
  }
  n /= 2;

  // Too many
  if (n > self->indents + 1) {
    self->err = "Invalid indentation, to many successive indents.";
    return tIllegal;
  }

  // More
  if (n > self->indents) {
    ++self->indents;
    return tIndent;
  }

  // SEP
  if (n == self->indents) return tSEP;

  // Less: this call reports one closed level, the rest are deferred.
  self->outdents = (short) (self->indents - n - 1);
  self->indents = (short) n;
  return tOutdent;
}
/*
 * Id :: [a-zA-Z_]+
 *
 * Scan an identifier whose first character `c` has already been read.
 * The character that ends the identifier is pushed back onto the stream.
 *
 * The text is collected in a heap buffer that grows as needed, so an
 * identifier of any length is accepted. On success the NUL-terminated
 * buffer is stored in self->val.asString and tId is returned; the buffer
 * is heap-allocated and must eventually be released with free().
 * If memory runs out, self->err is set and tIllegal is returned.
 */
static Token
id(Lexer *self, int c) {
  size_t len = 0, cap = 32;
  char *buf = malloc(cap);
  if (!buf) {
    self->err = "Out of memory.";
    return tIllegal;
  }

  buf[len++] = (char) c;
  while (isalpha(c = next) || '_' == c) {
    // Keep one byte spare for the terminating NUL.
    if (len + 1 == cap) {
      char *grown = realloc(buf, cap * 2);
      if (!grown) {
        free(buf);
        self->err = "Out of memory.";
        return tIllegal;
      }
      buf = grown;
      cap *= 2;
    }
    buf[len++] = (char) c;
  }
  undo;

  buf[len] = '\0';
  self->val.asString = buf;
  return tId;
}
/*
 * Return the next token.
 *
 * Outdents queued by a previous multi-level dedent are delivered first,
 * one per call, before any more input is read. Otherwise spaces between
 * tokens are skipped and the next character selects the token:
 * end of input, a newline (indentation handling) or an identifier.
 * Any other character yields tIllegal with self->err set.
 */
Token
Lexer_next(Lexer *self) {
  int c;

  // Deferred outdents
  if (self->outdents) {
    --self->outdents;
    return tOutdent;
  }

  // Skip spaces with a loop rather than one recursive call per space, so
  // a long run of spaces cannot exhaust the stack.
  do {
    c = next;
  } while (' ' == c);

  switch (c) {
    case EOF:
      return eof(self);
    case '\n':
      return indent(self);
    default:
      if (isalpha(c) || '_' == c) return id(self, c);
      // Always leave a message behind so callers never see a NULL or a
      // stale error from an earlier token.
      self->err = "Illegal character.";
      return tIllegal;
  }
}
/*
 * Dump lexer tokens to stdout.
 *
 * Prints the filename, then one line per token until tEOS. Illegal tokens
 * are shown with the lexer's error message and identifiers with their
 * text. Each identifier string is released once printed, and
 * self->val.asString is reset to NULL, so the dump does not leak one
 * allocation per identifier.
 */
void
Lexer_dump(Lexer *self) {
  Token tok;
  printf("\x1b[33m%s\x1b[0m:\n", self->filename);
  while (tEOS != (tok = Lexer_next(self))) {
    switch (tok) {
      case tIllegal:
        // Passing NULL for %s is undefined, so substitute a placeholder
        // when no message was recorded.
        printf(" \x1b[31m%s\x1b[0m: %s\n", tokenNames[tok],
               self->err ? self->err : "(no message)");
        break;
      case tId:
        printf(" \x1b[33m%s\x1b[0m: %s\n", tokenNames[tok], self->val.asString);
        // The dump is the last user of the identifier text.
        free(self->val.asString);
        self->val.asString = NULL;
        break;
      default:
        printf(" \x1b[32m%s\x1b[0m\n", tokenNames[tok]);
        break;
    }
  }
}
/*
 * Entry point: lex standard input and print every token.
 *
 * Command-line arguments are accepted but not used.
 */
int
main(int argc, const char **argv){
  Lexer lexer;

  (void) argc;
  (void) argv;

  Lexer_init(&lexer, stdin, "stdin");
  Lexer_dump(&lexer);
  return 0;
}
Id: if
Id: foo
Indent
Id: if
Id: bar
Indent
Id: something
Outdent
Id: else
Illegal: Invalid indentation, to many successive indents.
Id: something_else
Outdent
Id: else
Indent
Id: baz
Outdent
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment