Created
September 21, 2010 05:11
-
-
Save tj/589243 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if foo
if bar
something
else
something_else
else
baz
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if foo
if bar
something
else
something_else
else
baz
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Stream helper macros.
 *
 * Both macros expand in the caller's scope and silently rely on a
 * variable named `self` with a `stream` member; `undo` additionally
 * relies on a local `int c` holding the character to push back.
 *
 *   next  read one character from self->stream (fgetc result, may be EOF)
 *   undo  push `c` back onto self->stream (ungetc)
 */

#define next (fgetc(self->stream))
#define undo (ungetc(c, self->stream))
/*
 * Token kinds produced by the lexer.
 *
 * The enumerators are used as indexes into the tokenNames table,
 * so the order here must stay in sync with that table.
 */

typedef enum {
  tEOS,      // end of stream
  tSEP,      // newline at the same indentation level
  tIllegal,  // lexing error, message is stored in Lexer.err
  tIndent,   // indentation increased by one level
  tOutdent,  // indentation decreased by one level
  tId        // identifier, text is stored in Lexer.val.asString
} Token;
/*
 * Printable token names, indexed by Token value.
 *
 * The entries point at string literals, so both the strings and the
 * table itself are declared const.
 */

static const char *const tokenNames[] = {
  "EOS",
  "SEP",
  "Illegal",
  "Indent",
  "Outdent",
  "Id"
};
/*
 * Lexer state.
 */

typedef struct {
  short indents;     // current indentation depth, in levels
  short outdents;    // Outdent tokens still owed by Lexer_next()
  FILE *stream;      // input the lexer reads characters from
  char *filename;    // name printed by Lexer_dump()
  char *err;         // message describing the last tIllegal token
  union {
    char *asString;  // heap-allocated text of the last tId token
  } val;
} Lexer;
/* | |
* Initialize lexer with the given stream and filename. | |
*/ | |
void | |
Lexer_init(Lexer *self, FILE *stream, char *filename) { | |
self->stream = stream; | |
self->filename = filename; | |
self->indents = 0; | |
self->outdents = 0; | |
self->err = NULL; | |
} | |
/* | |
* EOF | |
* | Outdent | |
*/ | |
static Token | |
eof(Lexer *self) { | |
return self->indents-- | |
? tOutdent | |
: tEOS; | |
} | |
/* | |
* SEP | |
* | Indent | |
* | Outdent | |
*/ | |
static Token | |
indent(Lexer *self) { | |
int c, n = 0; | |
while (' ' == (c = next)) ++n; | |
undo; | |
if (n % 2) { | |
self->err = "Invalid indentation, must be multiple of 2."; | |
return tIllegal; | |
} | |
n /= 2; | |
// To many | |
if (n > self->indents + 1) { | |
self->err = "Invalid indentation, to many successive indents."; | |
return tIllegal; | |
// More | |
} else if (n > self->indents) { | |
++self->indents; | |
return tIndent; | |
// SEP | |
} else if (n == self->indents) { | |
return tSEP; | |
// Less | |
} else { | |
self->outdents = self->indents - n - 1; | |
self->indents = n; | |
return tOutdent; | |
} | |
} | |
/* | |
* Id :: [a-zA-Z_]+ | |
*/ | |
static Token | |
id(Lexer *self, int c) { | |
int i = 0; | |
char buf[32]; | |
buf[i++] = c; | |
while (isalpha(c = next) || c == '_') { | |
buf[i++] = c; | |
} | |
undo; | |
buf[i] = '\0'; | |
self->val.asString = strdup(buf); | |
return tId; | |
} | |
/* | |
* Return the next token. | |
*/ | |
Token | |
Lexer_next(Lexer *self) { | |
int c; | |
// Deferred outdents | |
if (self->outdents) { | |
--self->outdents; | |
return tOutdent; | |
} | |
switch (c = next) { | |
case EOF: | |
return eof(self); | |
case ' ': | |
return Lexer_next(self); | |
case '\n': | |
return indent(self); | |
default: | |
if (isalpha(c) || c == '_') return id(self, c); | |
return tIllegal; | |
} | |
} | |
/* | |
* Dump lexer tokens to stdout. | |
*/ | |
void | |
Lexer_dump(Lexer *self) { | |
Token tok; | |
printf("\x1b[33m%s\x1b[0m:\n", self->filename); | |
while (tEOS != (tok = Lexer_next(self))) { | |
switch (tok) { | |
case tIllegal: | |
printf(" \x1b[31m%s\x1b[0m: %s\n", tokenNames[tok], self->err); | |
break; | |
case tId: | |
printf(" \x1b[33m%s\x1b[0m: %s\n", tokenNames[tok], self->val.asString); | |
break; | |
default: | |
printf(" \x1b[32m%s\x1b[0m\n", tokenNames[tok]); | |
break; | |
} | |
} | |
} | |
/* | |
* Initialize and dump lexer from stdin. | |
*/ | |
int | |
main(int argc, const char **argv){ | |
Lexer lex; | |
Lexer_init(&lex, stdin, "stdin"); | |
Lexer_dump(&lex); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Id: if
Id: foo
Indent
Id: if
Id: bar
Indent
Id: something
Outdent
Id: else
Illegal: Invalid indentation, to many successive indents.
Id: something_else
Outdent
Id: else
Indent
Id: baz
Outdent
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment