Skip to content

Instantly share code, notes, and snippets.

@nilium
Created March 13, 2010 08:24
Show Gist options
  • Select an option

  • Save nilium/331200 to your computer and use it in GitHub Desktop.

Select an option

Save nilium/331200 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdbool.h>
#include <ctype.h>
#include <string.h>
#include "lexer.h"
/* ---- forward declarations for the tokenizer's internal helpers ---------- */

/* Ensure the token array can hold at least n tokens (grows by doubling). */
static void tokenizer_tokens_fit(tokenizer_t *tokr, size_t n);
/* Append a blank token to the array and return a pointer to it. */
static token_t *tokenizer_new_token(tokenizer_t *tokr);
/* Collapse tokens [from..to] into one token at index `from`. */
static token_t *tokenizer_merge_tokens(tokenizer_t *tokr, int from, int to);
/* Snapshot / restore the read cursor (used for backtracking). */
static token_mark_t tokenizer_mark(tokenizer_t *tokr);
static void tokenizer_reset(tokenizer_t *tokr, token_mark_t mark);
/* Character-level access: char under cursor, end test, advance, look. */
static char tokenizer_current(tokenizer_t *tokr);
static bool tokenizer_has_next(tokenizer_t *tokr);
static char tokenizer_next(tokenizer_t *tokr);
static char tokenizer_peek(tokenizer_t *tokr);
/* Skip spaces, tabs and carriage returns (newlines are tokens here). */
static void tokenizer_skip_whitespace(tokenizer_t *tokr);
/* Readers for the individual token classes. */
static token_t tokenizer_read_base_number(tokenizer_t *tokr);
static token_kind_t token_kind_for_single(const char *single, size_t len);
static token_t tokenizer_read_number(tokenizer_t *tokr);
token_t tokenizer_read_word(tokenizer_t *tokr); /* NOTE(review): non-static unlike its peers -- confirm intended */
static token_t tokenizer_read_string(tokenizer_t *tokr);
static token_t tokenizer_read_line_comment(tokenizer_t *tokr);
/* Initial size of the token array; tokenizer_tokens_fit doubles it after. */
const int TOKENIZER_INITIAL_CAPACITY = 500;
/* Display strings for token kinds, indexed directly by token_kind_t (see
 * lexer.h) -- the order here MUST stay in sync with that enum.  NULL entries
 * are kinds whose text comes from the source itself (identifiers and the
 * literal kinds).  TOK_EOF (-1) is special-cased in token_to_string and has
 * no entry. */
static const char *token_strings[] = {
    "INVALID",
    NULL,              /* TOK_ID: spelled by the source text itself */
    /* keywords */
    "End",
    "Function",
    "End Function",
    "Method",
    "End Method",
    "Type",
    "Extends",
    "Abstract",
    "Final",
    "No Debug",
    "End Type",
    "Extern",
    "End Extern",
    "Rem",
    "End Rem",
    "Float",
    "Double",
    "Byte",
    "Short",
    "Int",
    "String",
    "Object",
    "Local",
    "Global",
    "Const",
    "VarPtr",
    "Ptr",
    "Var",
    "Null",
    "Strict",
    "SuperStrict",
    "Framework",
    "Module",
    "ModuleInfo",
    "Import",
    "Include",
    "Private",
    "Public",
    "Or",
    "And",
    "Shr",
    "Shl",
    "Sar",
    "Mod",
    "Not",
    "While",
    "Wend",
    "End While",
    "For",
    "Next",
    "Until",
    "To",
    "EachIn",
    "Repeat",
    "Forever",
    "If",
    "End If",
    "Else",
    "Else If",
    "Then",
    "Select",
    "Case",
    "Default",
    "End Select",
    "Self",
    "Super",
    "Pi",
    "New",
    "Protocol",
    "End Protocol",
    "Auto",
    "Implements",
    /* punctuation and operators */
    ":",
    "?",
    "!",
    "#",
    ".",
    "..",
    "...",
    "@",
    "@@",
    "$",
    "%",
    "'",
    "(",
    ")",
    "[",
    "]",
    "{",
    "}",
    ">",
    "<",
    "=",
    "-",
    "+",
    "*",
    "^",
    "~",
    "`",
    "\\",
    "/",
    ",",
    ";",
    "|",
    "&",
    "\\n",             /* TOK_NEWLINE shown as a two-char escape */
    /* compound assignment operators (built by the pair-merge pass) */
    ":+",
    ":-",
    ":/",
    ":*",
    ":^",
    ":Shl",
    ":Shr",
    ":Sar",
    ":Mod",
    ":~",
    ":&",
    ":|",
    ":=",
    "--",
    "++",
    /* literal kinds: spelled by the source text itself */
    NULL,              /* TOK_NUMBER_LIT */
    NULL,              /* TOK_HEX_LIT */
    NULL,              /* TOK_BIN_LIT */
    NULL,              /* TOK_STRING_LIT */
    "' ...",           /* TOK_LINE_COMMENT */
    "Rem ... End Rem", /* TOK_BLOCK_COMMENT */
};
/* One entry in the single-lexeme lookup table: a token recognized from a
 * single word or character. */
typedef struct s_token_single {
    token_kind_t kind;
    const char *matches;  /* the lexeme to compare against */
    bool case_sensitive;  /* false for every entry below: BASIC-style keywords */
} token_single_t;
/* Scanned linearly by token_kind_for_single; first length+string match wins.
 * Terminated by the TOK_INVALID / NULL sentinel.  Multi-word forms such as
 * "End Function" are NOT matched here -- they are produced by the pair-merge
 * pass in tokenizer_run (see token_pairs). */
static token_single_t const token_singles[] = {
    { .kind = TOK_END_KW, .case_sensitive = false, .matches = "end" },
    { .kind = TOK_FUNCTION_KW, .case_sensitive = false, .matches = "function" },
    { .kind = TOK_ENDFUNCTION_KW, .case_sensitive = false, .matches = "endfunction" },
    { .kind = TOK_METHOD_KW, .case_sensitive = false, .matches = "method" },
    { .kind = TOK_ENDMETHOD_KW, .case_sensitive = false, .matches = "endmethod" },
    { .kind = TOK_TYPE_KW, .case_sensitive = false, .matches = "type" },
    { .kind = TOK_EXTENDS_KW, .case_sensitive = false, .matches = "extends" },
    { .kind = TOK_ABSTRACT_KW, .case_sensitive = false, .matches = "abstract" },
    { .kind = TOK_FINAL_KW, .case_sensitive = false, .matches = "final" },
    { .kind = TOK_NODEBUG_KW, .case_sensitive = false, .matches = "nodebug" },
    { .kind = TOK_ENDTYPE_KW, .case_sensitive = false, .matches = "endtype" },
    { .kind = TOK_EXTERN_KW, .case_sensitive = false, .matches = "extern" },
    { .kind = TOK_ENDEXTERN_KW, .case_sensitive = false, .matches = "endextern" },
    { .kind = TOK_REM_KW, .case_sensitive = false, .matches = "rem" },
    { .kind = TOK_ENDREM_KW, .case_sensitive = false, .matches = "endrem" },
    { .kind = TOK_FLOAT_KW, .case_sensitive = false, .matches = "float" },
    { .kind = TOK_DOUBLE_KW, .case_sensitive = false, .matches = "double" },
    { .kind = TOK_BYTE_KW, .case_sensitive = false, .matches = "byte" },
    { .kind = TOK_SHORT_KW, .case_sensitive = false, .matches = "short" },
    { .kind = TOK_INT_KW, .case_sensitive = false, .matches = "int" },
    { .kind = TOK_STRING_KW, .case_sensitive = false, .matches = "string" },
    { .kind = TOK_OBJECT_KW, .case_sensitive = false, .matches = "object" },
    { .kind = TOK_LOCAL_KW, .case_sensitive = false, .matches = "local" },
    { .kind = TOK_GLOBAL_KW, .case_sensitive = false, .matches = "global" },
    { .kind = TOK_CONST_KW, .case_sensitive = false, .matches = "const" },
    { .kind = TOK_VARPTR_KW, .case_sensitive = false, .matches = "varptr" },
    { .kind = TOK_PTR_KW, .case_sensitive = false, .matches = "ptr" },
    { .kind = TOK_VAR_KW, .case_sensitive = false, .matches = "var" },
    { .kind = TOK_NULL_KW, .case_sensitive = false, .matches = "null" },
    { .kind = TOK_STRICT_KW, .case_sensitive = false, .matches = "strict" },
    { .kind = TOK_SUPERSTRICT_KW, .case_sensitive = false, .matches = "superstrict" },
    { .kind = TOK_FRAMEWORK_KW, .case_sensitive = false, .matches = "framework" },
    { .kind = TOK_MODULE_KW, .case_sensitive = false, .matches = "module" },
    { .kind = TOK_MODULEINFO_KW, .case_sensitive = false, .matches = "moduleinfo" },
    { .kind = TOK_IMPORT_KW, .case_sensitive = false, .matches = "import" },
    { .kind = TOK_INCLUDE_KW, .case_sensitive = false, .matches = "include" },
    { .kind = TOK_PRIVATE_KW, .case_sensitive = false, .matches = "private" },
    { .kind = TOK_PUBLIC_KW, .case_sensitive = false, .matches = "public" },
    { .kind = TOK_OR_KW, .case_sensitive = false, .matches = "or" },
    { .kind = TOK_AND_KW, .case_sensitive = false, .matches = "and" },
    { .kind = TOK_SHR_KW, .case_sensitive = false, .matches = "shr" },
    { .kind = TOK_SHL_KW, .case_sensitive = false, .matches = "shl" },
    { .kind = TOK_SAR_KW, .case_sensitive = false, .matches = "sar" },
    { .kind = TOK_MOD_KW, .case_sensitive = false, .matches = "mod" },
    { .kind = TOK_NOT_KW, .case_sensitive = false, .matches = "not" },
    { .kind = TOK_WHILE_KW, .case_sensitive = false, .matches = "while" },
    { .kind = TOK_WEND_KW, .case_sensitive = false, .matches = "wend" },
    { .kind = TOK_ENDWHILE_KW, .case_sensitive = false, .matches = "endwhile" },
    { .kind = TOK_FOR_KW, .case_sensitive = false, .matches = "for" },
    { .kind = TOK_NEXT_KW, .case_sensitive = false, .matches = "next" },
    { .kind = TOK_UNTIL_KW, .case_sensitive = false, .matches = "until" },
    { .kind = TOK_TO_KW, .case_sensitive = false, .matches = "to" },
    { .kind = TOK_EACHIN_KW, .case_sensitive = false, .matches = "eachin" },
    { .kind = TOK_REPEAT_KW, .case_sensitive = false, .matches = "repeat" },
    { .kind = TOK_FOREVER_KW, .case_sensitive = false, .matches = "forever" },
    { .kind = TOK_IF_KW, .case_sensitive = false, .matches = "if" },
    { .kind = TOK_ENDIF_KW, .case_sensitive = false, .matches = "endif" },
    { .kind = TOK_ELSE_KW, .case_sensitive = false, .matches = "else" },
    { .kind = TOK_ELSEIF_KW, .case_sensitive = false, .matches = "elseif" },
    { .kind = TOK_THEN_KW, .case_sensitive = false, .matches = "then" },
    { .kind = TOK_SELECT_KW, .case_sensitive = false, .matches = "select" },
    { .kind = TOK_CASE_KW, .case_sensitive = false, .matches = "case" },
    { .kind = TOK_DEFAULT_KW, .case_sensitive = false, .matches = "default" },
    { .kind = TOK_ENDSELECT_KW, .case_sensitive = false, .matches = "endselect" },
    { .kind = TOK_SELF_KW, .case_sensitive = false, .matches = "self" },
    { .kind = TOK_SUPER_KW, .case_sensitive = false, .matches = "super" },
    { .kind = TOK_PI_KW, .case_sensitive = false, .matches = "pi" },
    { .kind = TOK_NEW_KW, .case_sensitive = false, .matches = "new" },
    { .kind = TOK_PROTOCOL_KW, .case_sensitive = false, .matches = "protocol" },
    { .kind = TOK_ENDPROTOCOL_KW, .case_sensitive = false, .matches = "endprotocol" },
    { .kind = TOK_AUTO_KW, .case_sensitive = false, .matches = "auto" },
    { .kind = TOK_IMPLEMENTS_KW, .case_sensitive = false, .matches = "implements" },
    /* single-character operators and punctuation */
    { .kind = TOK_COLON, .case_sensitive = false, .matches = ":" },
    { .kind = TOK_QUESTION, .case_sensitive = false, .matches = "?" },
    { .kind = TOK_BANG, .case_sensitive = false, .matches = "!" },
    { .kind = TOK_HASH, .case_sensitive = false, .matches = "#" },
    { .kind = TOK_DOLLAR, .case_sensitive = false, .matches = "$" },
    { .kind = TOK_PERCENT, .case_sensitive = false, .matches = "%" },
    { .kind = TOK_OPENPAREN, .case_sensitive = false, .matches = "(" },
    { .kind = TOK_CLOSEPAREN, .case_sensitive = false, .matches = ")" },
    { .kind = TOK_OPENBRACKET, .case_sensitive = false, .matches = "[" },
    { .kind = TOK_CLOSEBRACKET, .case_sensitive = false, .matches = "]" },
    { .kind = TOK_OPENCURL, .case_sensitive = false, .matches = "{" },
    { .kind = TOK_CLOSECURL, .case_sensitive = false, .matches = "}" },
    { .kind = TOK_GREATERTHAN, .case_sensitive = false, .matches = ">" },
    { .kind = TOK_LESSTHAN, .case_sensitive = false, .matches = "<" },
    { .kind = TOK_EQUALS, .case_sensitive = false, .matches = "=" },
    { .kind = TOK_MINUS, .case_sensitive = false, .matches = "-" },
    { .kind = TOK_PLUS, .case_sensitive = false, .matches = "+" },
    { .kind = TOK_ASTERISK, .case_sensitive = false, .matches = "*" },
    { .kind = TOK_CARET, .case_sensitive = false, .matches = "^" },
    { .kind = TOK_TILDE, .case_sensitive = false, .matches = "~" },
    { .kind = TOK_GRAVE, .case_sensitive = false, .matches = "`" },
    { .kind = TOK_BACKSLASH, .case_sensitive = false, .matches = "\\" },
    { .kind = TOK_SLASH, .case_sensitive = false, .matches = "/" },
    { .kind = TOK_COMMA, .case_sensitive = false, .matches = "," },
    { .kind = TOK_SEMICOLON, .case_sensitive = false, .matches = ";" },
    { .kind = TOK_PIPE, .case_sensitive = false, .matches = "|" },
    { .kind = TOK_AMPERSAND, .case_sensitive = false, .matches = "&" },
    { .kind = TOK_NEWLINE, .case_sensitive = false, .matches = "\n" },
    /* sentinel: terminates the linear scan */
    { .kind = TOK_INVALID, .case_sensitive = false, .matches = NULL },
};
/* Rule for merging two adjacent tokens into one (second pass of
 * tokenizer_run): when a `left` token is immediately followed by a `right`
 * token, the pair becomes a single token of kind `kind`.  `range` is the
 * maximum number of characters allowed between left's end and right's start:
 * 1 permits the single space in "End Rem", 0 requires the tokens to touch. */
typedef struct s_token_pair {
    token_kind_t left, right;
    token_kind_t kind;  /* kind of the merged token */
    size_t range;
} token_pair_t;
static token_pair_t const token_pairs[] = {
    { .left = TOK_END_KW, .right = TOK_REM_KW, .kind = TOK_ENDREM_KW, .range = 1 }, // "End Rem"
    { .left = TOK_END_KW, .right = TOK_METHOD_KW, .kind = TOK_ENDMETHOD_KW, .range = 1 }, // "End Method"
    { .left = TOK_END_KW, .right = TOK_FUNCTION_KW, .kind = TOK_ENDFUNCTION_KW, .range = 1 }, // "End Function"
    { .left = TOK_END_KW, .right = TOK_TYPE_KW, .kind = TOK_ENDTYPE_KW, .range = 1 }, // "End Type"
    { .left = TOK_END_KW, .right = TOK_EXTERN_KW, .kind = TOK_ENDEXTERN_KW, .range = 1 }, // "End Extern"
    { .left = TOK_END_KW, .right = TOK_IF_KW, .kind = TOK_ENDIF_KW, .range = 1 }, // "End If"
    { .left = TOK_END_KW, .right = TOK_SELECT_KW, .kind = TOK_ENDSELECT_KW, .range = 1 }, // "End Select"
    { .left = TOK_END_KW, .right = TOK_WHILE_KW, .kind = TOK_ENDWHILE_KW, .range = 1 }, // "End While"
    { .left = TOK_END_KW, .right = TOK_PROTOCOL_KW, .kind = TOK_ENDPROTOCOL_KW, .range = 1 }, // "End Protocol" (x)
    { .left = TOK_COLON, .right = TOK_PLUS, .kind = TOK_ASSIGN_ADD, .range = 0 }, // :+
    { .left = TOK_COLON, .right = TOK_MINUS, .kind = TOK_ASSIGN_SUBTRACT, .range = 0 }, // :-
    { .left = TOK_COLON, .right = TOK_SLASH, .kind = TOK_ASSIGN_DIVIDE, .range = 0 }, // :/
    { .left = TOK_COLON, .right = TOK_ASTERISK, .kind = TOK_ASSIGN_MULTIPLY, .range = 0 }, // :*
    { .left = TOK_COLON, .right = TOK_CARET, .kind = TOK_ASSIGN_POWER, .range = 0 }, // :^
    { .left = TOK_COLON, .right = TOK_SHL_KW, .kind = TOK_ASSIGN_SHL, .range = 0 }, // :Shl
    { .left = TOK_COLON, .right = TOK_SHR_KW, .kind = TOK_ASSIGN_SHR, .range = 0 }, // :Shr
    { .left = TOK_COLON, .right = TOK_SAR_KW, .kind = TOK_ASSIGN_SAR, .range = 0 }, // :Sar
    { .left = TOK_COLON, .right = TOK_MOD_KW, .kind = TOK_ASSIGN_MOD, .range = 0 }, // :Mod
    { .left = TOK_COLON, .right = TOK_TILDE, .kind = TOK_ASSIGN_XOR, .range = 0 }, // :~
    { .left = TOK_COLON, .right = TOK_AMPERSAND, .kind = TOK_ASSIGN_AND, .range = 0 }, // :&
    { .left = TOK_COLON, .right = TOK_PIPE, .kind = TOK_ASSIGN_OR, .range = 0 }, // :|
    { .left = TOK_COLON, .right = TOK_EQUALS, .kind = TOK_ASSIGN_AUTO, .range = 0 }, // := (x)
    { .left = TOK_MINUS, .right = TOK_MINUS, .kind = TOK_DOUBLEMINUS, .range = 0 }, // -- (x)
    { .left = TOK_PLUS, .right = TOK_PLUS, .kind = TOK_DOUBLEPLUS, .range = 0 }, // ++ (x)
    // { .left = TOK_MINUS, .right = TOK_NUMBER_LIT, .kind = TOK_NUMBER_LIT, .range = 0 }, // -NUMBER
    /* Sentinel.  NOTE(review): -1 assigned to size_t wraps to SIZE_MAX; this
       is harmless because the scan stops on left == TOK_INVALID before the
       range is ever compared. */
    { .left = TOK_INVALID, .right = TOK_INVALID, .kind = TOK_INVALID, .range = -1 },
};
/* Return the display string for a token's kind.  TOK_EOF has no table entry
 * and is special-cased; kinds with no fixed spelling (identifiers, literals)
 * map to NULL in token_strings and are returned as-is. */
const char *token_to_string(token_t tok) {
    return (tok.kind == TOK_EOF) ? "<EOF>" : token_strings[tok.kind];
}
/* Prepare `tokr` to scan the byte range [source_begin, source_end].
 * Returns tokr on success, or NULL when any pointer is NULL or the range is
 * inverted.  Allocates the initial token array (TOKENIZER_INITIAL_CAPACITY
 * entries); pair with tokenizer_destroy to release it. */
tokenizer_t *tokenizer_init(tokenizer_t *tokr, const char *source_begin, const char *source_end) {
    if (tokr == NULL)
        return NULL;
    if (source_begin == NULL || source_end == NULL || source_end < source_begin)
        return NULL;
    tokr->tokens = NULL;
    tokr->capacity = 0;
    tokr->source_begin = source_begin;
    tokr->source_end = source_end;
    /* Cursor starts at the first character, line 1, column 1, no tokens. */
    tokr->current = (token_mark_t){
        .place = source_begin,
        .line = 1,
        .column = 1,
        .token = 0,
    };
    tokenizer_tokens_fit(tokr, TOKENIZER_INITIAL_CAPACITY);
    return tokr;
}
/* Release the token array and clear the tokenizer's pointers.
 * Safe to call with NULL and safe to call more than once (the token pointer
 * is nulled after the free). */
void tokenizer_destroy(tokenizer_t *tokr) {
    if (tokr == NULL) {
        return;
    }
    /* free(NULL) is a no-op, so the previous `if (tokens != NULL)` guard
       was redundant and has been removed. */
    free(tokr->tokens);
    tokr->tokens = NULL;
    tokr->source_begin = NULL;
    tokr->source_end = NULL;
    tokr->current.place = NULL;
}
/* Grow the token array so it can hold at least n tokens.  Growth doubles the
 * current capacity, or jumps straight to n if doubling isn't enough.
 * BUG FIX: the result of realloc was previously assigned straight back to
 * tokr->tokens, which leaks the old buffer and leads to a NULL dereference
 * on allocation failure.  The module's error policy is print-and-exit, so an
 * out-of-memory condition now does the same. */
static void tokenizer_tokens_fit(tokenizer_t *tokr, size_t n) {
    if (n < tokr->capacity) {
        return;
    }
    size_t sz = tokr->capacity * 2;
    if (sz < n) {
        sz = n;
    }
    token_t *grown = realloc(tokr->tokens, sz * sizeof *grown);
    if (grown == NULL) {
        fprintf(stderr, "Out of memory growing token buffer to %zu entries\n", sz);
        exit(1);
    }
    tokr->tokens = grown;
    tokr->capacity = sz;
}
/* Append a fresh, zeroed token to the array and return a pointer to it.
 * Grows the array as needed; the returned pointer is only valid until the
 * next call, since growth may relocate the array. */
static token_t *tokenizer_new_token(tokenizer_t *tokr) {
    int index = tokr->current.token + 1;
    tokenizer_tokens_fit(tokr, index+1);
    /* The slot being claimed is the current count; `index` is the new count. */
    token_t *token = tokr->tokens+tokr->current.token;
    tokr->current.token = index;
    token->kind = TOK_INVALID;
    token->from = token->to = NULL;
    token->line = 0;
    token->column = 0;
    return token;
}
/* Collapse tokens [from..to] into the single token at index `from`: the
 * merged token keeps `from`'s kind and start position but extends its span
 * to the end of `to`; everything after `to` is shifted down to close the
 * gap, and the token count is reduced accordingly.
 * BUG FIX: despite the token_t* return type this always returned NULL; it
 * now returns a pointer to the merged token.  Callers that ignored the
 * return value are unaffected. */
static token_t *tokenizer_merge_tokens(tokenizer_t *tokr, int from, int to) {
    tokr->tokens[from].to = tokr->tokens[to].to;
    int offset = from - to; /* negative: distance later tokens move down */
    int idx = to+1;
    for (; idx < tokr->current.token; ++idx)
        tokr->tokens[idx+offset] = tokr->tokens[idx];
    tokr->current.token += offset;
    return tokr->tokens + from;
}
/* Snapshot the cursor (position, line/column, token count) for later
 * restoration via tokenizer_reset. */
static token_mark_t tokenizer_mark(tokenizer_t *tokr) {
    token_mark_t snapshot = tokr->current;
    return snapshot;
}
/* Rewind the tokenizer to a cursor snapshot taken by tokenizer_mark. */
static void tokenizer_reset(tokenizer_t *tokr, token_mark_t mark) {
    tokr->current = mark;
}
/* Character under the cursor, or 0 once the cursor has moved past
 * source_end.  NOTE(review): a cursor exactly at source_end is still
 * dereferenced -- source_end appears to be treated as the last readable
 * byte, not one-past-the-end; confirm with callers of tokenizer_init. */
static char tokenizer_current(tokenizer_t *tokr) {
    const char *at = tokr->current.place;
    return (at <= tokr->source_end) ? *at : 0;
}
/* True while the cursor is strictly before source_end (i.e. an advance is
 * still possible). */
static bool tokenizer_has_next(tokenizer_t *tokr) {
    return tokr->current.place < tokr->source_end;
}
/* Advance the cursor one character and return the character now under it
 * (0 at end of input).  The line/column bookkeeping is based on the
 * character being left behind, so it must run before the advance. */
static char tokenizer_next(tokenizer_t *tokr) {
    if (tokenizer_current(tokr) == '\n') {
        /* leaving a newline: next character starts a new line */
        tokr->current.line += 1;
        tokr->current.column = 1;
    } else {
        ++tokr->current.column;
    }
    return tokenizer_has_next(tokr) ? *(++tokr->current.place) : 0;
}
/* Character at the cursor without advancing, or 0 when no advance is
 * possible.  NOTE(review): despite the name this reads the char AT the
 * cursor (same position as tokenizer_current), not the one after it; the
 * readers below depend on that, so it is preserved. */
static char tokenizer_peek(tokenizer_t *tokr) {
    if (!tokenizer_has_next(tokr))
        return 0;
    return *tokr->current.place;
}
/* Advance past spaces, tabs and carriage returns.  Newlines are deliberately
 * NOT skipped -- '\n' is a token (TOK_NEWLINE in token_singles). */
static void tokenizer_skip_whitespace(tokenizer_t *tokr) {
    for (;;) {
        char c = tokenizer_current(tokr);
        if (c != ' ' && c != '\t' && c != '\r')
            break; /* also covers c == 0 (end of input) */
        tokenizer_next(tokr);
    }
}
/* Read a '%'-prefixed binary or '$'-prefixed hexadecimal literal; the caller
 * has already verified at least one valid digit follows the prefix.  Exits
 * with a diagnostic on any other prefix character.
 * BUG FIX: the binary loop's condition was
 *     tokenizer_has_next(..) && (cur = tokenizer_next(..)) == '0' || cur == '1'
 * which, by operator precedence, kept looping on `cur == '1'` even after
 * input ran out -- an infinite loop for a binary literal ending the input
 * in '1'.  The condition is now parenthesized so end-of-input always
 * terminates the loop.  The isxdigit argument is also cast to unsigned char
 * as <ctype.h> requires. */
static token_t tokenizer_read_base_number(tokenizer_t *tokr) {
    char cur = tokenizer_current(tokr);
    token_mark_t mark = tokenizer_mark(tokr);
    token_t token = {
        .kind = TOK_NUMBER_LIT,
        .line = mark.line,
        .column = mark.column,
        .from = mark.place,
        .to = NULL,
    };
    if (cur == '%') { /* binary, e.g. %1010 */
        while (tokenizer_has_next(tokr) &&
               ((cur = tokenizer_next(tokr)) == '0' || cur == '1'))
            ;
    } else if (cur == '$') { /* hexadecimal, e.g. $FF */
        while (tokenizer_has_next(tokr) &&
               isxdigit((unsigned char)tokenizer_next(tokr)))
            ;
    } else {
        fprintf(stderr, "[%d:%d] Malformed number literal encountered, not a number\n", tokr->current.line, tokr->current.column);
        exit(1);
    }
    token.to = tokr->current.place;
    tokenizer_next(tokr);
    return token;
}
/* Look up the token kind for a lexeme of `len` characters (not necessarily
 * NUL-terminated) in token_singles.  Comparison is exact-length, case
 * sensitivity per entry.  Returns TOK_INVALID when nothing matches. */
static token_kind_t token_kind_for_single(const char *single, size_t len) {
    const token_single_t *entry = token_singles;
    for (; entry->kind != TOK_INVALID; ++entry) {
        if (strlen(entry->matches) != len)
            continue;
        int cmp = entry->case_sensitive
            ? strncmp(entry->matches, single, len)
            : strncasecmp(entry->matches, single, len);
        if (cmp == 0)
            break;
    }
    return entry->kind;
}
/* Read a decimal number literal: digits with at most one '.' and at most one
 * exponent ('e'/'E' with optional sign and mandatory digit).  Exits with a
 * diagnostic on a malformed exponent. */
static token_t tokenizer_read_number(tokenizer_t *tokr) {
    char cur = tokenizer_current(tokr);
    token_mark_t mark = tokenizer_mark(tokr);
    bool isDec = (cur == '.'); /* decimal point already seen? */
    bool isExp = false;        /* exponent marker already seen? */
    token_t token = {
        .kind = TOK_NUMBER_LIT,
        .line = mark.line,
        .column = mark.column,
        .from = mark.place,
        .to = NULL,
    };
    while (tokenizer_has_next(tokr) && (cur = tokenizer_next(tokr)) != 0) {
        if (cur == '.') {
            if (isDec) {
                break; /* a second '.' ends the literal */
            }
            isDec = true;
            /* BUG FIX: without this continue the '.' fell through to the
               isdigit() check below and terminated the literal, so "3.14"
               was cut off at the decimal point (and the isDec flag was
               effectively dead). */
            continue;
        }
        if (tolower((unsigned char)cur) == 'e') {
            if (isExp) {
                fprintf(stderr, "[%d:%d] Malformed number literal encountered, exponent already provided\n", tokr->current.line, tokr->current.column);
                exit(1);
            }
            isExp = true;
            /* optional sign, then a mandatory digit */
            cur = tokenizer_peek(tokr);
            if (cur == '-' || cur == '+') {
                tokenizer_next(tokr);
                cur = tokenizer_peek(tokr);
            }
            if (!isdigit((unsigned char)cur)) {
                fprintf(stderr, "[%d:%d] Malformed number literal encountered, exponent expected but not found\n", tokr->current.line, tokr->current.column);
                exit(1);
            }
        }
        if (!isdigit((unsigned char)cur)) {
            break;
        }
    }
    token.to = tokr->current.place;
    tokenizer_next(tokr);
    return token;
}
/* Read an identifier or keyword: a run of [_A-Za-z0-9] characters (the
 * caller has already checked the first character is '_' or alphabetic).
 * Returns TOK_ID, or the matching keyword kind when the word appears in
 * token_singles (case-insensitive).
 * NOTE(review): the tokenizer_next() after the loop advances the cursor past
 * the first non-word character as well; `to` is then computed so the length
 * (to - from) still equals the word length, but the delimiter character
 * itself appears to be consumed without ever being tokenized -- confirm this
 * matches tokenizer_run's cursor expectations. */
token_t tokenizer_read_word(tokenizer_t *tokr) {
    token_mark_t mark = tokenizer_mark(tokr);
    token_t token = {
        .kind = TOK_ID,
        .line = mark.line,
        .column = mark.column,
        .from = mark.place,
        .to = NULL,
    };
    while (tokenizer_has_next(tokr)) {
        char cur = tokenizer_peek(tokr);
        if (cur != '_' && !isalnum(cur)) {
            break;
        }
        tokenizer_next(tokr);
    }
    tokenizer_next(tokr);
    token.to = tokr->current.place-1;
    /* Upgrade TOK_ID to a keyword kind when the text matches a table entry. */
    token_kind_t alter = token_kind_for_single(token.from, (size_t)(token.to-token.from));
    if (alter != TOK_INVALID) {
        token.kind = alter;
    }
    return token;
}
/* Read a double-quoted string literal (no escape sequences; the opening
 * quote is under the cursor on entry).  Exits with a diagnostic when the
 * string is not closed before a newline or the end of input.
 * BUG FIX: the end-of-input case previously fell out of the loop and
 * produced an unterminated string token, even though the error message
 * already promised the EOF check; it now reports the error like the
 * newline case does. */
static token_t tokenizer_read_string(tokenizer_t *tokr) {
    char cur = tokenizer_current(tokr);
    token_mark_t mark = tokenizer_mark(tokr);
    token_t token = {
        .kind = TOK_STRING_LIT,
        .line = mark.line,
        .column = mark.column,
        .from = mark.place,
        .to = NULL,
    };
    while (tokenizer_has_next(tokr) && (cur = tokenizer_next(tokr)) != '"') {
        if (cur == '\n') {
            fprintf(stderr, "[%d:%d] String literal does not terminate before newline or EOF\n", tokr->current.line, tokr->current.column);
            exit(1);
        }
    }
    if (cur != '"') { /* ran out of input before the closing quote */
        fprintf(stderr, "[%d:%d] String literal does not terminate before newline or EOF\n", tokr->current.line, tokr->current.column);
        exit(1);
    }
    token.to = tokr->current.place;
    tokenizer_next(tokr);
    return token;
}
/* Read a single-quote line comment: everything from the cursor up to the
 * next newline or end of input. */
static token_t tokenizer_read_line_comment(tokenizer_t *tokr) {
    token_mark_t mark = tokenizer_mark(tokr);
    token_t token = {
        .kind = TOK_LINE_COMMENT,
        .line = mark.line,
        .column = mark.column,
        .from = mark.place,
        .to = NULL,
    };
    /* Consume at least one character (the opening quote), then run to the
       end of the line. */
    char cur = tokenizer_next(tokr);
    while (cur != 0 && cur != '\n')
        cur = tokenizer_next(tokr);
    token.to = tokr->current.place;
    tokenizer_next(tokr);
    return token;
}
/* Main driver: tokenize the whole source buffer into tokr->tokens, append a
 * TOK_EOF sentinel, then run a second pass that merges adjacent token pairs
 * ("End"+"If" -> End If, ":"+"+" -> :+, see token_pairs).  On any lexical
 * error this prints a diagnostic and exits, matching the readers above.
 * Fixes in this revision: the unused local `twochar` was removed; the merge
 * pass now applies the pair's merged kind (previously token_pairs[].kind was
 * never used, so a merged "End If" kept kind TOK_END_KW); and punctuation
 * inside a Rem block no longer falls through to the invalid-token exit. */
void tokenizer_run(tokenizer_t *tokr) {
    token_mark_t mark;
    /* Holds the opening Rem token while scanning a Rem ... End Rem block;
       TOK_INVALID means we are not currently inside a block comment. */
    token_t comment = {.kind=TOK_INVALID};
    token_t token;
    char cur;
    while(tokenizer_current(tokr) != 0) {
        token.kind = TOK_INVALID;
        tokenizer_skip_whitespace(tokr);
        mark = tokenizer_mark(tokr);
        cur = tokenizer_current(tokr);
        if (comment.kind == TOK_INVALID) {
            /* '@' and '@@' */
            if (cur == '@') {
                token.kind = TOK_AT;
                if (tokenizer_next(tokr) == '@') {
                    token.kind = TOK_DOUBLEAT;
                    tokenizer_next(tokr);
                }
                token.from = mark.place;
                token.to = tokr->current.place-1;
                token.line = mark.line;
                token.column = mark.column;
            }
            if (cur == '.') {
                if (isdigit((unsigned char)tokenizer_peek(tokr))) {
                    token = tokenizer_read_number(tokr); /* ".5"-style literal */
                } else {
                    /* '.', '..' or '...': relies on TOK_DOT, TOK_DOUBLEDOT,
                       TOK_TRIPLEDOT being consecutive enum values.
                       NOTE(review): four or more consecutive dots increment
                       the kind past TOK_TRIPLEDOT into the next enum value --
                       confirm that input cannot occur or guard it. */
                    token.kind = TOK_DOT;
                    while(token.kind <= TOK_TRIPLEDOT && tokenizer_next(tokr) == '.') {
                        ++token.kind;
                    }
                    token.from = mark.place;
                    token.to = tokr->current.place-1;
                    token.line = mark.line;
                    token.column = mark.column;
                }
            }
            if (cur == '\'') {
                token = tokenizer_read_line_comment(tokr);
            }
            if (cur == '%') { /* binary literal, e.g. %1010 */
                char peek = tokenizer_peek(tokr);
                if (peek == '1' || peek == '0') {
                    token = tokenizer_read_base_number(tokr);
                }
            }
            if (cur == '$' && isxdigit((unsigned char)tokenizer_peek(tokr))) {
                token = tokenizer_read_base_number(tokr); /* hex literal */
            }
            if (cur == '"') {
                token = tokenizer_read_string(tokr);
            }
            if (isdigit((unsigned char)cur)) {
                token = tokenizer_read_number(tokr);
            }
            if (token.kind == TOK_INVALID) {
                /* Fall back to single-character operators/punctuation. */
                token_kind_t alter = token_kind_for_single(&cur, 1);
                if (alter != TOK_INVALID) {
                    token.kind = alter;
                    token.from = token.to = mark.place;
                    token.line = mark.line;
                    token.column = mark.column;
                    tokenizer_next(tokr);
                }
            }
        }
        /* Words are read even inside a Rem block so "End"/"Rem"/"EndRem"
           can terminate it. */
        if (cur == '_' || isalpha((unsigned char)cur)) {
            token = tokenizer_read_word(tokr);
        }
        if (comment.kind != TOK_INVALID) {
            if (token.kind == TOK_END_KW) {
                /* "End Rem" split across two words (single space allowed). */
                if (tokenizer_current(tokr) == ' ') {
                    tokenizer_next(tokr);
                }
                if ((cur = tokenizer_current(tokr)) == '_' || isalpha((unsigned char)cur)) {
                    token_mark_t next_mark = tokenizer_mark(tokr);
                    token_t next = tokenizer_read_word(tokr);
                    if (next.kind == TOK_REM_KW) {
                        token.kind = TOK_ENDREM_KW;
                        token.to = next.to;
                    } else {
                        tokenizer_reset(tokr, next_mark);
                    }
                }
            }
            if (token.kind == TOK_ENDREM_KW) {
                /* Emit the comment body as a single block-comment token. */
                token_t block = {
                    .kind = TOK_BLOCK_COMMENT,
                    .line = comment.line,
                    .column = comment.column,
                    .from = comment.to + 1,
                    .to = token.from - 1,
                };
                comment.kind = TOK_INVALID;
                *tokenizer_new_token(tokr) = block;
            }
            if (token.kind == TOK_INVALID) {
                /* Non-word characters inside the comment body are skipped.
                   BUG FIX: execution previously fell through to the
                   invalid-token check below and exited on any punctuation
                   inside a Rem block; continue to the next character
                   instead. */
                tokenizer_next(tokr);
                tokenizer_skip_whitespace(tokr);
                continue;
            }
        }
        if (token.kind != TOK_INVALID && comment.kind == TOK_INVALID) {
            *tokenizer_new_token(tokr) = token;
        }
        if (comment.kind == TOK_INVALID && token.kind == TOK_REM_KW) {
            comment = token; /* entering a Rem ... End Rem block */
        }
        if (token.kind == TOK_INVALID) {
            fprintf(stderr, "[%d:%d] Invalid token: %c\n", tokr->current.line, tokr->current.column, cur);
            exit(1);
        }
    }
    tokenizer_new_token(tokr)->kind = TOK_EOF;
    /* Second pass: merge adjacent token pairs per token_pairs. */
    unsigned int tok_index = 0;
    while (tokr->tokens[tok_index].kind != TOK_EOF) {
        token_t left, right;
        bool merged = false;
        left = tokr->tokens[tok_index];
        right = tokr->tokens[tok_index+1];
        const token_pair_t *pair_iter = token_pairs;
        while (pair_iter->left != TOK_INVALID && !merged) {
            /* `range` is the number of characters allowed between the two
               tokens (1 for the space in "End Rem", 0 for ":+"). */
            if (pair_iter->left == left.kind && pair_iter->right == right.kind &&
                right.from <= left.to+pair_iter->range) {
                tokenizer_merge_tokens(tokr, tok_index, tok_index+1);
                /* BUG FIX: apply the pair's merged kind; it was previously
                   never written, leaving e.g. "End"+"If" as TOK_END_KW. */
                tokr->tokens[tok_index].kind = pair_iter->kind;
                merged = true;
            }
            ++pair_iter;
        }
        if (!merged)
            ++tok_index;
    }
}
#ifndef LEXER_H_BICMCZIT
#define LEXER_H_BICMCZIT
#ifdef __cplusplus
extern "C" {
#endif
/* Token kinds.  Everything from TOK_INVALID onward indexes the
 * token_strings table in lexer.c, so the order here MUST stay in sync with
 * that table.  TOK_EOF (-1) is special-cased in token_to_string and has no
 * table entry. */
typedef enum {
    TOK_EOF=-1,
    TOK_INVALID=0,
    TOK_ID,
    /* keywords */
    TOK_END_KW,
    TOK_FUNCTION_KW,
    TOK_ENDFUNCTION_KW,
    TOK_METHOD_KW,
    TOK_ENDMETHOD_KW,
    TOK_TYPE_KW,
    TOK_EXTENDS_KW,
    TOK_ABSTRACT_KW,
    TOK_FINAL_KW,
    TOK_NODEBUG_KW,
    TOK_ENDTYPE_KW,
    TOK_EXTERN_KW,
    TOK_ENDEXTERN_KW,
    TOK_REM_KW,
    TOK_ENDREM_KW,
    TOK_FLOAT_KW,
    TOK_DOUBLE_KW,
    TOK_BYTE_KW,
    TOK_SHORT_KW,
    TOK_INT_KW,
    TOK_STRING_KW,
    TOK_OBJECT_KW,
    TOK_LOCAL_KW,
    TOK_GLOBAL_KW,
    TOK_CONST_KW,
    TOK_VARPTR_KW,
    TOK_PTR_KW,
    TOK_VAR_KW,
    TOK_NULL_KW,
    TOK_STRICT_KW,
    TOK_SUPERSTRICT_KW,
    TOK_FRAMEWORK_KW,
    TOK_MODULE_KW,
    TOK_MODULEINFO_KW,
    TOK_IMPORT_KW,
    TOK_INCLUDE_KW,
    TOK_PRIVATE_KW,
    TOK_PUBLIC_KW,
    TOK_OR_KW,
    TOK_AND_KW,
    TOK_SHR_KW,
    TOK_SHL_KW,
    TOK_SAR_KW,
    TOK_MOD_KW,
    TOK_NOT_KW,
    TOK_WHILE_KW,
    TOK_WEND_KW,
    TOK_ENDWHILE_KW,
    TOK_FOR_KW,
    TOK_NEXT_KW,
    TOK_UNTIL_KW,
    TOK_TO_KW,
    TOK_EACHIN_KW,
    TOK_REPEAT_KW,
    TOK_FOREVER_KW,
    TOK_IF_KW,
    TOK_ENDIF_KW,
    TOK_ELSE_KW,
    TOK_ELSEIF_KW,
    TOK_THEN_KW,
    TOK_SELECT_KW,
    TOK_CASE_KW,
    TOK_DEFAULT_KW,
    TOK_ENDSELECT_KW,
    TOK_SELF_KW,
    TOK_SUPER_KW,
    TOK_PI_KW,
    TOK_NEW_KW,
    // extensions
    TOK_PROTOCOL_KW,
    TOK_ENDPROTOCOL_KW,
    TOK_AUTO_KW,
    TOK_IMPLEMENTS_KW,
    /* punctuation and operators.  TOK_DOT..TOK_TRIPLEDOT must remain
       consecutive: tokenizer_run increments the kind per extra '.'. */
    TOK_COLON,
    TOK_QUESTION,
    TOK_BANG,
    TOK_HASH,
    TOK_DOT,
    TOK_DOUBLEDOT,
    TOK_TRIPLEDOT,
    TOK_AT,
    TOK_DOUBLEAT,
    TOK_DOLLAR,
    TOK_PERCENT,
    TOK_SINGLEQUOTE,
    TOK_OPENPAREN,
    TOK_CLOSEPAREN,
    TOK_OPENBRACKET,
    TOK_CLOSEBRACKET,
    TOK_OPENCURL,
    TOK_CLOSECURL,
    TOK_GREATERTHAN,
    TOK_LESSTHAN,
    TOK_EQUALS,
    TOK_MINUS,
    TOK_PLUS,
    TOK_ASTERISK,
    TOK_CARET,
    TOK_TILDE,
    TOK_GRAVE,
    TOK_BACKSLASH,
    TOK_SLASH,
    TOK_COMMA,
    TOK_SEMICOLON,
    TOK_PIPE,
    TOK_AMPERSAND,
    TOK_NEWLINE,
    /* compound assignment operators, built by the pair-merge pass */
    TOK_ASSIGN_ADD,
    TOK_ASSIGN_SUBTRACT,
    TOK_ASSIGN_DIVIDE,
    TOK_ASSIGN_MULTIPLY,
    TOK_ASSIGN_POWER,
    TOK_ASSIGN_SHL,
    TOK_ASSIGN_SHR,
    TOK_ASSIGN_SAR,
    TOK_ASSIGN_MOD,
    TOK_ASSIGN_XOR,
    TOK_ASSIGN_AND,
    TOK_ASSIGN_OR,
    TOK_ASSIGN_AUTO,
    TOK_DOUBLEMINUS,
    TOK_DOUBLEPLUS,
    /* literals and comments */
    TOK_NUMBER_LIT,
    TOK_HEX_LIT,
    TOK_BIN_LIT,
    TOK_STRING_LIT,
    TOK_LINE_COMMENT,
    TOK_BLOCK_COMMENT,
} token_kind_t;
/* Cursor state: position in the source plus 1-based line/column counters and
 * the number of tokens emitted so far (restored together by mark/reset
 * backtracking). */
typedef struct s_token_mark {
    const char *place;
    unsigned int line, column;
    unsigned int token;
} token_mark_t;
/* One token: its kind, the source character span it was read from, and the
 * 1-based line/column where it started. */
typedef struct s_token {
    token_kind_t kind;
    const char *from, *to;
    unsigned int line, column;
} token_t;
/* Tokenizer state: a growable token array plus the source range being read
 * and the current cursor. */
typedef struct s_tokenizer {
    size_t capacity;   /* allocated slots in `tokens` */
    token_t *tokens;
    const char *source_begin, *source_end;
    token_mark_t current;  /* read cursor */
} tokenizer_t;
/* Public API -- see lexer.c for behavior details and error policy (lexical
 * errors print a diagnostic and exit the process). */
tokenizer_t *tokenizer_init(tokenizer_t *tokr, const char *source_begin, const char *source_end);
void tokenizer_destroy(tokenizer_t *tokr);
void tokenizer_run(tokenizer_t *tokr);
const char *token_to_string(token_t tok);
#ifdef __cplusplus
}
#endif
#endif /* end of include guard: LEXER_H_BICMCZIT */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment