Created
October 22, 2016 00:08
-
-
Save dresswithpockets/ac382e5c9a06d3a583b8554e43252de2 to your computer and use it in GitHub Desktop.
stretch lexer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stdafx.h"

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using namespace std;
// Every kind of token the lexer can produce. Enumerator order defines the
// numeric values (T_EOF == 0 ... NOINIT == 94), so new kinds should be appended
// rather than inserted if those values are stored or compared anywhere.
enum Compiler_Token {
    // --- end of input and literal classes ---
    T_EOF = 0,  // don't wanna use 'EOF' since some headers have a #define for 'EOF'
    IDENTIFIER = 1,
    FLOAT_CONSTANT = 2,
    INTEGER_CONSTANT = 3,
    BINARY_CONSTANT,
    HEX_CONSTANT,
    STRING_CONSTANT,
    CHAR_CONSTANT,

    // --- keywords ---
    PUBLIC,
    PRIVATE,
    CASE,
    DEFAULT,
    CONTINUE,
    BREAK,
    RETURN,
    ANY,
    VOID,
    BOOL,
    SIGNED8,
    SIGNED16,
    SIGNED32,
    SIGNED64,
    UNSIGNED8,
    UNSIGNED16,
    UNSIGNED32,
    UNSIGNED64,
    FLOAT32,
    FLOAT64,
    STRING,
    T_NULL,  // prefixed with T_ as `NULL` is #defined
    TRUE,
    FALSE,
    STRUCT,
    ENUM,
    ALIAS,
    FOREIGN,
    USING,
    NEW,
    DELETE,
    DEFER,
    SWITCH,
    IF,
    ELSE,
    FOR,
    WHILE,
    DO,

    // --- arithmetic, shift and comparison operators ---
    PLUS,
    MINUS,
    ASTERISK,
    SLASH,
    MODULO,
    INCREMENT,
    DECREMENT,
    SHIFT_LEFT,
    SHIFT_RIGHT,
    LESS,
    GREATER,
    LESS_EQUAL,
    GREATER_EQUAL,
    EQUALITY,
    NO_EQUALITY,

    // --- logical and bitwise operators ---
    LOGICAL_NOT,
    LOGICAL_AND,
    LOGICAL_OR,
    XOR,
    NOT,
    AND,
    OR,

    // --- ternary / declaration punctuation ---
    QUESTION,
    COLON,
    DOUBLE_COLON,
    COLONASSIGN,

    // --- assignment operators ---
    ASSIGN,
    MUL_ASSIGN,
    DIV_ASSIGN,
    MOD_ASSIGN,
    ADD_ASSIGN,
    SUB_ASSIGN,
    LSHIFT_ASSIGN,
    RSHIFT_ASSIGN,
    AND_ASSIGN,
    XOR_ASSIGN,
    OR_ASSIGN,

    // --- member access, delimiters and separators ---
    DOT,
    DOUBLE_DOT,
    ARROW,
    SEMICOLON,
    LPAREN,
    RPAREN,
    LSQUARE,
    RSQUARE,
    LBRACKET,
    RBRACKET,
    COMMA,
    NOINIT
};
typedef pair<Compiler_Token, string> ct_pair; | |
vector<ct_pair> token_map = { | |
{PUBLIC, "public"}, | |
{PRIVATE, "private"}, | |
{CASE, "case"}, | |
{DEFAULT, "default"}, | |
{CONTINUE, "continue"}, | |
{BREAK, "break"}, | |
{RETURN, "return"}, | |
{ANY, "any"}, | |
{VOID, "void"}, | |
{BOOL, "bool"}, | |
{SIGNED8, "s8"}, | |
{SIGNED16, "s16"}, | |
{SIGNED32, "s32"}, | |
{SIGNED64, "s64"}, | |
{UNSIGNED8, "u8"}, | |
{UNSIGNED16, "u16"}, | |
{UNSIGNED32, "u32"}, | |
{UNSIGNED64, "u64"}, | |
{FLOAT32, "f32"}, | |
{FLOAT64, "f64"}, | |
{STRING, "string"}, | |
{T_NULL, "null"}, | |
{TRUE, "true"}, | |
{FALSE, "false"}, | |
{STRUCT, "struct"}, | |
{ENUM, "enum"}, | |
{ALIAS, "alias"}, | |
{FOREIGN, "foreign"}, | |
{USING, "using"}, | |
{NEW, "new"}, | |
{DELETE, "delete"}, | |
{DEFER, "defer"}, | |
{SWITCH, "switch"}, | |
{IF, "if"}, | |
{ELSE, "else"}, | |
{FOR, "for"}, | |
{WHILE, "while"}, | |
{DO, "do"}, | |
{PLUS, "+"}, | |
{MINUS, "-"}, | |
{ASTERISK, "*"}, | |
{SLASH, "/"}, | |
{MODULO, "%"}, | |
{INCREMENT, "++"}, | |
{DECREMENT, "--"}, | |
{LOGICAL_NOT, "!"}, | |
{SHIFT_LEFT, "<<"}, | |
{SHIFT_RIGHT, ">>"}, | |
{LESS, "<"}, | |
{GREATER, ">"}, | |
{LESS_EQUAL, "<="}, | |
{GREATER_EQUAL, ">="}, | |
{EQUALITY, "=="}, | |
{NO_EQUALITY, "!="}, | |
{LOGICAL_NOT, "!"}, | |
{LOGICAL_AND, "&&"}, | |
{LOGICAL_OR, "||"}, | |
{XOR, "^"}, | |
{NOT, "~"}, | |
{AND, "&"}, | |
{OR, "|"}, | |
{QUESTION, "?"}, | |
{COLON, ":"}, | |
{DOUBLE_COLON, "::"}, | |
{COLONASSIGN, ":="}, | |
{ASSIGN, "="}, | |
{MUL_ASSIGN, "*="}, | |
{DIV_ASSIGN, "/="}, | |
{MOD_ASSIGN, "%="}, | |
{ADD_ASSIGN, "+="}, | |
{SUB_ASSIGN, "-="}, | |
{LSHIFT_ASSIGN, "<<="}, | |
{RSHIFT_ASSIGN, ">>="}, | |
{AND_ASSIGN, "&="}, | |
{XOR_ASSIGN, "^="}, | |
{OR_ASSIGN, "|="}, | |
{DOT, "."}, | |
{DOUBLE_DOT, ".."}, | |
{ARROW, "->"}, | |
{SEMICOLON, ";"}, | |
{LPAREN, "{"}, | |
{RPAREN, "}"}, | |
{LSQUARE, "["}, | |
{RSQUARE, "]"}, | |
{LBRACKET, "{"}, | |
{RBRACKET, "}"}, | |
{COMMA, ","}, | |
{NOINIT, "---"} | |
}; | |
struct alpha_length_compare { | |
bool operator()(const ct_pair& lhs, const ct_pair& rhs) { | |
return lhs.second.length() > rhs.second.length(); | |
} | |
}; | |
// size macros
//
// Value ranges of the fixed-width integer types. Negative values are
// parenthesised so they stay intact inside larger expressions
// (`x - SIGNED8_MIN` would otherwise expand to `x --128`).
#define SIGNED8_MIN (-128)
#define SIGNED8_MAX 127
#define SIGNED16_MIN (-32768)  // was 32768 (sign missing)
#define SIGNED16_MAX 32767     // was -32768
// Written as MAX-1 forms: the magnitude of the minimum does not fit the signed
// type, so `-2147483648` / `-9223372036854775808` are a negated *larger or
// unsigned* literal rather than the intended value on some compilers.
#define SIGNED32_MIN (-2147483647 - 1)
#define SIGNED32_MAX 2147483647
#define SIGNED64_MIN (-9223372036854775807LL - 1)
#define SIGNED64_MAX 9223372036854775807LL
#define UNSIGNED_MIN 0
#define UNSIGNED8_MAX 255
#define UNSIGNED16_MAX 65535
#define UNSIGNED32_MAX 4294967295
#define UNSIGNED64_MAX 18446744073709551615ULL  // needs ULL: no signed type can hold it
// NOTE(review): presumably significant-digit thresholds for choosing f32 vs
// f64; not used in the visible code, so confirm the intended meaning.
#define FLOAT32_DIGIT_MIN 0
#define FLOAT64_DIGIT_MIN 7
// Token kinds listed as type specifiers; IDENTIFIER is part of the list.
const Compiler_Token Type_Specifiers[] = {
    VOID,      BOOL,
    SIGNED8,   SIGNED16,   SIGNED32,   SIGNED64,
    UNSIGNED8, UNSIGNED16, UNSIGNED32, UNSIGNED64,
    FLOAT32,   FLOAT64,
    STRING,    IDENTIFIER
};

// Characters accepted inside an identifier: ASCII letters, digits, underscore.
const std::string identifier_characters =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "1234567890"
    "_";
struct Token { | |
Compiler_Token type; | |
string value; | |
int line; | |
int character; | |
Token(Compiler_Token type, string value, int line, int character) | |
: type(type), value(value), line(line), character(character) {} | |
~Token() {} | |
}; | |
string to_string(Token& token) { | |
return string("Token(") + to_string((int)token.type) + | |
", " + token.value + ") @ " + to_string(token.line) + ":" + to_string(token.character); | |
} | |
// True when `id` is one of the characters in identifier_characters.
bool isidchar(char id) {
    return identifier_characters.find(id) != std::string::npos;
}
struct Lexer { | |
string unit; | |
int position; | |
char current_char; | |
int line_index; | |
int char_index; | |
Lexer(string unit) | |
: unit(unit), position(0), current_char(unit[0]), line_index(0), char_index(0) { | |
sort(token_map.begin(), token_map.end(), alpha_length_compare()); | |
} | |
~Lexer() {} | |
void error(string message); | |
vector<Token> tokenize(); | |
void advance(int index); | |
char peek(int index); | |
void skip_whitespace(); | |
void skip_comment(); | |
void skip_line(); | |
Token get_number_constant(); | |
Token get_string_constant(); | |
Token get_char_constant(); | |
Token get_identifier(); | |
Token get_next_token(); | |
}; | |
void Lexer::error(string message) { | |
cout << "| " + to_string(this->line_index) + ":" + to_string(this->char_index) + " error: " + message; | |
} | |
vector<Token> Lexer::tokenize() { | |
Token token = this->get_next_token(); | |
vector<Token> tokens = { token }; | |
while (token.type != T_EOF) { | |
token = this->get_next_token(); | |
tokens.push_back(token); | |
} | |
return tokens; | |
} | |
// Moves the cursor by `index` characters (default 1) and refreshes
// current_char. Once the cursor is outside the unit, current_char becomes '\0',
// which the rest of the lexer treats as end of input.
void Lexer::advance(int index = 1) {
    this->position += index;
    // The old check set '\0' and then unconditionally re-read unit[position],
    // indexing past the end; `length() - 1` also wrapped for an empty unit.
    const bool in_range =
        this->position >= 0 &&
        static_cast<std::size_t>(this->position) < this->unit.length();
    this->current_char = in_range ? this->unit[this->position] : '\0';
}
char Lexer::peek(int index = 1) { | |
int peek_pos = this->position + index; | |
if (peek_pos > this->unit.length() - 1) { | |
return '\0'; | |
} | |
return this->unit[peek_pos]; | |
} | |
void Lexer::skip_whitespace() { | |
while (this->current_char != '\0' && isspace(this->current_char)) | |
this->advance(); | |
} | |
// Skips forward to, and past, the next "*/" terminator.
// NOTE(review): assumes the caller has already consumed the opening "/*";
// if it has not, an input such as "/*/" would be taken as already closed.
void Lexer::skip_comment() {
    // Stop only on the two-character sequence "*/". The previous condition
    // (`current != '*' && peek() != '/'`) stopped at any '*' and at any
    // character that merely preceded a '/'.
    while (this->current_char != '\0' &&
           !(this->current_char == '*' && this->peek() == '/')) {
        this->advance();
    }
    // Consume the terminator only when one was found, so an unterminated
    // comment leaves the cursor at end of input instead of beyond it.
    if (this->current_char != '\0') {
        this->advance(2);
    }
}
// Skips the rest of the current line, including its terminating '\n'.
void Lexer::skip_line() {
    while (this->current_char != '\0' && this->current_char != '\n') {
        this->advance();
    }
    // Only step over a newline that is actually there; at end of input the
    // extra advance used to push the cursor past the end of the unit.
    if (this->current_char == '\n') {
        this->advance();
    }
}
// Builds a Token stamped with the lexer's current line/character indices.
// Only usable inside Lexer member functions (it expands to uses of `this`).
#define l_token(a, b) Token(a, b, this->line_index, this->char_index)

// Scans a numeric literal starting at the current character.
//   0b<binary digits>      -> BINARY_CONSTANT  (value keeps the "0b" prefix)
//   0x<hex digits>         -> HEX_CONSTANT     (value keeps the "0x" prefix)
//   <digits>.<digits>      -> FLOAT_CONSTANT   (fraction digits may be empty)
//   <digits>               -> INTEGER_CONSTANT
// A prefix is honoured only when at least one valid digit follows it;
// otherwise the leading '0' is lexed as a plain decimal integer.
Token Lexer::get_number_constant() {
    std::string result;

    if (this->current_char == '0') {
        const char marker = this->peek();
        const unsigned char first_digit = static_cast<unsigned char>(this->peek(2));

        // Both branches used to test 'x' (the second was unreachable), never
        // consumed the prefix letter, and scanned with isdigit, so "0x1F"
        // produced a BINARY_CONSTANT "0".
        if (marker == 'b' && (first_digit == '0' || first_digit == '1')) {
            result += "0b";
            this->advance(2);
            while (this->current_char == '0' || this->current_char == '1') {
                result += this->current_char;
                this->advance();
            }
            return l_token(BINARY_CONSTANT, result);
        }
        if (marker == 'x' && std::isxdigit(first_digit)) {
            result += "0x";
            this->advance(2);
            while (std::isxdigit(static_cast<unsigned char>(this->current_char))) {
                result += this->current_char;
                this->advance();
            }
            return l_token(HEX_CONSTANT, result);
        }
    }

    // Integer part.
    while (std::isdigit(static_cast<unsigned char>(this->current_char))) {
        result += this->current_char;
        this->advance();
    }

    // A '.' starts a fraction unless it is the first half of the ".." token
    // (DOUBLE_DOT), so "1..5" lexes as 1, "..", 5 rather than "1." and ".5".
    if (this->current_char == '.' && this->peek() != '.') {
        result += this->current_char;
        this->advance();
        while (std::isdigit(static_cast<unsigned char>(this->current_char))) {
            result += this->current_char;
            this->advance();
        }
        return l_token(FLOAT_CONSTANT, result);
    }

    return l_token(INTEGER_CONSTANT, result);
}
// Scans a double-quoted string literal; the current character must be the
// opening '"'. The token value holds the raw source text including both
// quotes; escape sequences are kept verbatim, not decoded.
Token Lexer::get_string_constant() {
    // `"" + current_char` was pointer arithmetic on the literal's address, not
    // string concatenation; build a one-character string explicitly.
    std::string result(1, this->current_char);
    this->advance();

    while (this->current_char != '\0' && this->current_char != '"') {
        // Keep a backslash together with the character it escapes so that an
        // escaped quote (\") does not terminate the literal.
        if (this->current_char == '\\' && this->peek() != '\0') {
            result += this->current_char;
            this->advance();
        }
        result += this->current_char;
        this->advance();
    }

    if (this->current_char == '"') {
        result += this->current_char;
        this->advance();
    } else {
        // End of input: report it instead of appending a NUL byte and
        // advancing past the end of the unit.
        this->error("unterminated string constant");
    }
    return l_token(STRING_CONSTANT, result);
}
// Scans a single-quoted character literal; the current character must be the
// opening '\''. The token value holds the raw source text including both
// quotes; escape sequences are kept verbatim, not decoded.
Token Lexer::get_char_constant() {
    // `"" + current_char` was pointer arithmetic on the literal's address, not
    // string concatenation; build a one-character string explicitly.
    std::string result(1, this->current_char);
    this->advance();

    while (this->current_char != '\0' && this->current_char != '\'') {
        // Keep a backslash together with the character it escapes so that an
        // escaped quote (\') does not terminate the literal.
        if (this->current_char == '\\' && this->peek() != '\0') {
            result += this->current_char;
            this->advance();
        }
        result += this->current_char;
        this->advance();
    }

    if (this->current_char == '\'') {
        result += this->current_char;
        this->advance();
    } else {
        // End of input: report it instead of appending a NUL byte and
        // advancing past the end of the unit.
        this->error("unterminated character constant");
    }
    return l_token(CHAR_CONSTANT, result);
}
// Consumes a run of identifier characters and classifies it: if the text
// equals a spelling in token_map the matching kind is returned, otherwise the
// token is an IDENTIFIER.
Token Lexer::get_identifier() {
    std::string word;
    for (; this->current_char != '\0' && isidchar(this->current_char); this->advance()) {
        word += this->current_char;
    }

    for (const ct_pair& entry : token_map) {
        if (entry.second == word) {
            return l_token(entry.first, word);
        }
    }
    return l_token(IDENTIFIER, word);
}
// Returns the next token from the input, or a T_EOF token (empty value) once
// the input is exhausted.
//
// Dispatch order: whitespace and comments are skipped; then numeric, string
// and character literals; then identifiers/keywords; finally operators and
// punctuation by longest match against token_map. A character that starts no
// token is reported through error() and skipped so lexing always makes
// progress.
//
// The previous body was an unfinished stub: it compared the member function
// `peek` (not a call) with '\0', never returned a value, and looped forever
// on the first non-whitespace character.
Token Lexer::get_next_token() {
    while (this->current_char != '\0') {
        const char c = this->current_char;
        // <cctype> classifiers require a value representable as unsigned char.
        const unsigned char uc = static_cast<unsigned char>(c);

        if (std::isspace(uc)) {
            this->skip_whitespace();
            continue;
        }

        // Comments must be recognised before '/' is considered as an operator.
        if (c == '/' && this->peek() == '/') {
            this->skip_line();
            continue;
        }
        if (c == '/' && this->peek() == '*') {
            this->advance(2);  // step over the opening "/*"
            this->skip_comment();
            continue;
        }

        // Digits are tested before isidchar because identifier_characters
        // contains the digits as well.
        if (std::isdigit(uc)) {
            return this->get_number_constant();
        }
        if (c == '"') {
            return this->get_string_constant();
        }
        if (c == '\'') {
            return this->get_char_constant();
        }
        if (isidchar(c)) {
            return this->get_identifier();
        }

        // Operators / punctuation: choose the longest spelling that matches at
        // the cursor ("<<=" over "<<" over "<"). Scanning for the maximum keeps
        // this correct whatever order token_map happens to be in.
        const ct_pair* best = nullptr;
        for (const ct_pair& entry : token_map) {
            const std::string& spelling = entry.second;
            if (spelling.empty() || isidchar(spelling[0])) {
                continue;  // keyword entries are handled by get_identifier()
            }
            if (best != nullptr && spelling.length() <= best->second.length()) {
                continue;
            }
            if (this->unit.compare(static_cast<std::size_t>(this->position),
                                   spelling.length(), spelling) == 0) {
                best = &entry;
            }
        }
        if (best != nullptr) {
            this->advance(static_cast<int>(best->second.length()));
            return Token(best->first, best->second, this->line_index, this->char_index);
        }

        this->error(std::string("unexpected character '") + c + "'");
        this->advance();
    }
    return Token(T_EOF, "", this->line_index, this->char_index);
}
#undef l_token
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment