Skip to content

Instantly share code, notes, and snippets.

@dresswithpockets
Created October 22, 2016 00:08
Show Gist options
  • Save dresswithpockets/ac382e5c9a06d3a583b8554e43252de2 to your computer and use it in GitHub Desktop.
Save dresswithpockets/ac382e5c9a06d3a583b8554e43252de2 to your computer and use it in GitHub Desktop.
stretch lexer
#include "stdafx.h"
#include <cctype>
#include <utility>
using namespace std;
// Token categories used by the lexer.
//
// Only the first four enumerators have explicit values; every later one takes
// the previous value plus one, so inserting or reordering entries renumbers
// everything after it. to_string(Token&) prints the enumerator as an integer,
// so such a change is visible in that output.
enum Compiler_Token {
// End-of-input marker, identifiers and literal constants.
T_EOF = 0x0, // don't wanna use 'EOF' since some headers have a #define for 'EOF'
IDENTIFIER = 0x1,
FLOAT_CONSTANT = 0x2,
INTEGER_CONSTANT = 0x3,
BINARY_CONSTANT,
HEX_CONSTANT,
STRING_CONSTANT,
CHAR_CONSTANT,
// Keywords (their source spellings are listed in token_map).
PUBLIC,
PRIVATE,
CASE,
DEFAULT,
CONTINUE,
BREAK,
RETURN,
ANY,
VOID,
BOOL,
SIGNED8,
SIGNED16,
SIGNED32,
SIGNED64,
UNSIGNED8,
UNSIGNED16,
UNSIGNED32,
UNSIGNED64,
FLOAT32,
FLOAT64,
STRING,
T_NULL, // prefixed with T_ as `NULL` is #defined
TRUE,
FALSE,
STRUCT,
ENUM,
ALIAS,
FOREIGN,
USING,
NEW,
DELETE,
DEFER,
SWITCH,
IF,
ELSE,
FOR,
WHILE,
DO,
// Operators and punctuation (their source spellings are listed in token_map).
PLUS,
MINUS,
ASTERISK,
SLASH,
MODULO,
INCREMENT,
DECREMENT,
SHIFT_LEFT,
SHIFT_RIGHT,
LESS,
GREATER,
LESS_EQUAL,
GREATER_EQUAL,
EQUALITY,
NO_EQUALITY,
LOGICAL_NOT,
LOGICAL_AND,
LOGICAL_OR,
XOR,
NOT,
AND,
OR,
QUESTION,
COLON,
DOUBLE_COLON,
COLONASSIGN,
ASSIGN,
MUL_ASSIGN,
DIV_ASSIGN,
MOD_ASSIGN,
ADD_ASSIGN,
SUB_ASSIGN,
LSHIFT_ASSIGN,
RSHIFT_ASSIGN,
AND_ASSIGN,
XOR_ASSIGN,
OR_ASSIGN,
DOT,
DOUBLE_DOT,
ARROW,
SEMICOLON,
LPAREN,
RPAREN,
LSQUARE,
RSQUARE,
LBRACKET,
RBRACKET,
COMMA,
NOINIT
};
typedef pair<Compiler_Token, string> ct_pair;
vector<ct_pair> token_map = {
{PUBLIC, "public"},
{PRIVATE, "private"},
{CASE, "case"},
{DEFAULT, "default"},
{CONTINUE, "continue"},
{BREAK, "break"},
{RETURN, "return"},
{ANY, "any"},
{VOID, "void"},
{BOOL, "bool"},
{SIGNED8, "s8"},
{SIGNED16, "s16"},
{SIGNED32, "s32"},
{SIGNED64, "s64"},
{UNSIGNED8, "u8"},
{UNSIGNED16, "u16"},
{UNSIGNED32, "u32"},
{UNSIGNED64, "u64"},
{FLOAT32, "f32"},
{FLOAT64, "f64"},
{STRING, "string"},
{T_NULL, "null"},
{TRUE, "true"},
{FALSE, "false"},
{STRUCT, "struct"},
{ENUM, "enum"},
{ALIAS, "alias"},
{FOREIGN, "foreign"},
{USING, "using"},
{NEW, "new"},
{DELETE, "delete"},
{DEFER, "defer"},
{SWITCH, "switch"},
{IF, "if"},
{ELSE, "else"},
{FOR, "for"},
{WHILE, "while"},
{DO, "do"},
{PLUS, "+"},
{MINUS, "-"},
{ASTERISK, "*"},
{SLASH, "/"},
{MODULO, "%"},
{INCREMENT, "++"},
{DECREMENT, "--"},
{LOGICAL_NOT, "!"},
{SHIFT_LEFT, "<<"},
{SHIFT_RIGHT, ">>"},
{LESS, "<"},
{GREATER, ">"},
{LESS_EQUAL, "<="},
{GREATER_EQUAL, ">="},
{EQUALITY, "=="},
{NO_EQUALITY, "!="},
{LOGICAL_NOT, "!"},
{LOGICAL_AND, "&&"},
{LOGICAL_OR, "||"},
{XOR, "^"},
{NOT, "~"},
{AND, "&"},
{OR, "|"},
{QUESTION, "?"},
{COLON, ":"},
{DOUBLE_COLON, "::"},
{COLONASSIGN, ":="},
{ASSIGN, "="},
{MUL_ASSIGN, "*="},
{DIV_ASSIGN, "/="},
{MOD_ASSIGN, "%="},
{ADD_ASSIGN, "+="},
{SUB_ASSIGN, "-="},
{LSHIFT_ASSIGN, "<<="},
{RSHIFT_ASSIGN, ">>="},
{AND_ASSIGN, "&="},
{XOR_ASSIGN, "^="},
{OR_ASSIGN, "|="},
{DOT, "."},
{DOUBLE_DOT, ".."},
{ARROW, "->"},
{SEMICOLON, ";"},
{LPAREN, "{"},
{RPAREN, "}"},
{LSQUARE, "["},
{RSQUARE, "]"},
{LBRACKET, "{"},
{RBRACKET, "}"},
{COMMA, ","},
{NOINIT, "---"}
};
// Ordering predicate for token_map entries: an entry sorts earlier when its
// spelling is longer. Despite the name there is no alphabetical tie-break;
// entries with equally long spellings compare as equivalent.
struct alpha_length_compare {
    // const-qualified so the comparator can also be invoked through a const
    // object (e.g. when stored as the comparator of an ordered container).
    bool operator()(const ct_pair& lhs, const ct_pair& rhs) const {
        return lhs.second.length() > rhs.second.length();
    }
};
// size macros
//
// Inclusive value ranges of the fixed-width integer types. Negative values are
// parenthesized so they expand safely inside larger expressions.
#define SIGNED8_MIN (-128)
#define SIGNED8_MAX 127
#define SIGNED16_MIN (-32768)
#define SIGNED16_MAX 32767
// The most negative values are written as (-MAX - 1): "-N" is unary minus
// applied to the positive literal N, and N itself does not fit in the signed
// type of that width.
#define SIGNED32_MIN (-2147483647 - 1)
#define SIGNED32_MAX 2147483647
#define SIGNED64_MIN (-9223372036854775807LL - 1)
#define SIGNED64_MAX 9223372036854775807LL
#define UNSIGNED_MIN 0
#define UNSIGNED8_MAX 255
#define UNSIGNED16_MAX 65535
#define UNSIGNED32_MAX 4294967295
// Needs the ULL suffix: an unsuffixed decimal literal must fit a signed type.
#define UNSIGNED64_MAX 18446744073709551615ULL
// NOTE(review): presumably digit-count thresholds for choosing f32 vs f64;
// nothing in this file uses them - confirm the intended meaning.
#define FLOAT32_DIGIT_MIN 0
#define FLOAT64_DIGIT_MIN 7
// Token types listed as type specifiers: the built-in type keywords plus
// IDENTIFIER.
const Compiler_Token Type_Specifiers[] = {
    VOID,
    BOOL,
    SIGNED8,
    SIGNED16,
    SIGNED32,
    SIGNED64,
    UNSIGNED8,
    UNSIGNED16,
    UNSIGNED32,
    UNSIGNED64,
    FLOAT32,
    FLOAT64,
    STRING,
    IDENTIFIER
};
// The set of characters isidchar() accepts: upper- and lower-case ASCII
// letters, the decimal digits and the underscore. Adjacent literals are
// concatenated by the compiler into one string.
const string identifier_characters =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "1234567890"
    "_";
// One lexical token: its category, the text stored for it and the position
// values supplied by whoever created it.
//
// No destructor or copy/move members are declared on purpose: the previous
// empty user-declared destructor suppressed the implicit move operations, so
// every Token placed into a vector had its string copied.
struct Token {
    Compiler_Token type;  // token category
    string value;         // text associated with the token
    int line;             // line value supplied at construction
    int character;        // character (column) value supplied at construction

    // `value` is taken by value and moved into the member, so callers passing
    // a temporary pay for no copy at all.
    Token(Compiler_Token type, string value, int line, int character)
        : type(type), value(std::move(value)), line(line), character(character) {}
};
string to_string(Token& token) {
return string("Token(") + to_string((int)token.type) +
", " + token.value + ") @ " + to_string(token.line) + ":" + to_string(token.character);
}
// Returns true when `id` is one of the characters in identifier_characters.
//
// Uses string::find instead of a hand-written index loop (which compared a
// signed int against the unsigned length). A '\0' argument is never found, as
// before.
bool isidchar(char id) {
    return identifier_characters.find(id) != string::npos;
}
// Cursor-based lexer over a single source string.
//
// No destructor is declared on purpose: the previous empty user-declared
// destructor suppressed the implicit move operations.
struct Lexer {
    string unit;        // complete source text being tokenized
    int position;       // index into `unit` of the character under the cursor
    char current_char;  // character under the cursor; '\0' signals end of input
    int line_index;     // line value reported by error() and stamped on tokens (starts at 0)
    int char_index;     // character value reported by error() and stamped on tokens (starts at 0)

    // Takes ownership of the source text and places the cursor on its first
    // character ('\0' for empty input). Also re-sorts the global token_map so
    // that longer spellings come first.
    Lexer(string unit)
        : unit(std::move(unit)), position(0), current_char('\0'), line_index(0), char_index(0) {
        // Read from the member, not the parameter: the parameter was moved from.
        if (!this->unit.empty())
            this->current_char = this->unit[0];
        sort(token_map.begin(), token_map.end(), alpha_length_compare());
    }

    void error(string message);
    vector<Token> tokenize();
    void advance(int index);
    char peek(int index);
    void skip_whitespace();
    void skip_comment();
    void skip_line();
    Token get_number_constant();
    Token get_string_constant();
    Token get_char_constant();
    Token get_identifier();
    Token get_next_token();
};
// Writes "| <line_index>:<char_index> error: <message>" to standard output.
// No trailing newline is emitted.
void Lexer::error(string message) {
    string report = "| ";
    report += to_string(this->line_index);
    report += ":";
    report += to_string(this->char_index);
    report += " error: ";
    report += message;
    cout << report;
}
vector<Token> Lexer::tokenize() {
Token token = this->get_next_token();
vector<Token> tokens = { token };
while (token.type != T_EOF) {
token = this->get_next_token();
tokens.push_back(token);
}
return tokens;
}
// Moves the cursor by `index` characters (default 1) and refreshes
// current_char. When the new position lies outside the source text,
// current_char becomes '\0', which the rest of the lexer treats as end of
// input.
//
// The bounds check is done without `length() - 1` (which wraps around for an
// empty string), and the function returns right after setting '\0' instead of
// falling through to an out-of-range read.
// NOTE(review): line_index / char_index are not updated here or anywhere else
// visible in this file - confirm where position tracking is meant to happen.
void Lexer::advance(int index = 1) {
    this->position += index;
    if (this->position < 0 ||
        static_cast<string::size_type>(this->position) >= this->unit.length()) {
        this->current_char = '\0';
        return;
    }
    this->current_char = this->unit[this->position];
}
// Returns the character `index` positions away from the cursor (default: the
// next one) without moving the cursor, or '\0' when that position lies
// outside the source text.
//
// The bounds check avoids `length() - 1`, which wraps around for an empty
// string, and also rejects negative positions.
char Lexer::peek(int index = 1) {
    const int peek_pos = this->position + index;
    if (peek_pos < 0 ||
        static_cast<string::size_type>(peek_pos) >= this->unit.length()) {
        return '\0';
    }
    return this->unit[peek_pos];
}
// Advances the cursor past consecutive whitespace characters, stopping at the
// first non-whitespace character or at end of input.
void Lexer::skip_whitespace() {
    // isspace() requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behaviour, hence the cast.
    while (this->current_char != '\0' &&
           isspace(static_cast<unsigned char>(this->current_char))) {
        this->advance();
    }
}
// Advances the cursor to just past the next "*/" sequence, or to end of input
// when there is none.
// NOTE(review): presumably called with the cursor already inside a block
// comment; no caller is visible in this file - confirm.
void Lexer::skip_comment() {
    // Keep going until the cursor sits on '*' AND the next character is '/'.
    // The previous condition joined the two tests with &&, so it stopped on
    // any lone '*' or whenever the next character happened to be '/'.
    while (this->current_char != '\0' &&
           !(this->current_char == '*' && this->peek() == '/')) {
        this->advance();
    }
    // Only consume the terminator if one was found; at end of input there is
    // nothing left to skip.
    if (this->current_char != '\0')
        this->advance(2);
}
// Advances the cursor to the start of the next line: everything up to and
// including the next '\n' is skipped. If the input ends first, the cursor
// stays at end of input.
void Lexer::skip_line() {
    while (this->current_char != '\0' && this->current_char != '\n')
        this->advance();
    // Consume the newline itself, but do not step past the end of the input
    // when the last line has no trailing newline.
    if (this->current_char == '\n')
        this->advance();
}
// Builds a Token of type `a` with text `b`, stamped with the lexer's current
// line_index and char_index. The expansion uses `this`, so the macro is only
// usable inside Lexer member functions; it is #undef'd after the last of them.
#define l_token(a, b) Token(a, b, this->line_index, this->char_index)
// Lexes a numeric literal starting at the cursor and returns its token:
//   "0b" followed by binary digits -> BINARY_CONSTANT  (value keeps the "0b")
//   "0x" followed by hex digits    -> HEX_CONSTANT     (value keeps the "0x")
//   digits '.' digits              -> FLOAT_CONSTANT
//   digits                         -> INTEGER_CONSTANT
//
// Fixes over the previous version: the binary branch tested for 'x' (making
// the hex branch unreachable), neither branch consumed the prefix or accepted
// anything but decimal digits, and a number followed by ".." was swallowed as
// a float even though ".." is its own token (DOUBLE_DOT).
Token Lexer::get_number_constant() {
    string result;
    // Appends the character under the cursor to `result` and moves on.
    auto take = [this, &result]() {
        result += this->current_char;
        this->advance();
    };

    if (this->current_char == '0') {
        const char prefix = this->peek();
        const char first_digit = this->peek(2);
        // A prefix only counts when at least one valid digit follows it;
        // otherwise the '0' is lexed as a plain decimal integer below.
        if (prefix == 'b' && (first_digit == '0' || first_digit == '1')) {
            take();  // '0'
            take();  // 'b'
            while (this->current_char == '0' || this->current_char == '1')
                take();
            return l_token(BINARY_CONSTANT, result);
        }
        if (prefix == 'x' && isxdigit(static_cast<unsigned char>(first_digit))) {
            take();  // '0'
            take();  // 'x'
            while (this->current_char != '\0' &&
                   isxdigit(static_cast<unsigned char>(this->current_char))) {
                take();
            }
            return l_token(HEX_CONSTANT, result);
        }
    }

    // The <cctype> classifiers need an unsigned char value, hence the casts.
    while (this->current_char != '\0' &&
           isdigit(static_cast<unsigned char>(this->current_char))) {
        take();
    }

    // A single '.' starts the fractional part; ".." is left for DOUBLE_DOT.
    if (this->current_char == '.' && this->peek() != '.') {
        take();
        while (this->current_char != '\0' &&
               isdigit(static_cast<unsigned char>(this->current_char))) {
            take();
        }
        return l_token(FLOAT_CONSTANT, result);
    }
    return l_token(INTEGER_CONSTANT, result);
}
// Lexes a string literal. The character under the cursor on entry is taken as
// the opening delimiter; everything up to the next '"' is collected. The
// returned STRING_CONSTANT value includes the delimiters.
// NOTE(review): escape sequences such as \" are not interpreted - confirm
// whether the language needs them.
Token Lexer::get_string_constant() {
    // Construct from the character itself: `"" + char` is pointer arithmetic
    // on the literal's address, not string concatenation.
    string result(1, this->current_char);
    this->advance();
    while (this->current_char != '\0' && this->current_char != '"') {
        result += this->current_char;
        this->advance();
    }
    // Append and consume the closing quote only if it is really there; an
    // unterminated literal ends at end of input without a stray '\0'.
    if (this->current_char == '"') {
        result += this->current_char;
        this->advance();
    }
    return l_token(STRING_CONSTANT, result);
}
// Lexes a character literal. The character under the cursor on entry is taken
// as the opening delimiter; everything up to the next '\'' is collected. The
// returned CHAR_CONSTANT value includes the delimiters.
// NOTE(review): escape sequences such as \' are not interpreted, and the
// number of characters between the quotes is not checked - confirm intent.
Token Lexer::get_char_constant() {
    // Construct from the character itself: `"" + char` is pointer arithmetic
    // on the literal's address, not string concatenation.
    string result(1, this->current_char);
    this->advance();
    while (this->current_char != '\0' && this->current_char != '\'') {
        result += this->current_char;
        this->advance();
    }
    // Append and consume the closing quote only if it is really there; an
    // unterminated literal ends at end of input without a stray '\0'.
    if (this->current_char == '\'') {
        result += this->current_char;
        this->advance();
    }
    return l_token(CHAR_CONSTANT, result);
}
// Reads the run of identifier characters at the cursor. If the resulting word
// is spelled exactly like an entry of token_map, that entry's token type is
// returned; otherwise the word is an IDENTIFIER.
Token Lexer::get_identifier() {
    string word;
    for (; this->current_char != '\0' && isidchar(this->current_char); this->advance())
        word.push_back(this->current_char);

    for (const ct_pair& entry : token_map) {
        if (entry.second == word)
            return l_token(entry.first, word);
    }
    return l_token(IDENTIFIER, word);
}
// Returns the next token of the source text, or a T_EOF token (with empty
// value) once the input is exhausted.
//
// The previous body compared the member function `peek` itself against '\0'
// (which does not compile), never advanced on non-whitespace input and had no
// return statement. This version dispatches to the existing helpers.
// NOTE(review): "//" and "/* */" as the comment syntax is inferred from
// skip_line() and skip_comment() - confirm against the language definition.
Token Lexer::get_next_token() {
    while (this->current_char != '\0') {
        // The <cctype> classifiers need an unsigned char value.
        const unsigned char current = static_cast<unsigned char>(this->current_char);

        if (isspace(current)) {
            this->skip_whitespace();
            continue;
        }
        if (this->current_char == '/' && this->peek() == '/') {
            this->skip_line();
            continue;
        }
        if (this->current_char == '/' && this->peek() == '*') {
            this->advance(2);  // step over the opening "/*"
            this->skip_comment();
            continue;
        }
        // Digits are tested before identifier characters because
        // identifier_characters also contains the digits.
        if (isdigit(current))
            return this->get_number_constant();
        if (this->current_char == '"')
            return this->get_string_constant();
        if (this->current_char == '\'')
            return this->get_char_constant();
        if (isidchar(this->current_char))
            return this->get_identifier();

        // Operators and punctuation: take the first table entry whose spelling
        // matches the text at the cursor. The Lexer constructor sorts
        // token_map longest-first, so e.g. "<<=" wins over "<<" and "<".
        for (const ct_pair& entry : token_map) {
            const string& spelling = entry.second;
            if (this->unit.compare(this->position, spelling.length(), spelling) == 0) {
                this->advance(static_cast<int>(spelling.length()));
                return l_token(entry.first, spelling);
            }
        }

        // Nothing matched: report the character and skip it so that lexing
        // always makes progress.
        this->error(string("unexpected character '") + this->current_char + "'");
        this->advance();
    }
    return l_token(T_EOF, "");
}
#undef l_token
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment