dresswithpockets · October 22, 2016 00:08
diff --git a/lexer.h b/lexer.h

 #include "stdafx.h"

 using namespace std;

 // Every token kind the lexer can emit. Enumerators take consecutive values
 // starting at zero, in the order listed here.
 enum Compiler_Token {
 	// End of input; called T_EOF because some headers have a #define for 'EOF'.
 	T_EOF,
 	IDENTIFIER,

 	// Literal constants.
 	FLOAT_CONSTANT, INTEGER_CONSTANT, BINARY_CONSTANT, HEX_CONSTANT,
 	STRING_CONSTANT, CHAR_CONSTANT,

 	// Visibility keywords.
 	PUBLIC, PRIVATE,

 	// Label and jump keywords.
 	CASE, DEFAULT, CONTINUE, BREAK, RETURN,

 	// Built-in type keywords.
 	ANY, VOID, BOOL,
 	SIGNED8, SIGNED16, SIGNED32, SIGNED64,
 	UNSIGNED8, UNSIGNED16, UNSIGNED32, UNSIGNED64,
 	FLOAT32, FLOAT64, STRING,

 	// Literal keywords; T_NULL is prefixed with T_ as `NULL` is #defined.
 	T_NULL, TRUE, FALSE,

 	// Declaration keywords.
 	STRUCT, ENUM, ALIAS, FOREIGN,

 	USING, NEW, DELETE, DEFER,

 	// Control-flow keywords.
 	SWITCH, IF, ELSE, FOR, WHILE, DO,

 	// Arithmetic, shift, comparison, logical and bitwise operators.
 	PLUS, MINUS, ASTERISK, SLASH, MODULO,
 	INCREMENT, DECREMENT,
 	SHIFT_LEFT, SHIFT_RIGHT,
 	LESS, GREATER, LESS_EQUAL, GREATER_EQUAL, EQUALITY, NO_EQUALITY,
 	LOGICAL_NOT, LOGICAL_AND, LOGICAL_OR,
 	XOR, NOT, AND, OR,
 	QUESTION, COLON, DOUBLE_COLON,

 	// Assignment operators.
 	COLONASSIGN, ASSIGN,
 	MUL_ASSIGN, DIV_ASSIGN, MOD_ASSIGN, ADD_ASSIGN, SUB_ASSIGN,
 	LSHIFT_ASSIGN, RSHIFT_ASSIGN, AND_ASSIGN, XOR_ASSIGN, OR_ASSIGN,

 	// Punctuation.
 	DOT, DOUBLE_DOT, ARROW, SEMICOLON,
 	LPAREN, RPAREN, LSQUARE, RSQUARE, LBRACKET, RBRACKET,
 	COMMA, NOINIT
 };

 // A token kind paired with its exact source spelling.
 typedef pair<Compiler_Token, string> ct_pair;

 // Spelling table for every keyword, operator and punctuator.
 // Each token kind appears exactly once and no two kinds share a spelling;
 // the Lexer constructor re-sorts this table in place, so it cannot be const.
 vector<ct_pair> token_map = {
 	// visibility
 	{PUBLIC, "public"}, {PRIVATE, "private"},

 	// labels and jumps
 	{CASE, "case"}, {DEFAULT, "default"}, {CONTINUE, "continue"},
 	{BREAK, "break"}, {RETURN, "return"},

 	// built-in types
 	{ANY, "any"}, {VOID, "void"}, {BOOL, "bool"},
 	{SIGNED8, "s8"}, {SIGNED16, "s16"}, {SIGNED32, "s32"}, {SIGNED64, "s64"},
 	{UNSIGNED8, "u8"}, {UNSIGNED16, "u16"}, {UNSIGNED32, "u32"}, {UNSIGNED64, "u64"},
 	{FLOAT32, "f32"}, {FLOAT64, "f64"}, {STRING, "string"},

 	// literal keywords
 	{T_NULL, "null"}, {TRUE, "true"}, {FALSE, "false"},

 	// declarations
 	{STRUCT, "struct"}, {ENUM, "enum"}, {ALIAS, "alias"}, {FOREIGN, "foreign"},

 	{USING, "using"}, {NEW, "new"}, {DELETE, "delete"}, {DEFER, "defer"},

 	// control flow
 	{SWITCH, "switch"}, {IF, "if"}, {ELSE, "else"},
 	{FOR, "for"}, {WHILE, "while"}, {DO, "do"},

 	// operators
 	{PLUS, "+"}, {MINUS, "-"}, {ASTERISK, "*"}, {SLASH, "/"}, {MODULO, "%"},
 	{INCREMENT, "++"}, {DECREMENT, "--"},
 	{SHIFT_LEFT, "<<"}, {SHIFT_RIGHT, ">>"},
 	{LESS, "<"}, {GREATER, ">"}, {LESS_EQUAL, "<="}, {GREATER_EQUAL, ">="},
 	{EQUALITY, "=="}, {NO_EQUALITY, "!="},
 	{LOGICAL_NOT, "!"}, {LOGICAL_AND, "&&"}, {LOGICAL_OR, "||"},
 	{XOR, "^"}, {NOT, "~"}, {AND, "&"}, {OR, "|"},
 	{QUESTION, "?"}, {COLON, ":"}, {DOUBLE_COLON, "::"},

 	// assignment
 	{COLONASSIGN, ":="}, {ASSIGN, "="},
 	{MUL_ASSIGN, "*="}, {DIV_ASSIGN, "/="}, {MOD_ASSIGN, "%="},
 	{ADD_ASSIGN, "+="}, {SUB_ASSIGN, "-="},
 	{LSHIFT_ASSIGN, "<<="}, {RSHIFT_ASSIGN, ">>="},
 	{AND_ASSIGN, "&="}, {XOR_ASSIGN, "^="}, {OR_ASSIGN, "|="},

 	// punctuation
 	{DOT, "."}, {DOUBLE_DOT, ".."}, {ARROW, "->"}, {SEMICOLON, ";"},
 	{LPAREN, "("}, {RPAREN, ")"},		// parentheses, distinct from the curly brackets below
 	{LSQUARE, "["}, {RSQUARE, "]"},
 	{LBRACKET, "{"}, {RBRACKET, "}"},
 	{COMMA, ","}, {NOINIT, "---"}
 };

 // Comparator that orders table entries by spelling length, longest first.
 // Despite the name, entries of equal length are not ordered alphabetically.
 struct alpha_length_compare {
 	// const-qualified so the comparator can also be invoked through a const object.
 	bool operator()(const ct_pair& lhs, const ct_pair& rhs) const {
 		return lhs.second.length() > rhs.second.length();
 	}
 };

 // size macros
 // Inclusive value ranges of the fixed-width integer types. Negative
 // expansions are parenthesized so they stay a single operand when the macro
 // is used inside a larger expression.
 #define SIGNED8_MIN (-128)
 #define SIGNED8_MAX 127
 #define SIGNED16_MIN (-32768)
 #define SIGNED16_MAX 32767
 // The 32- and 64-bit minimums are written as (-MAX - 1): the magnitude of the
 // minimum does not fit in the signed type, so negating it as a literal yields
 // a wider or unsigned value on some compilers.
 #define SIGNED32_MIN (-2147483647 - 1)
 #define SIGNED32_MAX 2147483647
 #define SIGNED64_MIN (-9223372036854775807LL - 1)
 #define SIGNED64_MAX 9223372036854775807LL

 #define UNSIGNED_MIN 0
 #define UNSIGNED8_MAX 255
 #define UNSIGNED16_MAX 65535
 #define UNSIGNED32_MAX 4294967295
 // Needs the ULL suffix: the value fits no signed integer type.
 #define UNSIGNED64_MAX 18446744073709551615ULL

 #define FLOAT32_DIGIT_MIN 0
 #define FLOAT64_DIGIT_MIN 7

 // Token kinds listed as type specifiers. IDENTIFIER is part of the list,
 // presumably so user-defined type names qualify -- TODO confirm with the parser.
 const Compiler_Token Type_Specifiers[] = {
 	VOID, BOOL,
 	SIGNED8, SIGNED16, SIGNED32, SIGNED64,
 	UNSIGNED8, UNSIGNED16, UNSIGNED32, UNSIGNED64,
 	FLOAT32, FLOAT64,
 	STRING, IDENTIFIER
 };

 // Every character accepted by isidchar(): ASCII letters, digits and underscore.
 const string identifier_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890_";

 // One lexed token: its kind, its text, and the line/character indices that
 // were supplied when it was constructed.
 struct Token {
 	Compiler_Token type;	// kind of token
 	string value;			// text carried by the token
 	int line;				// line index given at construction
 	int character;			// character index given at construction

 	Token(Compiler_Token type, string value, int line, int character)
 		: type(type), value(value), line(line), character(character) {}

 	// Deliberately no user-declared destructor: declaring one (even an empty
 	// one) suppresses the implicit move constructor/assignment, which would
 	// force a string copy whenever a vector<Token> reallocates.
 };

 // Formats a token as "Token(<numeric kind>, <value>) @ <line>:<character>".
 // Takes the token by const reference so const tokens and temporaries can be
 // formatted too; the numeric overloads are called as std::to_string so they
 // cannot be confused with this overload.
 string to_string(const Token& token) {
 	return string("Token(") + std::to_string(static_cast<int>(token.type)) +
 		", " + token.value + ") @ " + std::to_string(token.line) + ":" + std::to_string(token.character);
 }

 bool isidchar(char id) {
 	for (int i = 0; i < identifier_characters.length(); i++) {
 		if (identifier_characters[i] == id)
 			return true;
 	}
 	return false;
 }

 // Walks a source string one character at a time and turns it into Tokens.
 struct Lexer {
 	string unit;				// the full text being lexed
 	int position = 0;			// index into `unit` of the current character
 	char current_char;			// character at `position`

 	int line_index = 0;			// reported in error messages and stamped on tokens
 	int char_index = 0;			// reported in error messages and stamped on tokens

 	// Starts at the first character of `unit`. As a side effect the global
 	// token_map is re-sorted so that longer spellings come first.
 	Lexer(string unit) : unit(unit), current_char(unit[0]) {
 		sort(token_map.begin(), token_map.end(), alpha_length_compare());
 	}
 	~Lexer() {}

 	// Diagnostics.
 	void error(string message);

 	// Token production.
 	vector<Token> tokenize();
 	Token get_next_token();
 	Token get_number_constant();
 	Token get_string_constant();
 	Token get_char_constant();
 	Token get_identifier();

 	// Cursor movement.
 	void advance(int index);
 	char peek(int index);
 	void skip_whitespace();
 	void skip_comment();
 	void skip_line();
 };

 // Writes "| <line_index>:<char_index>", a separator, "error: " and the message
 // to standard output. No trailing newline is written.
 void Lexer::error(string message) {
 	string report = "| ";
 	report += to_string(this->line_index);
 	report += ":";
 	report += to_string(this->char_index);
 	report += "	error: ";
 	report += message;

 	cout << report;
 }

 // Lexes from the current position and returns every token produced, up to
 // and including the first T_EOF token.
 vector<Token> Lexer::tokenize() {
 	vector<Token> collected;

 	for (;;) {
 		Token next = this->get_next_token();
 		collected.push_back(next);

 		if (next.type == T_EOF)
 			break;
 	}

 	return collected;
 }

 // Moves the cursor by `index` characters (one by default) and refreshes
 // current_char. Once the cursor is outside the unit, current_char is '\0'.
 void Lexer::advance(int index = 1) {
 	this->position += index;

 	// Compare as signed values: `length() - 1` wraps around for an empty unit.
 	// Return early so the out-of-range position is never used as an index.
 	if (this->position < 0 || this->position >= static_cast<int>(this->unit.length())) {
 		this->current_char = '\0';
 		return;
 	}

 	this->current_char = this->unit[this->position];
 }

 // Returns the character `index` positions away from the cursor (the next
 // character by default) without moving the cursor, or '\0' when that
 // position lies outside the unit.
 char Lexer::peek(int index = 1) {
 	const int peek_pos = this->position + index;

 	// Compare as signed values: `length() - 1` wraps around for an empty unit.
 	if (peek_pos < 0 || peek_pos >= static_cast<int>(this->unit.length())) {
 		return '\0';
 	}

 	return this->unit[peek_pos];
 }

 // Advances the cursor past a run of whitespace characters.
 void Lexer::skip_whitespace() {
 	// isspace() requires a value representable as unsigned char; passing a
 	// negative char (any byte >= 0x80 where char is signed) is undefined.
 	while (this->current_char != '\0' && isspace(static_cast<unsigned char>(this->current_char)))
 		this->advance();
 }

 // Advances the cursor until just past the next "*/" pair, or to the end of
 // input when there is none.
 void Lexer::skip_comment() {
 	// Stop only when BOTH characters of the terminator match; stopping on
 	// either a '*' or a following '/' would end the comment too early.
 	while (this->current_char != '\0' &&
 		!(this->current_char == '*' && this->peek() == '/')) {
 		this->advance();
 	}

 	// Step over the terminator only if one was found.
 	if (this->current_char != '\0')
 		this->advance(2);
 }

 // Advances the cursor to the start of the next line, or to the end of input
 // when the current line is the last one.
 void Lexer::skip_line() {
 	while (this->current_char != '\0' && this->current_char != '\n')
 		this->advance();

 	// Consume the newline itself, but never step past the end of input.
 	if (this->current_char == '\n')
 		this->advance();
 }

 #define l_token(a, b) Token(a, b, this->line_index, this->char_index)
 // Lexes a numeric constant starting at the current character.
 //   "0b" + binary digits  -> BINARY_CONSTANT (value keeps the "0b" prefix)
 //   "0x" + hex digits     -> HEX_CONSTANT    (value keeps the "0x" prefix)
 //   digits '.' digits     -> FLOAT_CONSTANT
 //   digits                -> INTEGER_CONSTANT
 Token Lexer::get_number_constant() {
 	string result = "";

 	if (this->current_char == '0') {
 		const char prefix = this->peek();

 		if (prefix == 'b' || prefix == 'x') {
 			// Consume the two-character prefix before scanning the digits.
 			result += this->current_char;
 			result += prefix;
 			this->advance(2);

 			if (prefix == 'b') {
 				while (this->current_char == '0' || this->current_char == '1') {
 					result += this->current_char;
 					this->advance();
 				}
 			}
 			else {
 				while (this->current_char != '\0' && isxdigit(static_cast<unsigned char>(this->current_char))) {
 					result += this->current_char;
 					this->advance();
 				}
 			}

 			if (result.length() == 2)
 				this->error("expected digits after '" + result + "'");

 			return l_token(prefix == 'b' ? BINARY_CONSTANT : HEX_CONSTANT, result);
 		}
 	}

 	while (this->current_char != '\0' && isdigit(static_cast<unsigned char>(this->current_char))) {
 		result += this->current_char;
 		this->advance();
 	}

 	// A '.' starts the fractional part unless it begins "..", which is the
 	// DOUBLE_DOT token and must be left for the next call.
 	if (this->current_char == '.' && this->peek() != '.') {
 		result += this->current_char;
 		this->advance();

 		while (this->current_char != '\0' && isdigit(static_cast<unsigned char>(this->current_char))) {
 			result += this->current_char;
 			this->advance();
 		}

 		return l_token(FLOAT_CONSTANT, result);
 	}

 	return l_token(INTEGER_CONSTANT, result);
 }

 // Lexes a string constant; the cursor must be on the opening '"'.
 // The token value includes both quote characters. When the input ends before
 // a closing quote, an error is reported and the partial text is returned.
 Token Lexer::get_string_constant() {
 	// string(1, c) builds a one-character string; `"" + c` would instead do
 	// pointer arithmetic on the literal and read out of bounds.
 	string result(1, this->current_char);
 	this->advance();

 	while (this->current_char != '\0' && this->current_char != '"') {
 		result += this->current_char;
 		this->advance();
 	}

 	if (this->current_char != '"') {
 		this->error("unterminated string constant");
 		return l_token(STRING_CONSTANT, result);
 	}

 	result += this->current_char;
 	this->advance();

 	return l_token(STRING_CONSTANT, result);
 }

 // Lexes a character constant; the cursor must be on the opening '\''.
 // The token value includes both quote characters. When the input ends before
 // a closing quote, an error is reported and the partial text is returned.
 Token Lexer::get_char_constant() {
 	// string(1, c) builds a one-character string; `"" + c` would instead do
 	// pointer arithmetic on the literal and read out of bounds.
 	string result(1, this->current_char);
 	this->advance();

 	while (this->current_char != '\0' && this->current_char != '\'') {
 		result += this->current_char;
 		this->advance();
 	}

 	if (this->current_char != '\'') {
 		this->error("unterminated character constant");
 		return l_token(CHAR_CONSTANT, result);
 	}

 	result += this->current_char;
 	this->advance();

 	return l_token(CHAR_CONSTANT, result);
 }

 // Collects a run of identifier characters. If the collected text equals a
 // spelling in token_map, that entry's token kind is returned; otherwise the
 // text is an IDENTIFIER.
 Token Lexer::get_identifier() {
 	string word;

 	for (; this->current_char != '\0' && isidchar(this->current_char); this->advance())
 		word += this->current_char;

 	for (const ct_pair& entry : token_map) {
 		if (entry.second == word)
 			return l_token(entry.first, word);
 	}

 	return l_token(IDENTIFIER, word);
 }

 // Produces the next token from the input, skipping whitespace and comments.
 // Returns a T_EOF token (with an empty value) once the input is exhausted.
 // NOTE(review): comment openers are assumed to be "//" and "/*" -- confirm
 // against the language definition.
 Token Lexer::get_next_token() {
 	while (this->current_char != '\0') {
 		const char c = this->current_char;

 		if (isspace(static_cast<unsigned char>(c))) {
 			this->skip_whitespace();
 			continue;
 		}

 		if (c == '/' && this->peek() == '/') {
 			this->skip_line();
 			continue;
 		}

 		if (c == '/' && this->peek() == '*') {
 			// Step over the opener first so "/*/" is not taken for a whole comment.
 			this->advance(2);
 			this->skip_comment();
 			continue;
 		}

 		// Digits are also identifier characters, so numbers are tried first.
 		if (isdigit(static_cast<unsigned char>(c)))
 			return this->get_number_constant();

 		if (c == '"')
 			return this->get_string_constant();

 		if (c == '\'')
 			return this->get_char_constant();

 		if (isidchar(c))
 			return this->get_identifier();

 		// Operators and punctuation: take the longest spelling in token_map
 		// that matches at the cursor, so that e.g. "<<=" wins over "<<" and "<".
 		const ct_pair* best = nullptr;
 		for (const ct_pair& entry : token_map) {
 			const string& spelling = entry.second;
 			if (spelling.empty() || (best != nullptr && spelling.length() <= best->second.length()))
 				continue;
 			if (this->unit.compare(this->position, spelling.length(), spelling) == 0)
 				best = &entry;
 		}

 		if (best != nullptr) {
 			Token token = l_token(best->first, best->second);
 			this->advance(static_cast<int>(best->second.length()));
 			return token;
 		}

 		// Unknown character: report it and skip it so lexing always progresses.
 		this->error(string("unexpected character '") + c + "'");
 		this->advance();
 	}

 	return l_token(T_EOF, "");
 }
 #undef l_token

	#include "stdafx.h"

	using namespace std;

	// Every token kind the lexer can emit. Enumerators take consecutive values
	// starting at zero, in the order listed here.
	enum Compiler_Token {
		// End of input; called T_EOF because some headers have a #define for 'EOF'.
		T_EOF,
		IDENTIFIER,

		// Literal constants.
		FLOAT_CONSTANT, INTEGER_CONSTANT, BINARY_CONSTANT, HEX_CONSTANT,
		STRING_CONSTANT, CHAR_CONSTANT,

		// Visibility keywords.
		PUBLIC, PRIVATE,

		// Label and jump keywords.
		CASE, DEFAULT, CONTINUE, BREAK, RETURN,

		// Built-in type keywords.
		ANY, VOID, BOOL,
		SIGNED8, SIGNED16, SIGNED32, SIGNED64,
		UNSIGNED8, UNSIGNED16, UNSIGNED32, UNSIGNED64,
		FLOAT32, FLOAT64, STRING,

		// Literal keywords; T_NULL is prefixed with T_ as `NULL` is #defined.
		T_NULL, TRUE, FALSE,

		// Declaration keywords.
		STRUCT, ENUM, ALIAS, FOREIGN,

		USING, NEW, DELETE, DEFER,

		// Control-flow keywords.
		SWITCH, IF, ELSE, FOR, WHILE, DO,

		// Arithmetic, shift, comparison, logical and bitwise operators.
		PLUS, MINUS, ASTERISK, SLASH, MODULO,
		INCREMENT, DECREMENT,
		SHIFT_LEFT, SHIFT_RIGHT,
		LESS, GREATER, LESS_EQUAL, GREATER_EQUAL, EQUALITY, NO_EQUALITY,
		LOGICAL_NOT, LOGICAL_AND, LOGICAL_OR,
		XOR, NOT, AND, OR,
		QUESTION, COLON, DOUBLE_COLON,

		// Assignment operators.
		COLONASSIGN, ASSIGN,
		MUL_ASSIGN, DIV_ASSIGN, MOD_ASSIGN, ADD_ASSIGN, SUB_ASSIGN,
		LSHIFT_ASSIGN, RSHIFT_ASSIGN, AND_ASSIGN, XOR_ASSIGN, OR_ASSIGN,

		// Punctuation.
		DOT, DOUBLE_DOT, ARROW, SEMICOLON,
		LPAREN, RPAREN, LSQUARE, RSQUARE, LBRACKET, RBRACKET,
		COMMA, NOINIT
	};

	// A token kind paired with its exact source spelling.
	typedef pair<Compiler_Token, string> ct_pair;

	// Spelling table for every keyword, operator and punctuator.
	// Each token kind appears exactly once and no two kinds share a spelling;
	// the Lexer constructor re-sorts this table in place, so it cannot be const.
	vector<ct_pair> token_map = {
		// visibility
		{PUBLIC, "public"}, {PRIVATE, "private"},

		// labels and jumps
		{CASE, "case"}, {DEFAULT, "default"}, {CONTINUE, "continue"},
		{BREAK, "break"}, {RETURN, "return"},

		// built-in types
		{ANY, "any"}, {VOID, "void"}, {BOOL, "bool"},
		{SIGNED8, "s8"}, {SIGNED16, "s16"}, {SIGNED32, "s32"}, {SIGNED64, "s64"},
		{UNSIGNED8, "u8"}, {UNSIGNED16, "u16"}, {UNSIGNED32, "u32"}, {UNSIGNED64, "u64"},
		{FLOAT32, "f32"}, {FLOAT64, "f64"}, {STRING, "string"},

		// literal keywords
		{T_NULL, "null"}, {TRUE, "true"}, {FALSE, "false"},

		// declarations
		{STRUCT, "struct"}, {ENUM, "enum"}, {ALIAS, "alias"}, {FOREIGN, "foreign"},

		{USING, "using"}, {NEW, "new"}, {DELETE, "delete"}, {DEFER, "defer"},

		// control flow
		{SWITCH, "switch"}, {IF, "if"}, {ELSE, "else"},
		{FOR, "for"}, {WHILE, "while"}, {DO, "do"},

		// operators (written without backslashes: "\|" is not a valid C++ escape)
		{PLUS, "+"}, {MINUS, "-"}, {ASTERISK, "*"}, {SLASH, "/"}, {MODULO, "%"},
		{INCREMENT, "++"}, {DECREMENT, "--"},
		{SHIFT_LEFT, "<<"}, {SHIFT_RIGHT, ">>"},
		{LESS, "<"}, {GREATER, ">"}, {LESS_EQUAL, "<="}, {GREATER_EQUAL, ">="},
		{EQUALITY, "=="}, {NO_EQUALITY, "!="},
		{LOGICAL_NOT, "!"}, {LOGICAL_AND, "&&"}, {LOGICAL_OR, "||"},
		{XOR, "^"}, {NOT, "~"}, {AND, "&"}, {OR, "|"},
		{QUESTION, "?"}, {COLON, ":"}, {DOUBLE_COLON, "::"},

		// assignment
		{COLONASSIGN, ":="}, {ASSIGN, "="},
		{MUL_ASSIGN, "*="}, {DIV_ASSIGN, "/="}, {MOD_ASSIGN, "%="},
		{ADD_ASSIGN, "+="}, {SUB_ASSIGN, "-="},
		{LSHIFT_ASSIGN, "<<="}, {RSHIFT_ASSIGN, ">>="},
		{AND_ASSIGN, "&="}, {XOR_ASSIGN, "^="}, {OR_ASSIGN, "|="},

		// punctuation
		{DOT, "."}, {DOUBLE_DOT, ".."}, {ARROW, "->"}, {SEMICOLON, ";"},
		{LPAREN, "("}, {RPAREN, ")"},		// parentheses, distinct from the curly brackets below
		{LSQUARE, "["}, {RSQUARE, "]"},
		{LBRACKET, "{"}, {RBRACKET, "}"},
		{COMMA, ","}, {NOINIT, "---"}
	};

	// Comparator that orders table entries by spelling length, longest first.
	// Despite the name, entries of equal length are not ordered alphabetically.
	struct alpha_length_compare {
		// const-qualified so the comparator can also be invoked through a const object.
		bool operator()(const ct_pair& lhs, const ct_pair& rhs) const {
			return lhs.second.length() > rhs.second.length();
		}
	};

	// size macros
	// Inclusive value ranges of the fixed-width integer types. Negative
	// expansions are parenthesized so they stay a single operand when the macro
	// is used inside a larger expression.
	#define SIGNED8_MIN (-128)
	#define SIGNED8_MAX 127
	#define SIGNED16_MIN (-32768)
	#define SIGNED16_MAX 32767
	// The 32- and 64-bit minimums are written as (-MAX - 1): the magnitude of the
	// minimum does not fit in the signed type, so negating it as a literal yields
	// a wider or unsigned value on some compilers.
	#define SIGNED32_MIN (-2147483647 - 1)
	#define SIGNED32_MAX 2147483647
	#define SIGNED64_MIN (-9223372036854775807LL - 1)
	#define SIGNED64_MAX 9223372036854775807LL

	#define UNSIGNED_MIN 0
	#define UNSIGNED8_MAX 255
	#define UNSIGNED16_MAX 65535
	#define UNSIGNED32_MAX 4294967295
	// Needs the ULL suffix: the value fits no signed integer type.
	#define UNSIGNED64_MAX 18446744073709551615ULL

	#define FLOAT32_DIGIT_MIN 0
	#define FLOAT64_DIGIT_MIN 7

	// Token kinds listed as type specifiers. IDENTIFIER is part of the list,
	// presumably so user-defined type names qualify -- TODO confirm with the parser.
	const Compiler_Token Type_Specifiers[] = {
		VOID, BOOL,
		SIGNED8, SIGNED16, SIGNED32, SIGNED64,
		UNSIGNED8, UNSIGNED16, UNSIGNED32, UNSIGNED64,
		FLOAT32, FLOAT64,
		STRING, IDENTIFIER
	};

	// Every character accepted by isidchar(): ASCII letters, digits and underscore.
	const string identifier_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890_";

	// One lexed token: its kind, its text, and the line/character indices that
	// were supplied when it was constructed.
	struct Token {
		Compiler_Token type;	// kind of token
		string value;			// text carried by the token
		int line;				// line index given at construction
		int character;			// character index given at construction

		Token(Compiler_Token type, string value, int line, int character)
			: type(type), value(value), line(line), character(character) {}

		// Deliberately no user-declared destructor: declaring one (even an empty
		// one) suppresses the implicit move constructor/assignment, which would
		// force a string copy whenever a vector<Token> reallocates.
	};

	// Formats a token as "Token(<numeric kind>, <value>) @ <line>:<character>".
	// Takes the token by const reference so const tokens and temporaries can be
	// formatted too; the numeric overloads are called as std::to_string so they
	// cannot be confused with this overload.
	string to_string(const Token& token) {
		return string("Token(") + std::to_string(static_cast<int>(token.type)) +
			", " + token.value + ") @ " + std::to_string(token.line) + ":" + std::to_string(token.character);
	}

	bool isidchar(char id) {
	for (int i = 0; i < identifier_characters.length(); i++) {
	if (identifier_characters[i] == id)
	return true;
	}
	return false;
	}

	// Walks a source string one character at a time and turns it into Tokens.
	struct Lexer {
		string unit;				// the full text being lexed
		int position = 0;			// index into `unit` of the current character
		char current_char;			// character at `position`

		int line_index = 0;			// reported in error messages and stamped on tokens
		int char_index = 0;			// reported in error messages and stamped on tokens

		// Starts at the first character of `unit`. As a side effect the global
		// token_map is re-sorted so that longer spellings come first.
		Lexer(string unit) : unit(unit), current_char(unit[0]) {
			sort(token_map.begin(), token_map.end(), alpha_length_compare());
		}
		~Lexer() {}

		// Diagnostics.
		void error(string message);

		// Token production.
		vector<Token> tokenize();
		Token get_next_token();
		Token get_number_constant();
		Token get_string_constant();
		Token get_char_constant();
		Token get_identifier();

		// Cursor movement.
		void advance(int index);
		char peek(int index);
		void skip_whitespace();
		void skip_comment();
		void skip_line();
	};

	// Writes "| <line_index>:<char_index> error: <message>" to standard output.
	// No trailing newline is written.
	void Lexer::error(string message) {
		// The prefix is a plain "| "; "\|" is not a valid C++ escape sequence.
		cout << "| " + to_string(this->line_index) + ":" + to_string(this->char_index) + " error: " + message;
	}

	// Lexes from the current position and returns every token produced, up to
	// and including the first T_EOF token.
	vector<Token> Lexer::tokenize() {
		vector<Token> collected;

		for (;;) {
			Token next = this->get_next_token();
			collected.push_back(next);

			if (next.type == T_EOF)
				break;
		}

		return collected;
	}

	// Moves the cursor by `index` characters (one by default) and refreshes
	// current_char. Once the cursor is outside the unit, current_char is '\0'.
	void Lexer::advance(int index = 1) {
		this->position += index;

		// Compare as signed values: `length() - 1` wraps around for an empty unit.
		// Return early so the out-of-range position is never used as an index.
		if (this->position < 0 || this->position >= static_cast<int>(this->unit.length())) {
			this->current_char = '\0';
			return;
		}

		this->current_char = this->unit[this->position];
	}

	// Returns the character `index` positions away from the cursor (the next
	// character by default) without moving the cursor, or '\0' when that
	// position lies outside the unit.
	char Lexer::peek(int index = 1) {
		const int peek_pos = this->position + index;

		// Compare as signed values: `length() - 1` wraps around for an empty unit.
		if (peek_pos < 0 || peek_pos >= static_cast<int>(this->unit.length())) {
			return '\0';
		}

		return this->unit[peek_pos];
	}

	// Advances the cursor past a run of whitespace characters.
	void Lexer::skip_whitespace() {
		// isspace() requires a value representable as unsigned char; passing a
		// negative char (any byte >= 0x80 where char is signed) is undefined.
		while (this->current_char != '\0' && isspace(static_cast<unsigned char>(this->current_char)))
			this->advance();
	}

	// Advances the cursor until just past the next "*/" pair, or to the end of
	// input when there is none.
	void Lexer::skip_comment() {
		// Stop only when BOTH characters of the terminator match; stopping on
		// either a '*' or a following '/' would end the comment too early.
		while (this->current_char != '\0' &&
			!(this->current_char == '*' && this->peek() == '/')) {
			this->advance();
		}

		// Step over the terminator only if one was found.
		if (this->current_char != '\0')
			this->advance(2);
	}

	// Advances the cursor to the start of the next line, or to the end of input
	// when the current line is the last one.
	void Lexer::skip_line() {
		while (this->current_char != '\0' && this->current_char != '\n')
			this->advance();

		// Consume the newline itself, but never step past the end of input.
		if (this->current_char == '\n')
			this->advance();
	}

	#define l_token(a, b) Token(a, b, this->line_index, this->char_index)
	// Lexes a numeric constant starting at the current character.
	//   "0b" + binary digits  -> BINARY_CONSTANT (value keeps the "0b" prefix)
	//   "0x" + hex digits     -> HEX_CONSTANT    (value keeps the "0x" prefix)
	//   digits '.' digits     -> FLOAT_CONSTANT
	//   digits                -> INTEGER_CONSTANT
	Token Lexer::get_number_constant() {
		string result = "";

		if (this->current_char == '0') {
			const char prefix = this->peek();

			if (prefix == 'b' || prefix == 'x') {
				// Consume the two-character prefix before scanning the digits.
				result += this->current_char;
				result += prefix;
				this->advance(2);

				if (prefix == 'b') {
					while (this->current_char == '0' || this->current_char == '1') {
						result += this->current_char;
						this->advance();
					}
				}
				else {
					while (this->current_char != '\0' && isxdigit(static_cast<unsigned char>(this->current_char))) {
						result += this->current_char;
						this->advance();
					}
				}

				if (result.length() == 2)
					this->error("expected digits after '" + result + "'");

				return l_token(prefix == 'b' ? BINARY_CONSTANT : HEX_CONSTANT, result);
			}
		}

		while (this->current_char != '\0' && isdigit(static_cast<unsigned char>(this->current_char))) {
			result += this->current_char;
			this->advance();
		}

		// A '.' starts the fractional part unless it begins "..", which is the
		// DOUBLE_DOT token and must be left for the next call.
		if (this->current_char == '.' && this->peek() != '.') {
			result += this->current_char;
			this->advance();

			while (this->current_char != '\0' && isdigit(static_cast<unsigned char>(this->current_char))) {
				result += this->current_char;
				this->advance();
			}

			return l_token(FLOAT_CONSTANT, result);
		}

		return l_token(INTEGER_CONSTANT, result);
	}

	// Lexes a string constant; the cursor must be on the opening '"'.
	// The token value includes both quote characters. When the input ends before
	// a closing quote, an error is reported and the partial text is returned.
	Token Lexer::get_string_constant() {
		// string(1, c) builds a one-character string; `"" + c` would instead do
		// pointer arithmetic on the literal and read out of bounds.
		string result(1, this->current_char);
		this->advance();

		while (this->current_char != '\0' && this->current_char != '"') {
			result += this->current_char;
			this->advance();
		}

		if (this->current_char != '"') {
			this->error("unterminated string constant");
			return l_token(STRING_CONSTANT, result);
		}

		result += this->current_char;
		this->advance();

		return l_token(STRING_CONSTANT, result);
	}

	// Lexes a character constant; the cursor must be on the opening '\''.
	// The token value includes both quote characters. When the input ends before
	// a closing quote, an error is reported and the partial text is returned.
	Token Lexer::get_char_constant() {
		// string(1, c) builds a one-character string; `"" + c` would instead do
		// pointer arithmetic on the literal and read out of bounds.
		string result(1, this->current_char);
		this->advance();

		while (this->current_char != '\0' && this->current_char != '\'') {
			result += this->current_char;
			this->advance();
		}

		if (this->current_char != '\'') {
			this->error("unterminated character constant");
			return l_token(CHAR_CONSTANT, result);
		}

		result += this->current_char;
		this->advance();

		return l_token(CHAR_CONSTANT, result);
	}

	// Collects a run of identifier characters. If the collected text equals a
	// spelling in token_map, that entry's token kind is returned; otherwise the
	// text is an IDENTIFIER.
	Token Lexer::get_identifier() {
		string word;

		for (; this->current_char != '\0' && isidchar(this->current_char); this->advance())
			word += this->current_char;

		for (const ct_pair& entry : token_map) {
			if (entry.second == word)
				return l_token(entry.first, word);
		}

		return l_token(IDENTIFIER, word);
	}

	// Produces the next token from the input, skipping whitespace and comments.
	// Returns a T_EOF token (with an empty value) once the input is exhausted.
	// NOTE(review): comment openers are assumed to be "//" and "/*" -- confirm
	// against the language definition.
	Token Lexer::get_next_token() {
		while (this->current_char != '\0') {
			const char c = this->current_char;

			if (isspace(static_cast<unsigned char>(c))) {
				this->skip_whitespace();
				continue;
			}

			if (c == '/' && this->peek() == '/') {
				this->skip_line();
				continue;
			}

			if (c == '/' && this->peek() == '*') {
				// Step over the opener first so "/*/" is not taken for a whole comment.
				this->advance(2);
				this->skip_comment();
				continue;
			}

			// Digits are also identifier characters, so numbers are tried first.
			if (isdigit(static_cast<unsigned char>(c)))
				return this->get_number_constant();

			if (c == '"')
				return this->get_string_constant();

			if (c == '\'')
				return this->get_char_constant();

			if (isidchar(c))
				return this->get_identifier();

			// Operators and punctuation: take the longest spelling in token_map
			// that matches at the cursor, so that e.g. "<<=" wins over "<<" and "<".
			const ct_pair* best = nullptr;
			for (const ct_pair& entry : token_map) {
				const string& spelling = entry.second;
				if (spelling.empty() || (best != nullptr && spelling.length() <= best->second.length()))
					continue;
				if (this->unit.compare(this->position, spelling.length(), spelling) == 0)
					best = &entry;
			}

			if (best != nullptr) {
				Token token = l_token(best->first, best->second);
				this->advance(static_cast<int>(best->second.length()));
				return token;
			}

			// Unknown character: report it and skip it so lexing always progresses.
			this->error(string("unexpected character '") + c + "'");
			this->advance();
		}

		return l_token(T_EOF, "");
	}
	#undef l_token