Created
December 26, 2024 20:57
-
-
Save darilrt/7d70586516169df617bf150b53877a71 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "lexer/lexer.hpp"

#include <cctype>
// Reserved words of the language. lex_identifier() compares every scanned
// word against this table and emits a Keyword token on a match.
const std::string keywords[] = {
    "if",
    "else",
    "while",
    "for",
    "ret",
    "break",
    "continue",
    "true",
    "false",
    "null",
    "def",
    "interface",
    "extern",
};
| char Lexer::peek() const | |
| { | |
| return source[index]; | |
| } | |
| void Lexer::advance() | |
| { | |
| if (index < source.size()) | |
| { | |
| if (source[index] == '\n') | |
| { | |
| location.line++; | |
| location.column = 1; | |
| } | |
| else | |
| { | |
| location.column++; | |
| } | |
| index++; | |
| } | |
| } | |
| void Lexer::skip_whitespace() | |
| { | |
| while (index < source.size() && std::isspace(peek())) | |
| { | |
| advance(); | |
| } | |
| } | |
| Token Lexer::next_token() | |
| { | |
| skip_whitespace(); | |
| if (index >= source.size()) | |
| { | |
| return Token{Token::Type::EndOfFile, "", location}; | |
| } | |
| char p = peek(); | |
| if (std::isalpha(p) || p == '_') | |
| { | |
| return lex_identifier(); | |
| } | |
| else if (std::isdigit(p)) | |
| { | |
| return lex_number(); | |
| } | |
| else if (p == '"') | |
| { | |
| return lex_string(); | |
| } | |
| else if (p == '\'') | |
| { | |
| return lex_char(); | |
| } | |
| advance(); | |
| if (p == '(' || p == ')' || p == '{' || p == '}' || p == '[' || p == ']' || p == ',' || p == ';' || p == ':') | |
| { | |
| return Token{Token::Type::Punctuation, std::string(1, p), location}; | |
| } | |
| switch (p) | |
| { | |
| case '(': | |
| case ')': | |
| case '{': | |
| case '}': | |
| case '[': | |
| case ']': | |
| case ',': | |
| case '.': | |
| case ';': | |
| case ':': | |
| return Token{Token::Type::Punctuation, std::string(1, p), location}; | |
| case '+': | |
| case '-': | |
| if (peek() == '>') | |
| { | |
| advance(); | |
| return Token{Token::Type::Operator, "->", location}; | |
| } | |
| case '*': | |
| case '/': | |
| case '%': | |
| case '=': | |
| case '!': | |
| case '<': | |
| case '>': | |
| case '&': | |
| case '|': | |
| if (peek() == '=') | |
| { | |
| advance(); | |
| return Token{Token::Type::Operator, std::string(1, p) + '=', location}; | |
| } | |
| return Token{Token::Type::Operator, std::string(1, p), location}; | |
| } | |
| advance(); | |
| return Token{Token::Type::Unknown, std::string(1, p), location}; | |
| } | |
| Token Lexer::lex_identifier() | |
| { | |
| std::string lexeme; | |
| auto p = peek(); | |
| while (std::isalnum(p) || p == '_') | |
| { | |
| lexeme += p; | |
| advance(); | |
| p = peek(); | |
| } | |
| for (const auto &keyword : keywords) | |
| { | |
| if (lexeme == keyword) | |
| { | |
| return Token{Token::Type::Keyword, lexeme, location}; | |
| } | |
| } | |
| return Token{Token::Type::Identifier, lexeme, location}; | |
| } | |
| Token Lexer::lex_number() | |
| { | |
| std::string lexeme; | |
| auto p = peek(); | |
| while (std::isdigit(p)) | |
| { | |
| lexeme += p; | |
| advance(); | |
| p = peek(); | |
| } | |
| if (p != '.') | |
| { | |
| return Token{Token::Type::Integer, lexeme, location}; | |
| } | |
| lexeme += p; | |
| advance(); | |
| p = peek(); | |
| while (std::isdigit(p)) | |
| { | |
| lexeme += p; | |
| advance(); | |
| p = peek(); | |
| } | |
| return Token{Token::Type::Float, lexeme, location}; | |
| } | |
| Token Lexer::lex_string() | |
| { | |
| std::string lexeme; | |
| advance(); | |
| auto p = peek(); | |
| while (p != '"') | |
| { | |
| lexeme += p; | |
| advance(); | |
| p = peek(); | |
| } | |
| advance(); | |
| return Token{Token::Type::String, lexeme, location}; | |
| } | |
| Token Lexer::lex_char() | |
| { | |
| std::string lexeme; | |
| advance(); | |
| auto p = peek(); | |
| lexeme += p; | |
| advance(); | |
| if (peek() != '\'') | |
| { | |
| return Token{Token::Type::Unknown, lexeme, location}; | |
| } | |
| advance(); | |
| return Token{Token::Type::Char, lexeme, location}; | |
| } | |
| std::vector<Token> lex(const std::string &source, const std::string &file_name) | |
| { | |
| Lexer lexer(source, file_name); | |
| std::vector<Token> tokens; | |
| while (true) | |
| { | |
| auto token = lexer.next_token(); | |
| if (token.type == Token::Type::EndOfFile) | |
| { | |
| break; | |
| } | |
| tokens.push_back(token); | |
| } | |
| return tokens; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #pragma once | |
| #include <string> | |
| #include <vector> | |
| #include "lexer/token.hpp" | |
| class Lexer | |
| { | |
| public: | |
| std::string file_name; | |
| std::string source; | |
| std::vector<Token> tokens; | |
| Location location; | |
| int index; | |
| Lexer(const std::string &source, const std::string &file_name = "<stdio>") | |
| : file_name(file_name), source(source), index(0), location({1, 1, file_name}) {} | |
| char peek() const; | |
| void advance(); | |
| void skip_whitespace(); | |
| Token next_token(); | |
| Token lex_identifier(); | |
| Token lex_number(); | |
| Token lex_string(); | |
| Token lex_char(); | |
| }; | |
| std::vector<Token> lex(const std::string &source, const std::string &file_name = "<stdio>"); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #pragma once | |
| #include <string> | |
| #include <iostream> | |
/// Position inside a source file: line, column and the file's name.
class Location
{
public:
    int line;         ///< Line number.
    int column;       ///< Column number.
    std::string file; ///< Name of the file this position belongs to.

    /// Returns an independent copy of this location.
    Location copy() const
    {
        return *this;
    }

    /// Writes the location as `Location {line=L, column=C, file='F'}`.
    friend std::ostream &operator<<(std::ostream &os, const Location &location)
    {
        os << "Location {";
        os << "line=" << location.line << ", ";
        os << "column=" << location.column << ", ";
        os << "file='" << location.file << "'";
        os << "}";
        return os;
    }
};
| class Token | |
| { | |
| public: | |
| enum class Type | |
| { | |
| Identifier, // Variable name | |
| Integer, // Integer number | |
| Float, // Floating point number | |
| String, // String "Hello, World!" | |
| Char, // Character 'c' | |
| Operator, // Operator like +, -, *, /, etc. | |
| Punctuation, // Punctuation like ;, (, ), etc. | |
| Keyword, // Keyword like if, else, while, etc. | |
| EndOfFile, // End of file | |
| Unknown // Unknown token | |
| } type; | |
| std::string lexeme; | |
| Location location; | |
| friend std::ostream &operator<<(std::ostream &os, const Token &token) | |
| { | |
| std::string type; | |
| switch (token.type) | |
| { | |
| case Type::Identifier: | |
| type = "Identifier"; | |
| break; | |
| case Type::Integer: | |
| type = "Integer"; | |
| break; | |
| case Type::Float: | |
| type = "Float"; | |
| break; | |
| case Type::String: | |
| type = "String"; | |
| break; | |
| case Type::Char: | |
| type = "Char"; | |
| break; | |
| case Type::Operator: | |
| type = "Operator"; | |
| break; | |
| case Type::Punctuation: | |
| type = "Punctuation"; | |
| break; | |
| case Type::Keyword: | |
| type = "Keyword"; | |
| break; | |
| case Type::EndOfFile: | |
| type = "EndOfFile"; | |
| break; | |
| case Type::Unknown: | |
| type = "Unknown"; | |
| break; | |
| } | |
| os << "Token {" | |
| << "type=" << type << ", " | |
| << "lexeme='" << token.lexeme << "', " | |
| << "location=" << token.location << "}"; | |
| return os; | |
| } | |
| }; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment