darilrt · December 26, 2024 20:57
diff --git a/lexer.cpp b/lexer.cpp

 #include "lexer/lexer.hpp"

 #include <cctype>

 // Reserved words: lex_identifier() emits a Keyword token (instead of an
 // Identifier) when a lexeme matches one of these spellings exactly.
 const std::string keywords[] = {"if", "else", "while", "for", "ret", "break", "continue", "true", "false", "null", "def", "interface", "extern"};

 // Returns the character at the current read position without consuming it.
 // Yields '\0' when the position lies outside `source`, so callers can treat
 // it as an end-of-input sentinel instead of reading out of bounds.
 char Lexer::peek() const
 {
    // `index` is a public signed int; reject negative and past-the-end values.
    if (index < 0 || static_cast<std::string::size_type>(index) >= source.size())
    {
        return '\0';
    }

    return source[static_cast<std::string::size_type>(index)];
 }

 // Consumes one character and keeps `location` in step with it: a newline
 // moves to column 1 of the next line, any other character moves one column
 // to the right. Does nothing once the whole source has been consumed.
 void Lexer::advance()
 {
    // Nothing left to consume.
    if (index >= source.size())
    {
        return;
    }

    if (source[index] != '\n')
    {
        location.column++;
    }
    else
    {
        location.line++;
        location.column = 1;
    }

    ++index;
 }

 // Consumes characters for as long as std::isspace accepts them, stopping at
 // the first non-whitespace character or at the end of the input.
 void Lexer::skip_whitespace()
 {
    // std::isspace is undefined for negative char values (e.g. non-ASCII
    // bytes where char is signed), so classify the unsigned representation.
    while (static_cast<std::string::size_type>(index) < source.size() &&
           std::isspace(static_cast<unsigned char>(peek())))
    {
        advance();
    }
 }

 // Produces the next token from the input.
 //
 // Skips leading whitespace, then returns EndOfFile once the input is
 // exhausted. Identifiers/keywords, numbers, strings and character literals
 // are delegated to the lex_* helpers; punctuation and operators are handled
 // here. Any other character becomes a one-character Unknown token.
 //
 // NOTE(review): tokens built here read `location` after their characters
 // were consumed, so it marks the position just past the lexeme - confirm
 // whether callers expect the start position instead.
 Token Lexer::next_token()
 {
    skip_whitespace();

    if (static_cast<std::string::size_type>(index) >= source.size())
    {
        return Token{Token::Type::EndOfFile, "", location};
    }

    const char p = peek();

    // The <cctype> classifiers are undefined for negative char values, so
    // classify the unsigned representation.
    const unsigned char classified = static_cast<unsigned char>(p);

    if (std::isalpha(classified) || p == '_')
    {
        return lex_identifier();
    }
    if (std::isdigit(classified))
    {
        return lex_number();
    }
    if (p == '"')
    {
        return lex_string();
    }
    if (p == '\'')
    {
        return lex_char();
    }

    // From here on `p` is consumed; peek() now looks at the character after it.
    advance();

    switch (p)
    {
    case '(':
    case ')':
    case '{':
    case '}':
    case '[':
    case ']':
    case ',':
    case '.':
    case ';':
    case ':':
        return Token{Token::Type::Punctuation, std::string(1, p), location};

    case '-':
        // Only '-' can start the arrow operator; "+>" must not lex as "->".
        if (peek() == '>')
        {
            advance();
            return Token{Token::Type::Operator, "->", location};
        }
        // Not an arrow: '-' and "-=" are handled like the other operators.
        // fallthrough

    case '+':
    case '*':
    case '/':
    case '%':
    case '=':
    case '!':
    case '<':
    case '>':
    case '&':
    case '|':
        // An operator directly followed by '=' forms a two-character operator.
        if (peek() == '=')
        {
            advance();
            return Token{Token::Type::Operator, std::string(1, p) + '=', location};
        }

        return Token{Token::Type::Operator, std::string(1, p), location};

    default:
        break;
    }

    // `p` was already consumed above; advancing again here would silently
    // drop the character that follows the unknown one.
    return Token{Token::Type::Unknown, std::string(1, p), location};
 }

 // Lexes an identifier or keyword starting at the current position.
 // Consumes the longest run of letters, digits and underscores; the result is
 // a Keyword token when the spelling appears in `keywords`, otherwise an
 // Identifier token.
 Token Lexer::lex_identifier()
 {
    std::string lexeme;

    // std::isalnum is undefined for negative char values, hence the cast.
    for (char p = peek(); std::isalnum(static_cast<unsigned char>(p)) || p == '_'; p = peek())
    {
        lexeme += p;
        advance();
    }

    for (const auto &keyword : keywords)
    {
        if (lexeme == keyword)
        {
            return Token{Token::Type::Keyword, lexeme, location};
        }
    }

    return Token{Token::Type::Identifier, lexeme, location};
 }

 // Lexes a numeric literal starting at the current position.
 // A run of digits yields an Integer token; when the digits are followed by
 // '.', the dot and any digits after it are consumed too and the result is a
 // Float token.
 //
 // NOTE(review): the '.' is consumed even when no digit follows it, so "1."
 // lexes as a Float - confirm that this is intended.
 Token Lexer::lex_number()
 {
    std::string lexeme;

    // Appends the run of decimal digits at the current position to `lexeme`.
    // std::isdigit is undefined for negative char values, hence the cast.
    const auto take_digits = [&]() {
        for (char c = peek(); std::isdigit(static_cast<unsigned char>(c)); c = peek())
        {
            lexeme += c;
            advance();
        }
    };

    take_digits();

    if (peek() != '.')
    {
        return Token{Token::Type::Integer, lexeme, location};
    }

    lexeme += '.';
    advance();

    take_digits();

    return Token{Token::Type::Float, lexeme, location};
 }

 // Lexes a double-quoted string literal starting at the opening quote.
 // The lexeme holds the characters between the quotes (the quotes themselves
 // are dropped; no escape sequences are interpreted). If the input ends
 // before a closing quote is found, an Unknown token carrying the text read
 // so far is returned.
 Token Lexer::lex_string()
 {
    std::string lexeme;

    advance(); // opening quote

    // Stop at end of input as well as at the closing quote: advance() is a
    // no-op once the source is exhausted, so without this bound an
    // unterminated literal would loop forever while growing `lexeme`.
    while (static_cast<std::string::size_type>(index) < source.size() && peek() != '"')
    {
        lexeme += peek();
        advance();
    }

    if (static_cast<std::string::size_type>(index) >= source.size())
    {
        // Unterminated literal: reported as Unknown, like a malformed
        // character literal in lex_char().
        return Token{Token::Type::Unknown, lexeme, location};
    }

    advance(); // closing quote

    return Token{Token::Type::String, lexeme, location};
 }

 // Lexes a single-quoted character literal starting at the opening quote.
 // Exactly one character is read as the lexeme (no escape sequences are
 // interpreted). Returns a Char token when that character is followed by a
 // closing quote, and an Unknown token otherwise.
 Token Lexer::lex_char()
 {
    std::string lexeme;

    advance(); // opening quote

    // The input ended right after the opening quote: there is no character
    // to read, so do not append the '\0' end-of-input sentinel to the lexeme.
    if (static_cast<std::string::size_type>(index) >= source.size())
    {
        return Token{Token::Type::Unknown, lexeme, location};
    }

    lexeme += peek();
    advance();

    if (peek() != '\'')
    {
        return Token{Token::Type::Unknown, lexeme, location};
    }

    advance(); // closing quote

    return Token{Token::Type::Char, lexeme, location};
 }

 std::vector<Token> lex(const std::string &source, const std::string &file_name)
 {
    Lexer lexer(source, file_name);
    std::vector<Token> tokens;

    while (true)
    {
        auto token = lexer.next_token();

        if (token.type == Token::Type::EndOfFile)
        {
            break;
        }

        tokens.push_back(token);
    }

    return tokens;
 }
diff --git a/lexer.hpp b/lexer.hpp
 #pragma once

 #include <string>
 #include <vector>

 #include "lexer/token.hpp"

 // Hand-written lexer that turns an in-memory source string into Tokens.
 class Lexer
 {
 public:
    std::string file_name;     // Name passed at construction; also stored in `location.file`.
    std::string source;        // Full text being tokenized.
    std::vector<Token> tokens; // Not populated by the lexer code visible in this file.
    Location location;         // Line/column/file of the current read position.
    int index;                 // Offset into `source` of the next unread character.

    // Starts reading at offset 0, line 1, column 1.
    // The initializers are listed in member declaration order, which is the
    // order in which members are actually initialized.
    Lexer(const std::string &source, const std::string &file_name = "<stdio>")
        : file_name(file_name), source(source), location({1, 1, file_name}), index(0) {}

    // Returns the character at `index` without consuming it.
    char peek() const;

    // Consumes one character and updates `location`; no-op at end of input.
    void advance();

    // Consumes characters while std::isspace accepts them.
    void skip_whitespace();

    // Skips whitespace and returns the next token (EndOfFile when exhausted).
    Token next_token();

    // Lexes a run of letters, digits and underscores as Identifier or Keyword.
    Token lex_identifier();

    // Lexes digits as Integer, or as Float when a '.' follows them.
    Token lex_number();

    // Lexes a double-quoted string; the lexeme excludes the quotes.
    Token lex_string();

    // Lexes a single-quoted character; Unknown when the closing quote is missing.
    Token lex_char();
 };

 // Tokenizes `source` with a Lexer and returns the tokens that precede
 // EndOfFile. `file_name` is handed to the Lexer, which stores it in the
 // location attached to each token.
 std::vector<Token> lex(const std::string &source, const std::string &file_name = "<stdio>");
diff --git a/token.hpp b/token.hpp
 #pragma once

 #include <string>
 #include <iostream>

 // Position inside a source file: line, column and file name.
 // Kept as a plain aggregate so it can be brace-initialized.
 class Location
 {
 public:
    int line;
    int column;
    std::string file;

    // Returns a by-value duplicate of this location.
    Location copy() const
    {
        Location duplicate{line, column, file};
        return duplicate;
    }

    // Writes the location as: Location {line=L, column=C, file='F'}
    friend std::ostream &operator<<(std::ostream &os, const Location &location)
    {
        os << "Location {";
        os << "line=" << location.line << ", ";
        os << "column=" << location.column << ", ";
        os << "file='" << location.file << "'";
        os << "}";
        return os;
    }
 };

 // A lexical token: its category, the text it was built from and the
 // location recorded for it. Kept as a plain aggregate so it can be
 // brace-initialized as Token{type, lexeme, location}.
 class Token
 {
 public:
    enum class Type
    {
        Identifier,  // Variable name
        Integer,     // Integer number
        Float,       // Floating point number
        String,      // String "Hello, World!"
        Char,        // Character 'c'
        Operator,    // Operator like +, -, *, /, etc.
        Punctuation, // Punctuation like ;, (, ), etc.
        Keyword,     // Keyword like if, else, while, etc.
        EndOfFile,   // End of file
        Unknown      // Unknown token
    } type;
    std::string lexeme;
    Location location;

    // Writes the token as: Token {type=T, lexeme='X', location=...}
    friend std::ostream &operator<<(std::ostream &os, const Token &token)
    {
        // Stays empty for a value outside the enumerators listed above.
        const char *label = "";

        switch (token.type)
        {
        case Type::Identifier:  label = "Identifier";  break;
        case Type::Integer:     label = "Integer";     break;
        case Type::Float:       label = "Float";       break;
        case Type::String:      label = "String";      break;
        case Type::Char:        label = "Char";        break;
        case Type::Operator:    label = "Operator";    break;
        case Type::Punctuation: label = "Punctuation"; break;
        case Type::Keyword:     label = "Keyword";     break;
        case Type::EndOfFile:   label = "EndOfFile";   break;
        case Type::Unknown:     label = "Unknown";     break;
        }

        os << "Token {";
        os << "type=" << label << ", ";
        os << "lexeme='" << token.lexeme << "', ";
        os << "location=" << token.location << "}";
        return os;
    }
 };

	#include "lexer/lexer.hpp"

	// Reserved words: lex_identifier() emits a Keyword token (instead of an
	// Identifier) when a lexeme matches one of these spellings exactly.
	const std::string keywords[] = {"if", "else", "while", "for", "ret", "break", "continue", "true", "false", "null", "def", "interface", "extern"};

	// Returns the character at the current read position without consuming it.
	// Yields '\0' when the position lies outside `source`, so callers can treat
	// it as an end-of-input sentinel instead of reading out of bounds.
	char Lexer::peek() const
	{
	    // `index` is a public signed int; reject negative and past-the-end values.
	    if (index < 0 || static_cast<std::string::size_type>(index) >= source.size())
	    {
	        return '\0';
	    }

	    return source[static_cast<std::string::size_type>(index)];
	}

	// Consumes one character and keeps `location` in step with it: a newline
	// moves to column 1 of the next line, any other character moves one column
	// to the right. Does nothing once the whole source has been consumed.
	void Lexer::advance()
	{
	    // Nothing left to consume.
	    if (index >= source.size())
	    {
	        return;
	    }

	    if (source[index] != '\n')
	    {
	        location.column++;
	    }
	    else
	    {
	        location.line++;
	        location.column = 1;
	    }

	    ++index;
	}

	// Consumes characters for as long as std::isspace accepts them, stopping at
	// the first non-whitespace character or at the end of the input.
	void Lexer::skip_whitespace()
	{
	    // std::isspace is undefined for negative char values (e.g. non-ASCII
	    // bytes where char is signed), so classify the unsigned representation.
	    while (static_cast<std::string::size_type>(index) < source.size() &&
	           std::isspace(static_cast<unsigned char>(peek())))
	    {
	        advance();
	    }
	}

	// Produces the next token from the input.
	//
	// Skips leading whitespace, then returns EndOfFile once the input is
	// exhausted. Identifiers/keywords, numbers, strings and character literals
	// are delegated to the lex_* helpers; punctuation and operators are handled
	// here. Any other character becomes a one-character Unknown token.
	//
	// NOTE(review): tokens built here read `location` after their characters
	// were consumed, so it marks the position just past the lexeme - confirm
	// whether callers expect the start position instead.
	Token Lexer::next_token()
	{
	    skip_whitespace();

	    if (static_cast<std::string::size_type>(index) >= source.size())
	    {
	        return Token{Token::Type::EndOfFile, "", location};
	    }

	    const char p = peek();

	    // The <cctype> classifiers are undefined for negative char values, so
	    // classify the unsigned representation.
	    const unsigned char classified = static_cast<unsigned char>(p);

	    if (std::isalpha(classified) || p == '_')
	    {
	        return lex_identifier();
	    }
	    if (std::isdigit(classified))
	    {
	        return lex_number();
	    }
	    if (p == '"')
	    {
	        return lex_string();
	    }
	    if (p == '\'')
	    {
	        return lex_char();
	    }

	    // From here on `p` is consumed; peek() now looks at the character after it.
	    advance();

	    switch (p)
	    {
	    case '(':
	    case ')':
	    case '{':
	    case '}':
	    case '[':
	    case ']':
	    case ',':
	    case '.':
	    case ';':
	    case ':':
	        return Token{Token::Type::Punctuation, std::string(1, p), location};

	    case '-':
	        // Only '-' can start the arrow operator; "+>" must not lex as "->".
	        if (peek() == '>')
	        {
	            advance();
	            return Token{Token::Type::Operator, "->", location};
	        }
	        // Not an arrow: '-' and "-=" are handled like the other operators.
	        // fallthrough

	    case '+':
	    case '*':
	    case '/':
	    case '%':
	    case '=':
	    case '!':
	    case '<':
	    case '>':
	    case '&':
	    case '|':
	        // An operator directly followed by '=' forms a two-character operator.
	        if (peek() == '=')
	        {
	            advance();
	            return Token{Token::Type::Operator, std::string(1, p) + '=', location};
	        }

	        return Token{Token::Type::Operator, std::string(1, p), location};

	    default:
	        break;
	    }

	    // `p` was already consumed above; advancing again here would silently
	    // drop the character that follows the unknown one.
	    return Token{Token::Type::Unknown, std::string(1, p), location};
	}

	// Lexes an identifier or keyword starting at the current position.
	// Consumes the longest run of letters, digits and underscores; the result is
	// a Keyword token when the spelling appears in `keywords`, otherwise an
	// Identifier token.
	Token Lexer::lex_identifier()
	{
	    std::string lexeme;

	    // std::isalnum is undefined for negative char values, hence the cast.
	    for (char p = peek(); std::isalnum(static_cast<unsigned char>(p)) || p == '_'; p = peek())
	    {
	        lexeme += p;
	        advance();
	    }

	    for (const auto &keyword : keywords)
	    {
	        if (lexeme == keyword)
	        {
	            return Token{Token::Type::Keyword, lexeme, location};
	        }
	    }

	    return Token{Token::Type::Identifier, lexeme, location};
	}

	// Lexes a numeric literal starting at the current position.
	// A run of digits yields an Integer token; when the digits are followed by
	// '.', the dot and any digits after it are consumed too and the result is a
	// Float token.
	//
	// NOTE(review): the '.' is consumed even when no digit follows it, so "1."
	// lexes as a Float - confirm that this is intended.
	Token Lexer::lex_number()
	{
	    std::string lexeme;

	    // Appends the run of decimal digits at the current position to `lexeme`.
	    // std::isdigit is undefined for negative char values, hence the cast.
	    const auto take_digits = [&]() {
	        for (char c = peek(); std::isdigit(static_cast<unsigned char>(c)); c = peek())
	        {
	            lexeme += c;
	            advance();
	        }
	    };

	    take_digits();

	    if (peek() != '.')
	    {
	        return Token{Token::Type::Integer, lexeme, location};
	    }

	    lexeme += '.';
	    advance();

	    take_digits();

	    return Token{Token::Type::Float, lexeme, location};
	}

	// Lexes a double-quoted string literal starting at the opening quote.
	// The lexeme holds the characters between the quotes (the quotes themselves
	// are dropped; no escape sequences are interpreted). If the input ends
	// before a closing quote is found, an Unknown token carrying the text read
	// so far is returned.
	Token Lexer::lex_string()
	{
	    std::string lexeme;

	    advance(); // opening quote

	    // Stop at end of input as well as at the closing quote: advance() is a
	    // no-op once the source is exhausted, so without this bound an
	    // unterminated literal would loop forever while growing `lexeme`.
	    while (static_cast<std::string::size_type>(index) < source.size() && peek() != '"')
	    {
	        lexeme += peek();
	        advance();
	    }

	    if (static_cast<std::string::size_type>(index) >= source.size())
	    {
	        // Unterminated literal: reported as Unknown, like a malformed
	        // character literal in lex_char().
	        return Token{Token::Type::Unknown, lexeme, location};
	    }

	    advance(); // closing quote

	    return Token{Token::Type::String, lexeme, location};
	}

	// Lexes a single-quoted character literal starting at the opening quote.
	// Exactly one character is read as the lexeme (no escape sequences are
	// interpreted). Returns a Char token when that character is followed by a
	// closing quote, and an Unknown token otherwise.
	Token Lexer::lex_char()
	{
	    std::string lexeme;

	    advance(); // opening quote

	    // The input ended right after the opening quote: there is no character
	    // to read, so do not append the '\0' end-of-input sentinel to the lexeme.
	    if (static_cast<std::string::size_type>(index) >= source.size())
	    {
	        return Token{Token::Type::Unknown, lexeme, location};
	    }

	    lexeme += peek();
	    advance();

	    if (peek() != '\'')
	    {
	        return Token{Token::Type::Unknown, lexeme, location};
	    }

	    advance(); // closing quote

	    return Token{Token::Type::Char, lexeme, location};
	}

	std::vector<Token> lex(const std::string &source, const std::string &file_name)
	{
	Lexer lexer(source, file_name);
	std::vector<Token> tokens;

	while (true)
	{
	auto token = lexer.next_token();

	if (token.type == Token::Type::EndOfFile)
	{
	break;
	}

	tokens.push_back(token);
	}

	return tokens;
	}
	#pragma once

	#include <string>
	#include <vector>

	#include "lexer/token.hpp"

	// Hand-written lexer that turns an in-memory source string into Tokens.
	class Lexer
	{
	public:
	    std::string file_name;     // Name passed at construction; also stored in `location.file`.
	    std::string source;        // Full text being tokenized.
	    std::vector<Token> tokens; // Not populated by the lexer code visible in this file.
	    Location location;         // Line/column/file of the current read position.
	    int index;                 // Offset into `source` of the next unread character.

	    // Starts reading at offset 0, line 1, column 1.
	    // The initializers are listed in member declaration order, which is the
	    // order in which members are actually initialized.
	    Lexer(const std::string &source, const std::string &file_name = "<stdio>")
	        : file_name(file_name), source(source), location({1, 1, file_name}), index(0) {}

	    // Returns the character at `index` without consuming it.
	    char peek() const;

	    // Consumes one character and updates `location`; no-op at end of input.
	    void advance();

	    // Consumes characters while std::isspace accepts them.
	    void skip_whitespace();

	    // Skips whitespace and returns the next token (EndOfFile when exhausted).
	    Token next_token();

	    // Lexes a run of letters, digits and underscores as Identifier or Keyword.
	    Token lex_identifier();

	    // Lexes digits as Integer, or as Float when a '.' follows them.
	    Token lex_number();

	    // Lexes a double-quoted string; the lexeme excludes the quotes.
	    Token lex_string();

	    // Lexes a single-quoted character; Unknown when the closing quote is missing.
	    Token lex_char();
	};

	// Tokenizes `source` with a Lexer and returns the tokens that precede
	// EndOfFile. `file_name` is handed to the Lexer, which stores it in the
	// location attached to each token.
	std::vector<Token> lex(const std::string &source, const std::string &file_name = "<stdio>");
	#pragma once

	#include <string>
	#include <iostream>

	// Position inside a source file: line, column and file name.
	// Kept as a plain aggregate so it can be brace-initialized.
	class Location
	{
	public:
	    int line;
	    int column;
	    std::string file;

	    // Returns a by-value duplicate of this location.
	    Location copy() const
	    {
	        Location duplicate{line, column, file};
	        return duplicate;
	    }

	    // Writes the location as: Location {line=L, column=C, file='F'}
	    friend std::ostream &operator<<(std::ostream &os, const Location &location)
	    {
	        os << "Location {";
	        os << "line=" << location.line << ", ";
	        os << "column=" << location.column << ", ";
	        os << "file='" << location.file << "'";
	        os << "}";
	        return os;
	    }
	};

	// A lexical token: its category, the text it was built from and the
	// location recorded for it. Kept as a plain aggregate so it can be
	// brace-initialized as Token{type, lexeme, location}.
	class Token
	{
	public:
	    enum class Type
	    {
	        Identifier,  // Variable name
	        Integer,     // Integer number
	        Float,       // Floating point number
	        String,      // String "Hello, World!"
	        Char,        // Character 'c'
	        Operator,    // Operator like +, -, *, /, etc.
	        Punctuation, // Punctuation like ;, (, ), etc.
	        Keyword,     // Keyword like if, else, while, etc.
	        EndOfFile,   // End of file
	        Unknown      // Unknown token
	    } type;
	    std::string lexeme;
	    Location location;

	    // Writes the token as: Token {type=T, lexeme='X', location=...}
	    friend std::ostream &operator<<(std::ostream &os, const Token &token)
	    {
	        // Stays empty for a value outside the enumerators listed above.
	        const char *label = "";

	        switch (token.type)
	        {
	        case Type::Identifier:  label = "Identifier";  break;
	        case Type::Integer:     label = "Integer";     break;
	        case Type::Float:       label = "Float";       break;
	        case Type::String:      label = "String";      break;
	        case Type::Char:        label = "Char";        break;
	        case Type::Operator:    label = "Operator";    break;
	        case Type::Punctuation: label = "Punctuation"; break;
	        case Type::Keyword:     label = "Keyword";     break;
	        case Type::EndOfFile:   label = "EndOfFile";   break;
	        case Type::Unknown:     label = "Unknown";     break;
	        }

	        os << "Token {";
	        os << "type=" << label << ", ";
	        os << "lexeme='" << token.lexeme << "', ";
	        os << "location=" << token.location << "}";
	        return os;
	    }
	};