-
-
Save arrieta/1a309138689e09375b90b3b1aa768e20 to your computer and use it in GitHub Desktop.
| // A simple Lexer meant to demonstrate a few theoretical concepts. It can | |
| // support several parser concepts and is very fast (though speed is not its | |
| // design goal). | |
| // | |
| // J. Arrieta, Nabla Zero Labs | |
| // | |
| // This code is released under the MIT License. | |
| // | |
| // Copyright 2018 Nabla Zero Labs | |
| // | |
| // Permission is hereby granted, free of charge, to any person obtaining a copy | |
| // of this software and associated documentation files(the "Software"), to deal | |
| // in the Software without restriction, including without limitation the rights | |
| // to use, copy, modify, merge, publish ,distribute, sublicense, and / or sell | |
| // copies of the Software, and to permit persons to whom the Software is | |
| // furnished to do so, subject to the following conditions: | |
| // | |
| // The above copyright notice and this permission notice shall be included in | |
| // all copies or substantial portions of the Software. | |
| // | |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE | |
| // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| // SOFTWARE. | |
| #include <string> | |
| class Token { | |
| public: | |
| enum class Kind { | |
| Number, | |
| Identifier, | |
| LeftParen, | |
| RightParen, | |
| LeftSquare, | |
| RightSquare, | |
| LeftCurly, | |
| RightCurly, | |
| LessThan, | |
| GreaterThan, | |
| Equal, | |
| Plus, | |
| Minus, | |
| Asterisk, | |
| Slash, | |
| Hash, | |
| Dot, | |
| Comma, | |
| Colon, | |
| Semicolon, | |
| SingleQuote, | |
| DoubleQuote, | |
| Comment, | |
| Pipe, | |
| End, | |
| Unexpected, | |
| }; | |
| Token(Kind kind) noexcept : m_kind{kind} {} | |
| Token(Kind kind, const char* beg, std::size_t len) noexcept | |
| : m_kind{kind}, m_lexeme(beg, len) {} | |
| Token(Kind kind, const char* beg, const char* end) noexcept | |
| : m_kind{kind}, m_lexeme(beg, std::distance(beg, end)) {} | |
| Kind kind() const noexcept { return m_kind; } | |
| void kind(Kind kind) noexcept { m_kind = kind; } | |
| bool is(Kind kind) const noexcept { return m_kind == kind; } | |
| bool is_not(Kind kind) const noexcept { return m_kind != kind; } | |
| bool is_one_of(Kind k1, Kind k2) const noexcept { return is(k1) || is(k2); } | |
| template <typename... Ts> | |
| bool is_one_of(Kind k1, Kind k2, Ts... ks) const noexcept { | |
| return is(k1) || is_one_of(k2, ks...); | |
| } | |
| std::string_view lexeme() const noexcept { return m_lexeme; } | |
| void lexeme(std::string_view lexeme) noexcept { | |
| m_lexeme = std::move(lexeme); | |
| } | |
| private: | |
| Kind m_kind{}; | |
| std::string_view m_lexeme{}; | |
| }; | |
| class Lexer { | |
| public: | |
| Lexer(const char* beg) noexcept : m_beg{beg} {} | |
| Token next() noexcept; | |
| private: | |
| Token identifier() noexcept; | |
| Token number() noexcept; | |
| Token slash_or_comment() noexcept; | |
| Token atom(Token::Kind) noexcept; | |
| char peek() const noexcept { return *m_beg; } | |
| char get() noexcept { return *m_beg++; } | |
| const char* m_beg = nullptr; | |
| }; | |
| bool is_space(char c) noexcept { | |
| switch (c) { | |
| case ' ': | |
| case '\t': | |
| case '\r': | |
| case '\n': | |
| return true; | |
| default: | |
| return false; | |
| } | |
| } | |
| bool is_digit(char c) noexcept { | |
| switch (c) { | |
| case '0': | |
| case '1': | |
| case '2': | |
| case '3': | |
| case '4': | |
| case '5': | |
| case '6': | |
| case '7': | |
| case '8': | |
| case '9': | |
| return true; | |
| default: | |
| return false; | |
| } | |
| } | |
| bool is_identifier_char(char c) noexcept { | |
| switch (c) { | |
| case 'a': | |
| case 'b': | |
| case 'c': | |
| case 'd': | |
| case 'e': | |
| case 'f': | |
| case 'g': | |
| case 'h': | |
| case 'i': | |
| case 'j': | |
| case 'k': | |
| case 'l': | |
| case 'm': | |
| case 'n': | |
| case 'o': | |
| case 'p': | |
| case 'q': | |
| case 'r': | |
| case 's': | |
| case 't': | |
| case 'u': | |
| case 'v': | |
| case 'w': | |
| case 'x': | |
| case 'y': | |
| case 'z': | |
| case 'A': | |
| case 'B': | |
| case 'C': | |
| case 'D': | |
| case 'E': | |
| case 'F': | |
| case 'G': | |
| case 'H': | |
| case 'I': | |
| case 'J': | |
| case 'K': | |
| case 'L': | |
| case 'M': | |
| case 'N': | |
| case 'O': | |
| case 'P': | |
| case 'Q': | |
| case 'R': | |
| case 'S': | |
| case 'T': | |
| case 'U': | |
| case 'V': | |
| case 'W': | |
| case 'X': | |
| case 'Y': | |
| case 'Z': | |
| case '0': | |
| case '1': | |
| case '2': | |
| case '3': | |
| case '4': | |
| case '5': | |
| case '6': | |
| case '7': | |
| case '8': | |
| case '9': | |
| case '_': | |
| return true; | |
| default: | |
| return false; | |
| } | |
| } | |
| Token Lexer::atom(Token::Kind kind) noexcept { return Token(kind, m_beg++, 1); } | |
| Token Lexer::next() noexcept { | |
| while (is_space(peek())) get(); | |
| switch (peek()) { | |
| case '\0': | |
| return Token(Token::Kind::End, m_beg, 1); | |
| default: | |
| return atom(Token::Kind::Unexpected); | |
| case 'a': | |
| case 'b': | |
| case 'c': | |
| case 'd': | |
| case 'e': | |
| case 'f': | |
| case 'g': | |
| case 'h': | |
| case 'i': | |
| case 'j': | |
| case 'k': | |
| case 'l': | |
| case 'm': | |
| case 'n': | |
| case 'o': | |
| case 'p': | |
| case 'q': | |
| case 'r': | |
| case 's': | |
| case 't': | |
| case 'u': | |
| case 'v': | |
| case 'w': | |
| case 'x': | |
| case 'y': | |
| case 'z': | |
| case 'A': | |
| case 'B': | |
| case 'C': | |
| case 'D': | |
| case 'E': | |
| case 'F': | |
| case 'G': | |
| case 'H': | |
| case 'I': | |
| case 'J': | |
| case 'K': | |
| case 'L': | |
| case 'M': | |
| case 'N': | |
| case 'O': | |
| case 'P': | |
| case 'Q': | |
| case 'R': | |
| case 'S': | |
| case 'T': | |
| case 'U': | |
| case 'V': | |
| case 'W': | |
| case 'X': | |
| case 'Y': | |
| case 'Z': | |
| return identifier(); | |
| case '0': | |
| case '1': | |
| case '2': | |
| case '3': | |
| case '4': | |
| case '5': | |
| case '6': | |
| case '7': | |
| case '8': | |
| case '9': | |
| return number(); | |
| case '(': | |
| return atom(Token::Kind::LeftParen); | |
| case ')': | |
| return atom(Token::Kind::RightParen); | |
| case '[': | |
| return atom(Token::Kind::LeftSquare); | |
| case ']': | |
| return atom(Token::Kind::RightSquare); | |
| case '{': | |
| return atom(Token::Kind::LeftCurly); | |
| case '}': | |
| return atom(Token::Kind::RightCurly); | |
| case '<': | |
| return atom(Token::Kind::LessThan); | |
| case '>': | |
| return atom(Token::Kind::GreaterThan); | |
| case '=': | |
| return atom(Token::Kind::Equal); | |
| case '+': | |
| return atom(Token::Kind::Plus); | |
| case '-': | |
| return atom(Token::Kind::Minus); | |
| case '*': | |
| return atom(Token::Kind::Asterisk); | |
| case '/': | |
| return slash_or_comment(); | |
| case '#': | |
| return atom(Token::Kind::Hash); | |
| case '.': | |
| return atom(Token::Kind::Dot); | |
| case ',': | |
| return atom(Token::Kind::Comma); | |
| case ':': | |
| return atom(Token::Kind::Colon); | |
| case ';': | |
| return atom(Token::Kind::Semicolon); | |
| case '\'': | |
| return atom(Token::Kind::SingleQuote); | |
| case '"': | |
| return atom(Token::Kind::DoubleQuote); | |
| case '|': | |
| return atom(Token::Kind::Pipe); | |
| } | |
| } | |
| Token Lexer::identifier() noexcept { | |
| const char* start = m_beg; | |
| get(); | |
| while (is_identifier_char(peek())) get(); | |
| return Token(Token::Kind::Identifier, start, m_beg); | |
| } | |
| Token Lexer::number() noexcept { | |
| const char* start = m_beg; | |
| get(); | |
| while (is_digit(peek())) get(); | |
| return Token(Token::Kind::Number, start, m_beg); | |
| } | |
| Token Lexer::slash_or_comment() noexcept { | |
| const char* start = m_beg; | |
| get(); | |
| if (peek() == '/') { | |
| get(); | |
| start = m_beg; | |
| while (peek() != '\0') { | |
| if (get() == '\n') { | |
| return Token(Token::Kind::Comment, start, | |
| std::distance(start, m_beg) - 1); | |
| } | |
| } | |
| return Token(Token::Kind::Unexpected, m_beg, 1); | |
| } else { | |
| return Token(Token::Kind::Slash, start, 1); | |
| } | |
| } | |
| #include <iomanip> | |
| #include <iostream> | |
| std::ostream& operator<<(std::ostream& os, const Token::Kind& kind) { | |
| static const char* const names[]{ | |
| "Number", "Identifier", "LeftParen", "RightParen", "LeftSquare", | |
| "RightSquare", "LeftCurly", "RightCurly", "LessThan", "GreaterThan", | |
| "Equal", "Plus", "Minus", "Asterisk", "Slash", | |
| "Hash", "Dot", "Comma", "Colon", "Semicolon", | |
| "SingleQuote", "DoubleQuote", "Comment", "Pipe", "End", | |
| "Unexpected", | |
| }; | |
| return os << names[static_cast<int>(kind)]; | |
| } | |
| int main() { | |
| auto code = | |
| "x = 2\n" | |
| "// This is a comment.\n" | |
| "var x\n" | |
| "var y\n" | |
| "var f = function(x, y) { sin(x) * sin(y) + x * y; }\n" | |
| "der(f, x)\n" | |
| "var g = function(x, y) { 2 * (x + der(f, y)); } // der(f, y) is a " | |
| "matrix\n" | |
| "var r{3}; // Vector of three elements\n" | |
| "var J{12, 12}; // Matrix of 12x12 elements\n" | |
| "var dot = function(u{:}, v{:}) -> scalar {\n" | |
| " return u[i] * v[i]; // Einstein notation\n" | |
| "}\n" | |
| "var norm = function(u{:}) -> scalar { return sqrt(dot(u, u)); }\n" | |
| "<end>"; | |
| Lexer lex(code); | |
| for (auto token = lex.next(); | |
| not token.is_one_of(Token::Kind::End, Token::Kind::Unexpected); | |
| token = lex.next()) { | |
| std::cout << std::setw(12) << token.kind() << " |" << token.lexeme() | |
| << "|\n"; | |
| } | |
| } |
Thanks. What is license of the code?
Thanks. What is license of the code?
I also want to know the license.
Hello onurblt and asmwarrior. The code is released under the MIT License (I've updated the gist to reflect this fact). I hope you find it useful.
Hi, thanks for sharing your code. Is there any chance to add support for floats? If so, it would be better to use a finite state machine or just a regular expression?
Nice and easy example. Thanks! :)
@ice-bit for handling decimals as numbers just like those with integers is done
add this function:
bool is_decimal(char c) noexcept {
switch(c) {
case '.':
return true;
default:
return false;
}
}
and in the function Token Lexer::number() just make this very small change:
while (is_digit(peek()) || is_decimal(peek())) get();
With the following input
auto code =
"x = 2.066721823991;\n"
"var a = a.b()";
Output is:
Identifier |x|
Equal |=|
Number |2.066721823991|
Semicolon |;|
Identifier |var|
Identifier |a|
Equal |=|
Identifier |a|
Dot |.|
Identifier |b|
LeftParen |(|
RightParen |)|
@godoio the problem with that approach is that it would recognize, for example, 123.456.789 as a valid number. Parsing floats is not too difficult, but certainly much more involved. For that reason, I believe that floats belong in a parser, not in a lexer.
Nice code. Why not use the built in C functions instead of defining your own for is_digit, etc? You can just us the C function isdigit for numbers, isalpha for identifiers, etc.
@jakerieger No particular reason. Standard library functions would have worked.
As a minor detail, std::isdigit and std::isalpha expect an int. When one writes a lexer that lexes char, that's not a problem. However, if one were to write a lexer that lexes a stream of codepoints, then one would need to write their own is_xxx.
@arrieta Makes sense
MIT License
What would you recommend to anyone you want to make one form scratch with basic knowledge of C++ working on for my programing Language.
Sample run