Last active
June 6, 2019 18:56
-
-
Save markhc/a2ea81dcab97d9744cb5f2dab2bd5b70 to your computer and use it in GitHub Desktop.
Lexer code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "Lexer.hpp"

#include <stdexcept>
namespace sage | |
{ | |
namespace detail | |
{ | |
// ---------------------------------------------------------------------------- | |
/// Returns the character at `ptr` and advances `ptr` by one.
inline char getCharAndAdvance(const char*& ptr)
{
  char const current = *ptr;
  ++ptr;
  return current;
}
// ---------------------------------------------------------------------------- | |
/// True for whitespace that does not end the current line:
/// space, tab, form feed, vertical tab.
inline bool isHorizontalWhitespace(char c)
{
  switch(c) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
// ---------------------------------------------------------------------------- | |
/// True for line-ending characters: carriage return or newline.
inline bool isVerticalWhitespace(char c)
{
  switch(c) {
  case '\r':
  case '\n':
    return true;
  default:
    return false;
  }
}
// ---------------------------------------------------------------------------- | |
/// Takes ownership of the source text and points the lexing cursors at it.
/// All buffer pointers reference the stored copy, never the caller's string.
Lexer::Lexer(std::string input)
  : m_input(std::move(input))
{
  m_bufferStart = m_input.data();
  m_bufferEnd   = m_bufferStart + m_input.size();
  m_bufferPtr   = m_bufferStart;
}
// ---------------------------------------------------------------------------- | |
/// Advances past a run of whitespace starting at `curPtr`.
/// Returns true if a Whitespace token was produced (keep-whitespace mode
/// only); otherwise just moves m_bufferPtr past the run and returns false.
bool Lexer::skipWhitespace(Token& result, const char* curPtr)
{
  // Consume every consecutive whitespace character (horizontal or vertical).
  // The NUL terminator is neither, so this cannot run past the buffer.
  while(isHorizontalWhitespace(*curPtr) || isVerticalWhitespace(*curPtr))
    ++curPtr;

  // If the client wants whitespace tokens, hand this run back as one.
  if(m_keepWhitespace) {
    formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
    return true;
  }

  m_bufferPtr = curPtr;
  return false;
}
// ---------------------------------------------------------------------------- | |
/// Scans over the body of a line comment starting at `curPtr`.
/// Returns true if a Comment token was produced (keep-comments mode only);
/// otherwise positions m_bufferPtr after the comment and returns false.
bool Lexer::skipLineComment(Token& result, const char* curPtr)
{
  // Scan over the body of the comment. The common case, when scanning, is
  // that the comment contains normal ascii characters with nothing
  // interesting in them; the loop below terminates with curPtr pointing at
  // the newline (or the NUL terminator at end of buffer) that ends the
  // comment. (The previous `while(true)` wrapper around this loop always
  // broke on the first pass and has been removed.)
  char c = *curPtr;
  while(c != 0 &&              // Potentially EOF.
        c != '\n' && c != '\r') // Newline or DOS-style newline.
    c = *++curPtr;

  // If we are returning comments as tokens, return this comment as a token.
  if(m_keepComments) {
    formTokenWithChars(result, curPtr, Token::Kind::Comment);
    return true;
  }

  // Otherwise, eat the newline character. We don't care if this is a \n\r
  // or \r\n sequence; this is an efficiency hack (the \n can't contribute
  // to another token). BUG FIX: only advance when we actually stopped on a
  // newline — the old unconditional `++curPtr` also skipped the NUL
  // terminator at end of buffer, which pushed m_bufferPtr past m_bufferEnd,
  // suppressed the EndOfFile token, and let later code read out of bounds.
  if(c == '\n' || c == '\r')
    ++curPtr;
  m_bufferPtr = curPtr;
  return false;
}
// ---------------------------------------------------------------------------- | |
/// Lexes the remainder of an identifier. The caller has already consumed
/// the leading [_A-Za-z]; this consumes the following [_A-Za-z0-9]* and
/// forms an Identifier token covering the whole name.
bool Lexer::lexIdentifier(Token& result, const char* curPtr)
{
  auto const isIdentChar = [](unsigned char ch) {
    return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') ||
           (ch >= '0' && ch <= '9') || ch == '_';
  };

  // Scan forward while the next character can continue the identifier;
  // curPtr ends up on the first non-identifier character.
  while(isIdentChar(static_cast<unsigned char>(*curPtr)))
    ++curPtr;

  formTokenWithChars(result, curPtr, Token::Kind::Identifier);
  return true;
}
// ---------------------------------------------------------------------------- | |
/// Lexes a numeric constant. Not implemented yet — always throws.
bool Lexer::lexNumericConstant(Token& result, const char* curPtr)
{
  (void)result;
  (void)curPtr;
  // BUG FIX: std::exception has no (const char*) constructor in standard
  // C++ — that is an MSVC-only extension and fails to compile elsewhere.
  // std::runtime_error carries the message portably and derives from
  // std::exception, so existing catch sites still work.
  throw std::runtime_error("Not implemented yet");
}
// ---------------------------------------------------------------------------- | |
/// Lexes a string literal after the opening quote has been consumed.
/// Forms a StringLiteral token on success; on an unterminated literal
/// (newline or end of buffer before the closing quote) forms an Unknown
/// token ending just before the offending character. Always returns true.
bool Lexer::lexStringLiteral(Token& result,
                             const char* curPtr,
                             bool isSingleQuoted)
{
  char const quote = isSingleQuoted ? '\'' : '"';

  for(char c = getCharAndAdvance(curPtr); c != quote;
      c = getCharAndAdvance(curPtr)) {
    // A backslash escapes the next character, including the quote itself.
    if(c == '\\')
      c = getCharAndAdvance(curPtr);

    // Literals may not span lines; the NUL at m_bufferEnd marks end of file.
    bool const atEndOfFile = (c == 0 && curPtr - 1 == m_bufferEnd);
    if(c == '\n' || c == '\r' || atEndOfFile) {
      formTokenWithChars(result, curPtr - 1, Token::Kind::Unknown);
      return true;
    }
  }

  formTokenWithChars(result, curPtr, Token::Kind::StringLiteral);
  return true;
}
// ---------------------------------------------------------------------------- | |
/// Fills `result` with the token spanning [m_bufferPtr, tokEnd) and the
/// given kind, then advances m_bufferPtr to tokEnd so lexing resumes after
/// the token.
void Lexer::formTokenWithChars(Token& result,
                               const char* tokEnd,
                               Token::Kind kind)
{
  auto const length = static_cast<uint32_t>(tokEnd - m_bufferPtr);
  result.setKind(kind);
  result.setLength(length);
  result.setLocation(getSourceLocation(m_bufferPtr));
  result.setIdentifier(m_bufferPtr, length);
  // Consume the token's characters.
  m_bufferPtr = tokEnd;
}
/// Computes the 1-based line (and 0-based column) of `ptr` by rescanning
/// the buffer from the start. O(ptr - m_bufferStart), called per token.
SourceLocation Lexer::getSourceLocation(const char* ptr)
{
  auto loc = SourceLocation{};
  loc.line = 1;
  auto curPtr = m_bufferStart;
  while(curPtr < ptr) {
    char const c = *curPtr++;
    if(isVerticalWhitespace(c)) {
      // BUG FIX: treat a "\r\n" pair as a single line terminator — the old
      // code counted both characters, double-incrementing the line number
      // for files with DOS line endings.
      if(c == '\r' && curPtr < ptr && *curPtr == '\n')
        ++curPtr;
      loc.line++;
      loc.column = 0;
    } else {
      loc.column++;
    }
  }
  return loc;
}
// ---------------------------------------------------------------------------- | |
/// Lexes a single token starting at m_bufferPtr into `result`.
/// Returns true if a token was produced (including EndOfFile); returns
/// false once m_bufferPtr has moved past the end of the buffer.
bool Lexer::lexToken(Token& result)
{
LexNextToken:
  const char* curPtr = m_bufferPtr;
  if(curPtr > m_bufferEnd)
    return false;
  // Small amounts of horizontal whitespace are very common between tokens.
  if((*curPtr == ' ') || (*curPtr == '\t')) {
    ++curPtr;
    while((*curPtr == ' ') || (*curPtr == '\t'))
      ++curPtr;
    if(m_keepWhitespace) {
      // If the user wants to keep whitespaces
      // form a new token and return
      formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
      return true;
    }
    m_bufferPtr = curPtr;
  }
  auto c = getCharAndAdvance(curPtr);
  auto kind = Token::Kind::Unknown;
  // Lex identifiers and constants
  if(c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
    return lexIdentifier(result, curPtr);
  if(c >= '0' && c <= '9')
    return lexNumericConstant(result, curPtr);
  switch(c) {
  case '\0':
    // Found EOF? (the string's NUL terminator sits at m_bufferEnd)
    if(curPtr - 1 == m_bufferEnd) {
      formTokenWithChars(result, curPtr, Token::Kind::EndOfFile);
      return true;
    }
    // An embedded NUL inside the buffer: treat as whitespace.
    if(skipWhitespace(result, curPtr))
      return true;
    goto LexNextToken;
  case '\r':
    // Fold a DOS-style "\r\n" pair before handling it like a newline.
    if(*curPtr == '\n')
      c = getCharAndAdvance(curPtr);
    [[fallthrough]];
  case '\n':
    if(skipWhitespace(result, curPtr))
      return true;
    // We only saw whitespace, so just try again
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    if(skipWhitespace(result, curPtr))
      return true;
  SkipIgnoredUnits:
    curPtr = m_bufferPtr;
    // If the next token is obviously a // or /* */ comment, skip it
    // efficiently too (without going through the big switch stmt).
    if(curPtr[0] == '/' && curPtr[1] == '/' && !m_keepComments) {
      if(skipLineComment(result, curPtr + 2))
        return true;
      goto SkipIgnoredUnits;
    } else if(isHorizontalWhitespace(*curPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case '\'':
    return lexStringLiteral(result, curPtr, true);
  case '"':
    return lexStringLiteral(result, curPtr, false);
  case '?':
    kind = Token::Kind::Question;
    break;
  case '[':
    kind = Token::Kind::LeftSquare;
    break;
  case ']':
    kind = Token::Kind::RightSquare;
    break;
  case '(':
    kind = Token::Kind::LeftParen;
    break;
  case ')':
    kind = Token::Kind::RightParen;
    break;
  case '{':
    kind = Token::Kind::LeftCurly;
    break;
  case '}':
    kind = Token::Kind::RightCurly;
    break;
  case '.': {
    // A '.' followed by a digit starts a numeric constant (e.g. ".5").
    auto const next = *curPtr;
    if(next >= '0' && next <= '9')
      return lexNumericConstant(result, curPtr);
    kind = Token::Kind::Period;
    // BUG FIX: this case previously fell through into `case '*'`, so every
    // Period token was overwritten and reported as Star.
    break;
  }
  case '*':
    kind = Token::Kind::Star;
    break;
  case '+':
    kind = Token::Kind::Plus;
    break;
  case '-':
    kind = Token::Kind::Minus;
    break;
  case '!':
    kind = Token::Kind::Exclam;
    break;
  case '/':
    if(*curPtr == '/') {
      if(skipLineComment(result, curPtr + 1))
        return true; // There is a token to return.
      // It is common for the tokens immediately after a // comment to be
      // whitespace (indentation for the next line). Instead of going through
      // the big switch, handle it efficiently now.
      goto SkipIgnoredUnits;
    }
    kind = Token::Kind::Slash;
    break;
  case '>':
    kind = Token::Kind::Greater;
    break;
  case ':':
    kind = Token::Kind::Colon;
    break;
  case ';':
    kind = Token::Kind::SemiColon;
    break;
  case ',':
    kind = Token::Kind::Comma;
    break;
  case '#':
    kind = Token::Kind::Hash;
    break;
  default:
    kind = Token::Kind::Unknown;
    break;
  }
  formTokenWithChars(result, curPtr, kind);
  return true;
}
// ---------------------------------------------------------------------------- | |
std::vector<Token> Lexer::lexAllTokens() | |
{ | |
auto result = std::vector<Token>{}; | |
auto token = Token{}; | |
while(lexToken(token)) { | |
result.emplace_back(std::move(token)); | |
} | |
return result; | |
} | |
// ---------------------------------------------------------------------------- | |
} // namespace detail | |
} // namespace sage |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <ostream> | |
#include <string> | |
#include <vector> | |
#include "Token.hpp" | |
namespace sage | |
{ | |
namespace detail | |
{ | |
/// Hand-written lexer over an in-memory source string. Construct it with
/// the full input text, then pull tokens one at a time with lexToken() or
/// all at once with lexAllTokens().
class Lexer
{
public:
  /// Takes ownership of the source text; internal pointers reference the
  /// stored copy for the lexer's lifetime.
  Lexer(std::string input);
  /// Lexes the next token into `result`; returns false once the buffer
  /// is exhausted.
  bool lexToken(Token& result);
  /// Repeatedly calls lexToken() and returns every produced token.
  std::vector<Token> lexAllTokens();
  /// When enabled, runs of whitespace are emitted as Whitespace tokens
  /// instead of being skipped.
  void keepWhitespace(bool keep) { m_keepWhitespace = keep; }
  /// When enabled, comments are emitted as Comment tokens instead of
  /// being skipped.
  void keepComments(bool keep) { m_keepComments = keep; }
private:
  // Skips whitespace at curPtr; returns true only if a token was formed
  // (keep-whitespace mode).
  bool skipWhitespace(Token& result, const char* curPtr);
  // Skips a line-comment body; returns true only if a token was formed
  // (keep-comments mode).
  bool skipLineComment(Token& result, const char* curPtr);
  // Lexes the tail of an identifier (first char already consumed).
  bool lexIdentifier(Token& result, const char* curPtr);
  // Lexes a numeric constant (not implemented in the .cpp — throws).
  bool lexNumericConstant(Token& result, const char* curPtr);
  // Lexes a quoted string/char literal (opening quote already consumed).
  bool lexStringLiteral(Token& result, const char* curPtr, bool isSingleQuoted);
  // Fills `result` with [m_bufferPtr, tokEnd) and advances m_bufferPtr.
  void formTokenWithChars(Token& result, const char* tokEnd, Token::Kind kind);
  // Computes line/column of `ptr` by rescanning from the buffer start.
  SourceLocation getSourceLocation(const char* ptr);
  std::string m_input;          // Owned source text.
  const char* m_bufferStart;    // First character of m_input.
  const char* m_bufferEnd;      // One past the last character of m_input.
  const char* m_bufferPtr;      // Start of the next token to lex.
  bool m_keepComments{true};    // Emit Comment tokens when true.
  bool m_keepWhitespace{false}; // Emit Whitespace tokens when true.
};
} // namespace detail | |
} // namespace sage |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment