@markhc
Last active June 6, 2019 18:56
Lexer code

Lexer.cpp:

#include "Lexer.hpp"
#include <stdexcept>
namespace sage
{
namespace detail
{
// ----------------------------------------------------------------------------
inline char getCharAndAdvance(const char*& ptr)
{
return *ptr++;
}
// ----------------------------------------------------------------------------
inline bool isHorizontalWhitespace(char c)
{
return c == ' ' || c == '\t' || c == '\f' || c == '\v';
}
// ----------------------------------------------------------------------------
inline bool isVerticalWhitespace(char c)
{
return c == '\r' || c == '\n';
}
// ----------------------------------------------------------------------------
Lexer::Lexer(std::string input)
: m_input(std::move(input))
, m_bufferStart(m_input.data())
, m_bufferEnd(m_input.data() + m_input.size())
, m_bufferPtr(m_input.data())
{
}
// ----------------------------------------------------------------------------
bool Lexer::skipWhitespace(Token& result, const char* curPtr)
{
auto c = *curPtr;
// Skip consecutive whitespace characters.
while(true) {
if(!isHorizontalWhitespace(c) && !isVerticalWhitespace(c)) {
// if we have something other than whitespace, we're done.
break;
}
c = *++curPtr;
}
// If the client wants us to return whitespace, return it now.
if(m_keepWhitespace) {
formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
return true;
}
m_bufferPtr = curPtr;
return false;
}
// ----------------------------------------------------------------------------
bool Lexer::skipLineComment(Token& result, const char* curPtr)
{
// Scan over the body of the comment. The common case, when scanning, is that
// the comment contains normal ASCII characters with nothing interesting in
// them. As such, optimize for this case with a tight inner loop.
//
// The loop terminates with curPtr pointing at the newline (or end of buffer)
// character that ends the line comment.
char c = *curPtr;
while(c != 0 && // Potentially EOF.
c != '\n' && c != '\r') // Newline or DOS-style newline.
c = *++curPtr;
// If we are returning comments as tokens, return this comment as a token.
if(m_keepComments) {
formTokenWithChars(result, curPtr, Token::Kind::Comment);
return true;
}
// Otherwise, eat the newline character. We don't care whether this is a
// "\n\r" or "\r\n" sequence; this is an efficiency hack (we know the newline
// can't contribute to another token) and isn't needed for correctness. Note
// that this is ok even in whitespace-keeping mode, because we would have
// returned the comment above in that mode.
++curPtr;
m_bufferPtr = curPtr;
return false;
}
// ----------------------------------------------------------------------------
bool Lexer::lexIdentifier(Token& result, const char* curPtr)
{
// Match [_A-Za-z0-9]*; we have already matched [_A-Za-z].
unsigned char C = *curPtr++;
while((C >= 'A' && C <= 'Z') || (C >= 'a' && C <= 'z') ||
(C >= '0' && C <= '9') || C == '_')
C = *curPtr++;
--curPtr; // Back up over the skipped character.
formTokenWithChars(result, curPtr, Token::Kind::Identifier);
return true;
}
// ----------------------------------------------------------------------------
bool Lexer::lexNumericConstant(Token& result, const char* curPtr)
{
(void)result;
(void)curPtr;
// std::exception has no const char* constructor in standard C++,
// so throw a standard exception type instead.
throw std::runtime_error("Not implemented yet");
}
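// A possible shape for this function, sketched here only as a comment (it is
// not part of the gist, and it assumes a dedicated numeric Token::Kind would
// eventually be added; Token::Kind::Unknown is used below as a stand-in).
// Like lexIdentifier, it would be entered with the first digit already consumed:
//
//   unsigned char c = *curPtr;
//   while(c >= '0' && c <= '9')   // Integer part.
//     c = *++curPtr;
//   if(c == '.' && curPtr[1] >= '0' && curPtr[1] <= '9') {
//     c = *++curPtr;              // Consume the '.'.
//     while(c >= '0' && c <= '9') // Fractional part.
//       c = *++curPtr;
//   }
//   formTokenWithChars(result, curPtr, Token::Kind::Unknown);
//   return true;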
// ----------------------------------------------------------------------------
bool Lexer::lexStringLiteral(Token& result,
const char* curPtr,
bool isSingleQuoted)
{
auto const quote = isSingleQuoted ? '\'' : '"';
char c = getCharAndAdvance(curPtr);
while(c != quote) {
// Skip escaped characters.
if(c == '\\')
c = getCharAndAdvance(curPtr);
if(c == '\n' || c == '\r' ||
(c == 0 && curPtr - 1 == m_bufferEnd)) { // End of file.
formTokenWithChars(result, curPtr - 1, Token::Kind::Unknown);
return true;
}
c = getCharAndAdvance(curPtr);
}
formTokenWithChars(result, curPtr, Token::Kind::StringLiteral);
return true;
}
// ----------------------------------------------------------------------------
void Lexer::formTokenWithChars(Token& result,
const char* tokEnd,
Token::Kind kind)
{
auto const tokLen = static_cast<uint32_t>(tokEnd - m_bufferPtr);
result.setKind(kind);
result.setLength(tokLen);
result.setLocation(getSourceLocation(m_bufferPtr));
result.setIdentifier(m_bufferPtr, tokLen);
m_bufferPtr = tokEnd;
}
// ----------------------------------------------------------------------------
SourceLocation Lexer::getSourceLocation(const char* ptr)
{
auto loc = SourceLocation{};
auto curPtr = m_bufferStart;
loc.line = 1;
while(curPtr < ptr) {
if(isVerticalWhitespace(*curPtr++)) {
loc.line++;
loc.column = 0;
} else {
loc.column++;
}
}
return loc;
}
// ----------------------------------------------------------------------------
bool Lexer::lexToken(Token& result)
{
LexNextToken:
const char* curPtr = m_bufferPtr;
if(curPtr > m_bufferEnd)
return false;
// Small amounts of horizontal whitespace are very common between tokens.
if((*curPtr == ' ') || (*curPtr == '\t')) {
++curPtr;
while((*curPtr == ' ') || (*curPtr == '\t'))
++curPtr;
if(m_keepWhitespace) {
// If the user wants to keep whitespace,
// form a new token and return it.
formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
return true;
}
m_bufferPtr = curPtr;
}
auto c = getCharAndAdvance(curPtr);
auto kind = Token::Kind::Unknown;
// Lex identifiers and constants
if(c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
return lexIdentifier(result, curPtr);
if(c >= '0' && c <= '9')
return lexNumericConstant(result, curPtr);
switch(c) {
case '\0':
// Found EOF?
if(curPtr - 1 == m_bufferEnd) {
formTokenWithChars(result, curPtr, Token::Kind::EndOfFile);
return true;
}
if(skipWhitespace(result, curPtr))
return true;
goto LexNextToken;
case '\r':
if(*curPtr == '\n')
c = getCharAndAdvance(curPtr);
[[fallthrough]];
case '\n':
if(skipWhitespace(result, curPtr))
return true;
// We only saw whitespace, so just try again
goto LexNextToken;
case ' ':
case '\t':
case '\f':
case '\v':
SkipHorizontalWhitespace:
if(skipWhitespace(result, curPtr))
return true;
SkipIgnoredUnits:
curPtr = m_bufferPtr;
// If the next token is obviously a // comment, skip it efficiently too
// (without going through the big switch statement).
if(curPtr[0] == '/' && curPtr[1] == '/' && !m_keepComments) {
if(skipLineComment(result, curPtr + 2))
return true;
goto SkipIgnoredUnits;
} else if(isHorizontalWhitespace(*curPtr)) {
goto SkipHorizontalWhitespace;
}
// We only saw whitespace, so just try again
// (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
case '\'':
return lexStringLiteral(result, curPtr, true);
case '"':
return lexStringLiteral(result, curPtr, false);
case '?':
kind = Token::Kind::Question;
break;
case '[':
kind = Token::Kind::LeftSquare;
break;
case ']':
kind = Token::Kind::RightSquare;
break;
case '(':
kind = Token::Kind::LeftParen;
break;
case ')':
kind = Token::Kind::RightParen;
break;
case '{':
kind = Token::Kind::LeftCurly;
break;
case '}':
kind = Token::Kind::RightCurly;
break;
case '.': {
auto const next = *curPtr;
if(next >= '0' && next <= '9') {
return lexNumericConstant(result, curPtr);
} else {
kind = Token::Kind::Period;
}
break; // Without this break, '.' would fall through and become Star.
}
case '*':
kind = Token::Kind::Star;
break;
case '+':
kind = Token::Kind::Plus;
break;
case '-':
kind = Token::Kind::Minus;
break;
case '!':
kind = Token::Kind::Exclam;
break;
case '/':
if(*curPtr == '/') {
if(skipLineComment(result, curPtr + 1))
return true; // There is a token to return.
// It is common for the tokens immediately after a // comment to be
// whitespace (indentation for the next line). Instead of going through
// the big switch, handle it efficiently now.
goto SkipIgnoredUnits;
}
kind = Token::Kind::Slash;
break;
case '>':
kind = Token::Kind::Greater;
break;
case ':':
kind = Token::Kind::Colon;
break;
case ';':
kind = Token::Kind::SemiColon;
break;
case ',':
kind = Token::Kind::Comma;
break;
case '#':
kind = Token::Kind::Hash;
break;
default:
kind = Token::Kind::Unknown;
break;
}
formTokenWithChars(result, curPtr, kind);
return true;
}
// ----------------------------------------------------------------------------
std::vector<Token> Lexer::lexAllTokens()
{
auto result = std::vector<Token>{};
auto token = Token{};
while(lexToken(token)) {
result.emplace_back(std::move(token));
}
return result;
}
// ----------------------------------------------------------------------------
} // namespace detail
} // namespace sage
Lexer.hpp:

#pragma once
#include <ostream>
#include <string>
#include <vector>
#include "Token.hpp"
namespace sage
{
namespace detail
{
class Lexer
{
public:
Lexer(std::string input);
bool lexToken(Token& result);
std::vector<Token> lexAllTokens();
void keepWhitespace(bool keep) { m_keepWhitespace = keep; }
void keepComments(bool keep) { m_keepComments = keep; }
private:
bool skipWhitespace(Token& result, const char* curPtr);
bool skipLineComment(Token& result, const char* curPtr);
bool lexIdentifier(Token& result, const char* curPtr);
bool lexNumericConstant(Token& result, const char* curPtr);
bool lexStringLiteral(Token& result, const char* curPtr, bool isSingleQuoted);
void formTokenWithChars(Token& result, const char* tokEnd, Token::Kind kind);
SourceLocation getSourceLocation(const char* ptr);
std::string m_input;
const char* m_bufferStart;
const char* m_bufferEnd;
const char* m_bufferPtr;
bool m_keepComments{true};
bool m_keepWhitespace{false};
};
} // namespace detail
} // namespace sage
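For reference, here is a minimal usage sketch (not part of the gist; it relies
only on what the header above declares, and avoids digits in the input because
lexNumericConstant is not implemented yet):

// main.cpp
#include <iostream>
#include "Lexer.hpp"

int main()
{
auto lexer = sage::detail::Lexer{"let answer = value // trailing comment\n"};
lexer.keepComments(false);   // Drop '//' comments instead of emitting tokens.
lexer.keepWhitespace(false); // Drop runs of whitespace as well.
auto const tokens = lexer.lexAllTokens();
std::cout << "lexed " << tokens.size() << " tokens\n"; // Identifiers, '=', EOF.
}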