Last active
June 6, 2019 18:56
-
-
Save markhc/a2ea81dcab97d9744cb5f2dab2bd5b70 to your computer and use it in GitHub Desktop.
Lexer code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "Lexer.hpp"

#include <stdexcept>
namespace sage | |
{ | |
namespace detail | |
{ | |
// ---------------------------------------------------------------------------- | |
/// Returns the character at `ptr` and advances `ptr` by one.
inline char getCharAndAdvance(const char*& ptr)
{
  char const current = *ptr;
  ++ptr;
  return current;
}
// ---------------------------------------------------------------------------- | |
/// True for whitespace that does not end the current line:
/// space, tab, form feed, vertical tab.
inline bool isHorizontalWhitespace(char c)
{
  switch(c) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
// ---------------------------------------------------------------------------- | |
/// True for line-ending characters: carriage return or newline.
inline bool isVerticalWhitespace(char c)
{
  switch(c) {
  case '\r':
  case '\n':
    return true;
  default:
    return false;
  }
}
// ---------------------------------------------------------------------------- | |
/// Takes ownership of the source text and points the lexing cursors at it.
/// All buffer pointers reference the stored copy, never the caller's string.
Lexer::Lexer(std::string input)
  : m_input(std::move(input))
{
  m_bufferStart = m_input.data();
  m_bufferEnd   = m_bufferStart + m_input.size();
  m_bufferPtr   = m_bufferStart;
}
// ---------------------------------------------------------------------------- | |
/// Advances past a run of whitespace starting at `curPtr`.
/// Returns true if a Whitespace token was produced (keep-whitespace mode
/// only); otherwise just moves m_bufferPtr past the run and returns false.
bool Lexer::skipWhitespace(Token& result, const char* curPtr)
{
  // Consume every consecutive whitespace character (horizontal or vertical).
  // The NUL terminator is neither, so this cannot run past the buffer.
  while(isHorizontalWhitespace(*curPtr) || isVerticalWhitespace(*curPtr))
    ++curPtr;

  // If the client wants whitespace tokens, hand this run back as one.
  if(m_keepWhitespace) {
    formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
    return true;
  }

  m_bufferPtr = curPtr;
  return false;
}
// ---------------------------------------------------------------------------- | |
/// Scans over the body of a line comment starting at `curPtr`.
/// Returns true if a Comment token was produced (keep-comments mode only);
/// otherwise positions m_bufferPtr after the comment and returns false.
bool Lexer::skipLineComment(Token& result, const char* curPtr)
{
  // Scan over the body of the comment. The common case, when scanning, is
  // that the comment contains normal ascii characters with nothing
  // interesting in them; the loop below terminates with curPtr pointing at
  // the newline (or the NUL terminator at end of buffer) that ends the
  // comment. (The previous `while(true)` wrapper around this loop always
  // broke on the first pass and has been removed.)
  char c = *curPtr;
  while(c != 0 &&              // Potentially EOF.
        c != '\n' && c != '\r') // Newline or DOS-style newline.
    c = *++curPtr;

  // If we are returning comments as tokens, return this comment as a token.
  if(m_keepComments) {
    formTokenWithChars(result, curPtr, Token::Kind::Comment);
    return true;
  }

  // Otherwise, eat the newline character. We don't care if this is a \n\r
  // or \r\n sequence; this is an efficiency hack (the \n can't contribute
  // to another token). BUG FIX: only advance when we actually stopped on a
  // newline — the old unconditional `++curPtr` also skipped the NUL
  // terminator at end of buffer, which pushed m_bufferPtr past m_bufferEnd,
  // suppressed the EndOfFile token, and let later code read out of bounds.
  if(c == '\n' || c == '\r')
    ++curPtr;
  m_bufferPtr = curPtr;
  return false;
}
// ---------------------------------------------------------------------------- | |
/// Lexes the remainder of an identifier. The caller has already consumed
/// the leading [_A-Za-z]; this consumes the following [_A-Za-z0-9]* and
/// forms an Identifier token covering the whole name.
bool Lexer::lexIdentifier(Token& result, const char* curPtr)
{
  auto const isIdentChar = [](unsigned char ch) {
    return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') ||
           (ch >= '0' && ch <= '9') || ch == '_';
  };

  // Scan forward while the next character can continue the identifier;
  // curPtr ends up on the first non-identifier character.
  while(isIdentChar(static_cast<unsigned char>(*curPtr)))
    ++curPtr;

  formTokenWithChars(result, curPtr, Token::Kind::Identifier);
  return true;
}
// ---------------------------------------------------------------------------- | |
/// Lexes a numeric constant. Not implemented yet — always throws.
bool Lexer::lexNumericConstant(Token& result, const char* curPtr)
{
  (void)result;
  (void)curPtr;
  // BUG FIX: std::exception has no (const char*) constructor in standard
  // C++ — that is an MSVC-only extension and fails to compile elsewhere.
  // std::runtime_error carries the message portably and derives from
  // std::exception, so existing catch sites still work.
  throw std::runtime_error("Not implemented yet");
}
// ---------------------------------------------------------------------------- | |
/// Lexes a string literal after the opening quote has been consumed.
/// Forms a StringLiteral token on success; on an unterminated literal
/// (newline or end of buffer before the closing quote) forms an Unknown
/// token ending just before the offending character. Always returns true.
bool Lexer::lexStringLiteral(Token& result,
                             const char* curPtr,
                             bool isSingleQuoted)
{
  char const quote = isSingleQuoted ? '\'' : '"';

  for(char c = getCharAndAdvance(curPtr); c != quote;
      c = getCharAndAdvance(curPtr)) {
    // A backslash escapes the next character, including the quote itself.
    if(c == '\\')
      c = getCharAndAdvance(curPtr);

    // Literals may not span lines; the NUL at m_bufferEnd marks end of file.
    bool const atEndOfFile = (c == 0 && curPtr - 1 == m_bufferEnd);
    if(c == '\n' || c == '\r' || atEndOfFile) {
      formTokenWithChars(result, curPtr - 1, Token::Kind::Unknown);
      return true;
    }
  }

  formTokenWithChars(result, curPtr, Token::Kind::StringLiteral);
  return true;
}
// ---------------------------------------------------------------------------- | |
/// Fills `result` with the token spanning [m_bufferPtr, tokEnd) and the
/// given kind, then advances m_bufferPtr to tokEnd so lexing resumes after
/// the token.
void Lexer::formTokenWithChars(Token& result,
                               const char* tokEnd,
                               Token::Kind kind)
{
  auto const length = static_cast<uint32_t>(tokEnd - m_bufferPtr);
  result.setKind(kind);
  result.setLength(length);
  result.setLocation(getSourceLocation(m_bufferPtr));
  result.setIdentifier(m_bufferPtr, length);
  // Consume the token's characters.
  m_bufferPtr = tokEnd;
}
/// Computes the 1-based line (and 0-based column) of `ptr` by rescanning
/// the buffer from the start. O(ptr - m_bufferStart), called per token.
SourceLocation Lexer::getSourceLocation(const char* ptr)
{
  auto loc = SourceLocation{};
  loc.line = 1;
  auto curPtr = m_bufferStart;
  while(curPtr < ptr) {
    char const c = *curPtr++;
    if(isVerticalWhitespace(c)) {
      // BUG FIX: treat a "\r\n" pair as a single line terminator — the old
      // code counted both characters, double-incrementing the line number
      // for files with DOS line endings.
      if(c == '\r' && curPtr < ptr && *curPtr == '\n')
        ++curPtr;
      loc.line++;
      loc.column = 0;
    } else {
      loc.column++;
    }
  }
  return loc;
}
// ---------------------------------------------------------------------------- | |
/// Lexes a single token starting at m_bufferPtr into `result`.
/// Returns true if a token was produced (including EndOfFile); returns
/// false once m_bufferPtr has moved past the end of the buffer.
bool Lexer::lexToken(Token& result)
{
LexNextToken:
  const char* curPtr = m_bufferPtr;
  if(curPtr > m_bufferEnd)
    return false;
  // Small amounts of horizontal whitespace are very common between tokens.
  if((*curPtr == ' ') || (*curPtr == '\t')) {
    ++curPtr;
    while((*curPtr == ' ') || (*curPtr == '\t'))
      ++curPtr;
    if(m_keepWhitespace) {
      // If the user wants to keep whitespaces
      // form a new token and return
      formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
      return true;
    }
    m_bufferPtr = curPtr;
  }
  auto c = getCharAndAdvance(curPtr);
  auto kind = Token::Kind::Unknown;
  // Lex identifiers and constants
  if(c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
    return lexIdentifier(result, curPtr);
  if(c >= '0' && c <= '9')
    return lexNumericConstant(result, curPtr);
  switch(c) {
  case '\0':
    // Found EOF? (the string's NUL terminator sits at m_bufferEnd)
    if(curPtr - 1 == m_bufferEnd) {
      formTokenWithChars(result, curPtr, Token::Kind::EndOfFile);
      return true;
    }
    // An embedded NUL inside the buffer: treat as whitespace.
    if(skipWhitespace(result, curPtr))
      return true;
    goto LexNextToken;
  case '\r':
    // Fold a DOS-style "\r\n" pair before handling it like a newline.
    if(*curPtr == '\n')
      c = getCharAndAdvance(curPtr);
    [[fallthrough]];
  case '\n':
    if(skipWhitespace(result, curPtr))
      return true;
    // We only saw whitespace, so just try again
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    if(skipWhitespace(result, curPtr))
      return true;
  SkipIgnoredUnits:
    curPtr = m_bufferPtr;
    // If the next token is obviously a // or /* */ comment, skip it
    // efficiently too (without going through the big switch stmt).
    if(curPtr[0] == '/' && curPtr[1] == '/' && !m_keepComments) {
      if(skipLineComment(result, curPtr + 2))
        return true;
      goto SkipIgnoredUnits;
    } else if(isHorizontalWhitespace(*curPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case '\'':
    return lexStringLiteral(result, curPtr, true);
  case '"':
    return lexStringLiteral(result, curPtr, false);
  case '?':
    kind = Token::Kind::Question;
    break;
  case '[':
    kind = Token::Kind::LeftSquare;
    break;
  case ']':
    kind = Token::Kind::RightSquare;
    break;
  case '(':
    kind = Token::Kind::LeftParen;
    break;
  case ')':
    kind = Token::Kind::RightParen;
    break;
  case '{':
    kind = Token::Kind::LeftCurly;
    break;
  case '}':
    kind = Token::Kind::RightCurly;
    break;
  case '.': {
    // A '.' followed by a digit starts a numeric constant (e.g. ".5").
    auto const next = *curPtr;
    if(next >= '0' && next <= '9')
      return lexNumericConstant(result, curPtr);
    kind = Token::Kind::Period;
    // BUG FIX: this case previously fell through into `case '*'`, so every
    // Period token was overwritten and reported as Star.
    break;
  }
  case '*':
    kind = Token::Kind::Star;
    break;
  case '+':
    kind = Token::Kind::Plus;
    break;
  case '-':
    kind = Token::Kind::Minus;
    break;
  case '!':
    kind = Token::Kind::Exclam;
    break;
  case '/':
    if(*curPtr == '/') {
      if(skipLineComment(result, curPtr + 1))
        return true; // There is a token to return.
      // It is common for the tokens immediately after a // comment to be
      // whitespace (indentation for the next line). Instead of going through
      // the big switch, handle it efficiently now.
      goto SkipIgnoredUnits;
    }
    kind = Token::Kind::Slash;
    break;
  case '>':
    kind = Token::Kind::Greater;
    break;
  case ':':
    kind = Token::Kind::Colon;
    break;
  case ';':
    kind = Token::Kind::SemiColon;
    break;
  case ',':
    kind = Token::Kind::Comma;
    break;
  case '#':
    kind = Token::Kind::Hash;
    break;
  default:
    kind = Token::Kind::Unknown;
    break;
  }
  formTokenWithChars(result, curPtr, kind);
  return true;
}
// ---------------------------------------------------------------------------- | |
std::vector<Token> Lexer::lexAllTokens() | |
{ | |
auto result = std::vector<Token>{}; | |
auto token = Token{}; | |
while(lexToken(token)) { | |
result.emplace_back(std::move(token)); | |
} | |
return result; | |
} | |
// ---------------------------------------------------------------------------- | |
} // namespace detail | |
} // namespace sage |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <ostream> | |
#include <string> | |
#include <vector> | |
#include "Token.hpp" | |
namespace sage | |
{ | |
namespace detail | |
{ | |
/// Hand-written lexer over an in-memory source string. Construct it with
/// the full input text, then pull tokens one at a time with lexToken() or
/// all at once with lexAllTokens().
class Lexer
{
public:
  /// Takes ownership of the source text; internal pointers reference the
  /// stored copy for the lexer's lifetime.
  Lexer(std::string input);
  /// Lexes the next token into `result`; returns false once the buffer
  /// is exhausted.
  bool lexToken(Token& result);
  /// Repeatedly calls lexToken() and returns every produced token.
  std::vector<Token> lexAllTokens();
  /// When enabled, runs of whitespace are emitted as Whitespace tokens
  /// instead of being skipped.
  void keepWhitespace(bool keep) { m_keepWhitespace = keep; }
  /// When enabled, comments are emitted as Comment tokens instead of
  /// being skipped.
  void keepComments(bool keep) { m_keepComments = keep; }
private:
  // Skips whitespace at curPtr; returns true only if a token was formed
  // (keep-whitespace mode).
  bool skipWhitespace(Token& result, const char* curPtr);
  // Skips a line-comment body; returns true only if a token was formed
  // (keep-comments mode).
  bool skipLineComment(Token& result, const char* curPtr);
  // Lexes the tail of an identifier (first char already consumed).
  bool lexIdentifier(Token& result, const char* curPtr);
  // Lexes a numeric constant (not implemented in the .cpp — throws).
  bool lexNumericConstant(Token& result, const char* curPtr);
  // Lexes a quoted string/char literal (opening quote already consumed).
  bool lexStringLiteral(Token& result, const char* curPtr, bool isSingleQuoted);
  // Fills `result` with [m_bufferPtr, tokEnd) and advances m_bufferPtr.
  void formTokenWithChars(Token& result, const char* tokEnd, Token::Kind kind);
  // Computes line/column of `ptr` by rescanning from the buffer start.
  SourceLocation getSourceLocation(const char* ptr);
  std::string m_input;          // Owned source text.
  const char* m_bufferStart;    // First character of m_input.
  const char* m_bufferEnd;      // One past the last character of m_input.
  const char* m_bufferPtr;      // Start of the next token to lex.
  bool m_keepComments{true};    // Emit Comment tokens when true.
  bool m_keepWhitespace{false}; // Emit Whitespace tokens when true.
};
} // namespace detail | |
} // namespace sage |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment