arrieta · October 27, 2018 15:56
diff --git a/pds.cpp b/pds.cpp
 #include <cctype>
 #include <fstream>
 #include <iostream>
 #include <iterator>
 #include <sstream>
 #include <stdexcept>
 #include <string>

 /// Case-insensitively matches the first `n` characters of `s` against `value`.
 ///
 /// Each of the first `n` characters of `s` is lower-cased with std::tolower and
 /// compared with the character at the same position in `value`, so `value`
 /// should be spelled in lower case and must point to at least `n` characters.
 ///
 /// @param n      number of leading characters to compare
 /// @param value  reference spelling (at least `n` characters long)
 /// @param s      text to test
 /// @return true when all `n` positions match; false on the first mismatch or
 ///         when `s` holds fewer than `n` characters.
 inline bool semantic_compare(std::size_t n, const char *value,
                              const std::string &s) {
  // Never index past the end of `s`.
  if (s.size() < n)
    return false;

  for (std::size_t index = 0; index < n; ++index) {
    // The <cctype> functions are only defined for values representable as
    // unsigned char (or EOF); a negative plain char would be undefined
    // behaviour, so both sides go through unsigned char first.
    const int expected = static_cast<unsigned char>(value[index]);
    const int actual = std::tolower(static_cast<unsigned char>(s[index]));
    if (expected != actual)
      return false;
  }
  return true;
 }

 /// A lexical token: a kind tag together with the characters gathered for it.
 class Token {

 public:
  /// Every category of token the lexer distinguishes.
  enum class Kind {
    END_OF_STREAM,
    NAME,
    PLUS,
    MINUS,
    ASTERISK,
    POWER,
    SOLIDUS,
    EQUAL,
    GT,
    LT,
    COLON,
    DOT,
    CIRCUMFLEX,
    POUND,
    UNDERSCORE,
    LPAR,
    RPAR,
    LCURLY,
    RCURLY,
    COMMA,
    OPEN_OBJECT,
    CLOSE_OBJECT,
    OPEN_GROUP,
    CLOSE_GROUP,
    END_LABEL,
    QUOTED_TEXT,
    QUOTED_SYMBOL,
    UNSIGNED_INT,
    UNSIGNED_REAL,
    COMMENT,
  };

  /// Builds a token of the requested kind (END_OF_STREAM when omitted) whose
  /// lexeme is empty.
  Token(Kind kind = Kind::END_OF_STREAM) : m_kind{kind}, m_lexeme{} {}

  /// Tells whether this token is of kind `kind`.
  bool is(Kind kind) const { return kind == m_kind; }

  /// The token's current kind.
  Kind kind() const { return m_kind; }

  /// The characters collected for this token so far.
  const std::string &lexeme() const { return m_lexeme; }

  /// Changes the kind and leaves the lexeme untouched.
  void set_kind(Kind kind) { m_kind = kind; }

  /// Appends one character to the lexeme.
  void extend(char c) { m_lexeme += c; }

  /// Empties the lexeme and leaves the kind untouched.
  void clear() { m_lexeme.clear(); }

  /// Changes the kind and empties the lexeme in one step.
  void reset(Token::Kind kind) {
    m_kind = kind;
    m_lexeme.clear();
  }

  /// Re-tags the token when its lexeme spells a keyword.
  ///
  /// The lexeme is matched (through semantic_compare) against the one keyword
  /// that has the same length, if any. On a match the token is reset() to the
  /// keyword's kind, which also empties the lexeme; otherwise nothing changes.
  void maybe_keyword() {
    struct KeywordEntry {
      const char *text;
      std::string::size_type length;
      Kind kind;
    };
    // No two keywords share a length, so at most one entry is a candidate.
    static constexpr KeywordEntry KEYWORDS[] = {
        {"end", 3, Kind::END_LABEL},
        {"group", 5, Kind::OPEN_GROUP},
        {"object", 6, Kind::OPEN_OBJECT},
        {"end_group", 9, Kind::CLOSE_GROUP},
        {"end_object", 10, Kind::CLOSE_OBJECT},
    };

    const std::string::size_type length = m_lexeme.size();
    for (const KeywordEntry &entry : KEYWORDS) {
      if (entry.length != length)
        continue;
      if (semantic_compare(entry.length, entry.text, m_lexeme))
        reset(entry.kind);
      return;
    }
  }

 private:
  Kind m_kind;
  std::string m_lexeme;
 };

 /// Writes a token to `os` as "{KIND_NAME, lexeme}" and returns `os`.
 std::ostream &operator<<(std::ostream &os, const Token &t) {
  // Display names, indexed by the integer value of the token's kind; the
  // entries follow the declaration order of the Token::Kind enumerators.
  static constexpr const char *const TOKEN_KIND_NAMES[]{
      "END_OF_STREAM", "NAME",          "PLUS",         "MINUS",
      "ASTERISK",      "POWER",         "SOLIDUS",      "EQUAL",
      "GT",            "LT",            "COLON",        "DOT",
      "CIRCUMFLEX",    "POUND",         "UNDERSCORE",   "LPAR",
      "RPAR",          "LCURLY",        "RCURLY",       "COMMA",
      "OPEN_OBJECT",   "CLOSE_OBJECT",  "OPEN_GROUP",   "CLOSE_GROUP",
      "END_LABEL",     "QUOTED_TEXT",   "QUOTED_SYMBOL", "UNSIGNED_INT",
      "UNSIGNED_REAL", "COMMENT"};

  const int position = static_cast<int>(t.kind());
  os << "{" << TOKEN_KIND_NAMES[position];
  os << ", " << t.lexeme();
  os << "}";
  return os;
 }

 /// Input iterator that lexes a character stream into `Token`s.
 ///
 /// An iterator built from a stream scans the first token immediately and each
 /// increment scans the next one. When the end of input is reached the iterator
 /// drops its stream pointer, which makes it compare equal to a
 /// default-constructed (end) iterator. Malformed input is reported by throwing
 /// std::runtime_error.
 ///
 /// Characters are kept in the stream's `int_type` while they are inspected:
 /// narrowing them to `char` would make byte 0xFF indistinguishable from
 /// end-of-file and would hand negative values to the <cctype> classifiers.
 class token_iterator {

 public:
  using code_point_t = typename std::istream::char_type;
  // Spelled out by hand: the std::iterator helper base is deprecated in C++17.
  using iterator_category = std::input_iterator_tag;
  using value_type = Token;
  using difference_type = std::ptrdiff_t;
  using reference = const Token &;
  using pointer = const Token *;

  /// Binds the iterator to `is` and scans the first token right away.
  /// @throws std::runtime_error if the first token is malformed.
  token_iterator(std::istream &is) : m_is{&is}, m_token{} {
    find_next_token();
  }

  /// Creates the end-of-input iterator (no stream attached).
  token_iterator() : m_is{nullptr}, m_token{} {}

  /// The most recently scanned token.
  reference operator*() const { return m_token; }

  pointer operator->() const { return &(operator*()); }

  /// Scans the next token.
  /// @throws std::runtime_error on malformed input.
  token_iterator &operator++() {
    find_next_token();
    return *this;
  }

  token_iterator operator++(int) {
    token_iterator temp = *this;
    ++(*this);
    return temp;
  }

  /// Two iterators are equal when they refer to the same stream; exhausted and
  /// default-constructed iterators both hold no stream.
  friend bool operator==(const token_iterator &lhs, const token_iterator &rhs) {
    return lhs.m_is == rhs.m_is;
  }

  friend bool operator!=(const token_iterator &lhs, const token_iterator &rhs) {
    return !(lhs == rhs);
  }

 private:
  using traits_t = std::istream::traits_type;
  // Wide enough to hold every character value plus the end-of-file marker.
  using int_t = traits_t::int_type;

  /// True when `c` is the stream's end-of-file marker.
  static bool is_eof(int_t c) {
    return traits_t::eq_int_type(c, traits_t::eof());
  }

  /// Throws std::runtime_error describing the offending character (or the end
  /// of file) followed by the context given in `when`.
  [[noreturn]] void unexpected(int_t c,
                               const char *when = "while lexing input") {
    std::ostringstream os;
    if (is_eof(c)) {
      os << "found unexpected end of file ";
    } else {
      os << "found unexpected character '" << traits_t::to_char_type(c) << "' ";
    }
    os << when << ".";

    throw std::runtime_error(os.str());
  }

  /// Extracts and returns the next character (or the end-of-file marker).
  int_t get() { return m_is->get(); }

  /// Returns the next character (or the end-of-file marker) without
  /// extracting it.
  int_t peek() { return m_is->peek(); }

  /// Puts the most recently extracted character back into the stream.
  void unget() { m_is->unget(); }

  /// Extracts the next character and appends it to the current lexeme. Callers
  /// only use this after peek() has shown that a character is available.
  void append_next() { m_token.extend(traits_t::to_char_type(get())); }

  /// Consumes one character and makes the current token an empty-lexeme token
  /// of the given kind.
  void yield_atom(Token::Kind kind) {
    get();
    m_token.reset(kind);
  }

  /// Marks the end of input: the token becomes END_OF_STREAM and the stream
  /// pointer is dropped so that this iterator equals the end iterator.
  void finalize() {
    yield_atom(Token::Kind::END_OF_STREAM);
    m_is = nullptr;
  }

  void skip_whitespace() {
    while (std::isspace(peek()))
      get();
  }

  /// Skips whitespace and scans one token into m_token, dispatching on the
  /// first significant character.
  void find_next_token() {
    skip_whitespace();
    const int_t c = peek();
    if (is_eof(c))
      return finalize();

    switch (c) {
    case '+':
      return yield_atom(Token::Kind::PLUS);
    case '-':
      return yield_atom(Token::Kind::MINUS);
    case '=':
      return yield_atom(Token::Kind::EQUAL);
    case '^':
      return yield_atom(Token::Kind::CIRCUMFLEX);
    case '#':
      return yield_atom(Token::Kind::POUND);
    case '_':
      return yield_atom(Token::Kind::UNDERSCORE);
    case '>':
      return yield_atom(Token::Kind::GT);
    case '<':
      return yield_atom(Token::Kind::LT);
    case ':':
      return yield_atom(Token::Kind::COLON);
    case '(':
      return yield_atom(Token::Kind::LPAR);
    case ')':
      return yield_atom(Token::Kind::RPAR);
    case '{':
      return yield_atom(Token::Kind::LCURLY);
    case '}':
      return yield_atom(Token::Kind::RCURLY);
    case ',':
      return yield_atom(Token::Kind::COMMA);
    default:
      return make_nonterminal(c);
    }
  }

  /// Handles tokens that need more than their first character to be
  /// recognised; `c` is the (not yet consumed) first character.
  void make_nonterminal(int_t c) {
    switch (c) {
    case '\'':
    case '"':
      return make_quoted(c);
    case '.':
      return make_unsigned_real_or_only_dot();
    case '*':
      return make_asterisk_or_power();
    case '/':
      return make_comment_or_solidus();

    default:
      if (std::isdigit(c))
        return make_number_from_digit();
      if (std::isalpha(c))
        return make_name_or_keyword();
    }
    unexpected(c);
  }

  void make_comment_or_solidus() {
    // todo: the spec does not allow for multi-line comments, but I do. Should I
    // stick to the spec?

    // At this point we have a solidus which may or may not start a comment.
    m_token.reset(Token::Kind::SOLIDUS);
    get(); // eat the solidus

    // return quickly if we don't have a comment
    if (peek() != '*')
      return;

    // This is a comment: we must find the comment terminator or fail.
    m_token.set_kind(Token::Kind::COMMENT);
    get(); // eat the asterisk

    while (true) {
      const int_t next = peek();
      if (is_eof(next))
        unexpected(next, "while reading a comment");

      if (next == '*') {
        get();
        if (peek() == '/') {
          get();
          return;
        }
        // A lone asterisk is part of the comment text.
        m_token.extend('*');
      } else {
        append_next();
      }
    }
  }

  void make_asterisk_or_power() {
    m_token.reset(Token::Kind::ASTERISK);
    get(); // eat the asterisk
    if (peek() == '*') {
      get(); // eat the second asterisk and make a "power" (**)
      m_token.set_kind(Token::Kind::POWER);
    }
  }

  /// Scans text delimited by the quote character `c`: double quotes give
  /// QUOTED_TEXT, anything else QUOTED_SYMBOL. The quotes themselves are not
  /// stored in the lexeme.
  void make_quoted(int_t c) {
    get(); // eat the leading quote
    m_token.reset(c == '"' ? Token::Kind::QUOTED_TEXT
                           : Token::Kind::QUOTED_SYMBOL);
    // find closing quote
    while (peek() != c) {
      if (is_eof(peek()))
        unexpected(traits_t::eof(), "while expecting to find a closing quote");
      append_next();
    }
    get(); // eat the trailing quote
  }

  void make_unsigned_int() {
    m_token.reset(Token::Kind::UNSIGNED_INT);
    while (std::isdigit(peek()))
      append_next();
  }

  void append_exponent() {
    append_next(); // eat the exponent marker

    // eat the optional sign
    if ((peek() == '+') or (peek() == '-')) {
      append_next();
    }

    // now we *must* find the exponent value or fail
    if (std::isdigit(peek())) {
      while (std::isdigit(peek()))
        append_next();
    } else {
      unexpected(peek(),
                 "while expecting to find an exponent for a real number");
    }
  }

  void make_unsigned_real_or_only_dot() {
    // Works when we are given a dot and wonder whether it denotes a real (by
    // being followed by digits and optional exponent) or it's really just a dot
    m_token.reset(Token::Kind::DOT);
    get(); // eat the dot

    if (std::isdigit(peek())) {
      // it is a real
      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
      m_token.extend('.');
      while (std::isdigit(peek()))
        append_next();
      // add (optional) exponent
      if ((peek() == 'e') or (peek() == 'E'))
        append_exponent();
    }
  }

  void make_number_from_digit() {
    // Works on the case when we have an unsigned integer which may or may not
    // turn out to be a real by virtue of containing either one or both a
    // decimal part and exponent.

    // start by assuming we are making an unsigned integer
    make_unsigned_int();

    // if the next character is a dot, we are now dealing with a real
    if (peek() == '.') {
      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
      append_next();
      // add (optional) decimals
      while (std::isdigit(peek()))
        append_next();
      // add (optional) exponent
      if (peek() == 'e' or peek() == 'E')
        append_exponent();
    } else if (peek() == 'e' or peek() == 'E') {
      // the next character tells us we are dealing with a scaled real
      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
      append_exponent();
    } else {
      // leave the poor unsigned in peace...
    }
  }

  void make_name_or_keyword() {
    m_token.reset(Token::Kind::NAME);

    // The following complexity stems from the syntactical name requirements:
    // (1) they may contain underscores in the middle, but (2) they cannot be
    // consecutive, and (3) they cannot be at the end.
    while (std::isalnum(peek()) or peek() == '_') {
      if (peek() == '_') {
        get();
        if (std::isalnum(peek())) {
          m_token.extend('_');
          continue;
        } else {
          // The underscore does not belong to the name: put it back so it is
          // lexed on its own.
          unget();
          break;
        }
      } else {
        append_next();
      }
    }
    // Now we certainly have a name, but it may be a keyword (such as
    // "end_object") --- we ask the token to adjust itself (that is: mark
    // itself as keyword instead of name if it indeed contains a keyword)
    m_token.maybe_keyword();
  }

  std::istream *m_is;
  value_type m_token;
 };

 /// Lexes the file named by the single command-line argument.
 ///
 /// Every token is scanned (and discarded); "Finished" is printed on success.
 /// @return 0 on success, 1 on a usage error, when the file cannot be opened,
 ///         or when the lexer rejects the input.
 int main(int argc, char *argv[]) {
  if (argc != 2) {
    // argv[0] may be missing when argc is 0; never stream a null pointer.
    std::cerr << "usage: " << (argc > 0 ? argv[0] : "pds") << " sample.txt\n";
    return 1; // a usage error must not look like success to the caller
  }

  try {
    std::ifstream fp(argv[1]);
    if (not fp) {
      throw std::runtime_error(std::string("cannot open '") + argv[1] + "'");
    }
    // Advancing the iterator is what drives the lexer; the tokens themselves
    // are not used here.
    for (auto it = token_iterator(fp); it != token_iterator(); ++it) {
      // std::cout << *it << "\n";
    }
    std::cout << "Finished\n";
  } catch (const std::exception &e) {
    std::cerr << "[fatal] " << e.what() << "\n";
    return 1;
  }
  return 0;
 }
	#include <cctype>
	#include <fstream>
	#include <iostream>
	#include <iterator>
	#include <sstream>
	#include <stdexcept>
	#include <string>

	/// Case-insensitively matches the first `n` characters of `s` against `value`.
	///
	/// Each of the first `n` characters of `s` is lower-cased with std::tolower and
	/// compared with the character at the same position in `value`, so `value`
	/// should be spelled in lower case and must point to at least `n` characters.
	///
	/// @param n      number of leading characters to compare
	/// @param value  reference spelling (at least `n` characters long)
	/// @param s      text to test
	/// @return true when all `n` positions match; false on the first mismatch or
	///         when `s` holds fewer than `n` characters.
	inline bool semantic_compare(std::size_t n, const char *value,
	                             const std::string &s) {
	  // Never index past the end of `s`.
	  if (s.size() < n)
	    return false;

	  for (std::size_t index = 0; index < n; ++index) {
	    // The <cctype> functions are only defined for values representable as
	    // unsigned char (or EOF); a negative plain char would be undefined
	    // behaviour, so both sides go through unsigned char first.
	    const int expected = static_cast<unsigned char>(value[index]);
	    const int actual = std::tolower(static_cast<unsigned char>(s[index]));
	    if (expected != actual)
	      return false;
	  }
	  return true;
	}

	/// A lexical token: a kind tag together with the characters gathered for it.
	class Token {

	public:
	  /// Every category of token the lexer distinguishes.
	  enum class Kind {
	    END_OF_STREAM,
	    NAME,
	    PLUS,
	    MINUS,
	    ASTERISK,
	    POWER,
	    SOLIDUS,
	    EQUAL,
	    GT,
	    LT,
	    COLON,
	    DOT,
	    CIRCUMFLEX,
	    POUND,
	    UNDERSCORE,
	    LPAR,
	    RPAR,
	    LCURLY,
	    RCURLY,
	    COMMA,
	    OPEN_OBJECT,
	    CLOSE_OBJECT,
	    OPEN_GROUP,
	    CLOSE_GROUP,
	    END_LABEL,
	    QUOTED_TEXT,
	    QUOTED_SYMBOL,
	    UNSIGNED_INT,
	    UNSIGNED_REAL,
	    COMMENT,
	  };

	  /// Builds a token of the requested kind (END_OF_STREAM when omitted) whose
	  /// lexeme is empty.
	  Token(Kind kind = Kind::END_OF_STREAM) : m_kind{kind}, m_lexeme{} {}

	  /// Tells whether this token is of kind `kind`.
	  bool is(Kind kind) const { return kind == m_kind; }

	  /// The token's current kind.
	  Kind kind() const { return m_kind; }

	  /// The characters collected for this token so far.
	  const std::string &lexeme() const { return m_lexeme; }

	  /// Changes the kind and leaves the lexeme untouched.
	  void set_kind(Kind kind) { m_kind = kind; }

	  /// Appends one character to the lexeme.
	  void extend(char c) { m_lexeme += c; }

	  /// Empties the lexeme and leaves the kind untouched.
	  void clear() { m_lexeme.clear(); }

	  /// Changes the kind and empties the lexeme in one step.
	  void reset(Token::Kind kind) {
	    m_kind = kind;
	    m_lexeme.clear();
	  }

	  /// Re-tags the token when its lexeme spells a keyword.
	  ///
	  /// The lexeme is matched (through semantic_compare) against the one keyword
	  /// that has the same length, if any. On a match the token is reset() to the
	  /// keyword's kind, which also empties the lexeme; otherwise nothing changes.
	  void maybe_keyword() {
	    struct KeywordEntry {
	      const char *text;
	      std::string::size_type length;
	      Kind kind;
	    };
	    // No two keywords share a length, so at most one entry is a candidate.
	    static constexpr KeywordEntry KEYWORDS[] = {
	        {"end", 3, Kind::END_LABEL},
	        {"group", 5, Kind::OPEN_GROUP},
	        {"object", 6, Kind::OPEN_OBJECT},
	        {"end_group", 9, Kind::CLOSE_GROUP},
	        {"end_object", 10, Kind::CLOSE_OBJECT},
	    };

	    const std::string::size_type length = m_lexeme.size();
	    for (const KeywordEntry &entry : KEYWORDS) {
	      if (entry.length != length)
	        continue;
	      if (semantic_compare(entry.length, entry.text, m_lexeme))
	        reset(entry.kind);
	      return;
	    }
	  }

	private:
	  Kind m_kind;
	  std::string m_lexeme;
	};

	/// Writes a token to `os` as "{KIND_NAME, lexeme}" and returns `os`.
	std::ostream &operator<<(std::ostream &os, const Token &t) {
	  // Display names, indexed by the integer value of the token's kind; the
	  // entries follow the declaration order of the Token::Kind enumerators.
	  static constexpr const char *const TOKEN_KIND_NAMES[]{
	      "END_OF_STREAM", "NAME",          "PLUS",         "MINUS",
	      "ASTERISK",      "POWER",         "SOLIDUS",      "EQUAL",
	      "GT",            "LT",            "COLON",        "DOT",
	      "CIRCUMFLEX",    "POUND",         "UNDERSCORE",   "LPAR",
	      "RPAR",          "LCURLY",        "RCURLY",       "COMMA",
	      "OPEN_OBJECT",   "CLOSE_OBJECT",  "OPEN_GROUP",   "CLOSE_GROUP",
	      "END_LABEL",     "QUOTED_TEXT",   "QUOTED_SYMBOL", "UNSIGNED_INT",
	      "UNSIGNED_REAL", "COMMENT"};

	  const int position = static_cast<int>(t.kind());
	  os << "{" << TOKEN_KIND_NAMES[position];
	  os << ", " << t.lexeme();
	  os << "}";
	  return os;
	}

	/// Input iterator that lexes a character stream into `Token`s.
	///
	/// An iterator built from a stream scans the first token immediately and each
	/// increment scans the next one. When the end of input is reached the iterator
	/// drops its stream pointer, which makes it compare equal to a
	/// default-constructed (end) iterator. Malformed input is reported by throwing
	/// std::runtime_error.
	///
	/// Characters are kept in the stream's `int_type` while they are inspected:
	/// narrowing them to `char` would make byte 0xFF indistinguishable from
	/// end-of-file and would hand negative values to the <cctype> classifiers.
	class token_iterator {

	public:
	  using code_point_t = typename std::istream::char_type;
	  // Spelled out by hand: the std::iterator helper base is deprecated in C++17.
	  using iterator_category = std::input_iterator_tag;
	  using value_type = Token;
	  using difference_type = std::ptrdiff_t;
	  using reference = const Token &;
	  using pointer = const Token *;

	  /// Binds the iterator to `is` and scans the first token right away.
	  /// @throws std::runtime_error if the first token is malformed.
	  token_iterator(std::istream &is) : m_is{&is}, m_token{} {
	    find_next_token();
	  }

	  /// Creates the end-of-input iterator (no stream attached).
	  token_iterator() : m_is{nullptr}, m_token{} {}

	  /// The most recently scanned token.
	  reference operator*() const { return m_token; }

	  pointer operator->() const { return &(operator*()); }

	  /// Scans the next token.
	  /// @throws std::runtime_error on malformed input.
	  token_iterator &operator++() {
	    find_next_token();
	    return *this;
	  }

	  token_iterator operator++(int) {
	    token_iterator temp = *this;
	    ++(*this);
	    return temp;
	  }

	  /// Two iterators are equal when they refer to the same stream; exhausted and
	  /// default-constructed iterators both hold no stream.
	  friend bool operator==(const token_iterator &lhs, const token_iterator &rhs) {
	    return lhs.m_is == rhs.m_is;
	  }

	  friend bool operator!=(const token_iterator &lhs, const token_iterator &rhs) {
	    return !(lhs == rhs);
	  }

	private:
	  using traits_t = std::istream::traits_type;
	  // Wide enough to hold every character value plus the end-of-file marker.
	  using int_t = traits_t::int_type;

	  /// True when `c` is the stream's end-of-file marker.
	  static bool is_eof(int_t c) {
	    return traits_t::eq_int_type(c, traits_t::eof());
	  }

	  /// Throws std::runtime_error describing the offending character (or the end
	  /// of file) followed by the context given in `when`.
	  [[noreturn]] void unexpected(int_t c,
	                               const char *when = "while lexing input") {
	    std::ostringstream os;
	    if (is_eof(c)) {
	      os << "found unexpected end of file ";
	    } else {
	      os << "found unexpected character '" << traits_t::to_char_type(c) << "' ";
	    }
	    os << when << ".";

	    throw std::runtime_error(os.str());
	  }

	  /// Extracts and returns the next character (or the end-of-file marker).
	  int_t get() { return m_is->get(); }

	  /// Returns the next character (or the end-of-file marker) without
	  /// extracting it.
	  int_t peek() { return m_is->peek(); }

	  /// Puts the most recently extracted character back into the stream.
	  void unget() { m_is->unget(); }

	  /// Extracts the next character and appends it to the current lexeme. Callers
	  /// only use this after peek() has shown that a character is available.
	  void append_next() { m_token.extend(traits_t::to_char_type(get())); }

	  /// Consumes one character and makes the current token an empty-lexeme token
	  /// of the given kind.
	  void yield_atom(Token::Kind kind) {
	    get();
	    m_token.reset(kind);
	  }

	  /// Marks the end of input: the token becomes END_OF_STREAM and the stream
	  /// pointer is dropped so that this iterator equals the end iterator.
	  void finalize() {
	    yield_atom(Token::Kind::END_OF_STREAM);
	    m_is = nullptr;
	  }

	  void skip_whitespace() {
	    while (std::isspace(peek()))
	      get();
	  }

	  /// Skips whitespace and scans one token into m_token, dispatching on the
	  /// first significant character.
	  void find_next_token() {
	    skip_whitespace();
	    const int_t c = peek();
	    if (is_eof(c))
	      return finalize();

	    switch (c) {
	    case '+':
	      return yield_atom(Token::Kind::PLUS);
	    case '-':
	      return yield_atom(Token::Kind::MINUS);
	    case '=':
	      return yield_atom(Token::Kind::EQUAL);
	    case '^':
	      return yield_atom(Token::Kind::CIRCUMFLEX);
	    case '#':
	      return yield_atom(Token::Kind::POUND);
	    case '_':
	      return yield_atom(Token::Kind::UNDERSCORE);
	    case '>':
	      return yield_atom(Token::Kind::GT);
	    case '<':
	      return yield_atom(Token::Kind::LT);
	    case ':':
	      return yield_atom(Token::Kind::COLON);
	    case '(':
	      return yield_atom(Token::Kind::LPAR);
	    case ')':
	      return yield_atom(Token::Kind::RPAR);
	    case '{':
	      return yield_atom(Token::Kind::LCURLY);
	    case '}':
	      return yield_atom(Token::Kind::RCURLY);
	    case ',':
	      return yield_atom(Token::Kind::COMMA);
	    default:
	      return make_nonterminal(c);
	    }
	  }

	  /// Handles tokens that need more than their first character to be
	  /// recognised; `c` is the (not yet consumed) first character.
	  void make_nonterminal(int_t c) {
	    switch (c) {
	    case '\'':
	    case '"':
	      return make_quoted(c);
	    case '.':
	      return make_unsigned_real_or_only_dot();
	    case '*':
	      return make_asterisk_or_power();
	    case '/':
	      return make_comment_or_solidus();

	    default:
	      if (std::isdigit(c))
	        return make_number_from_digit();
	      if (std::isalpha(c))
	        return make_name_or_keyword();
	    }
	    unexpected(c);
	  }

	  void make_comment_or_solidus() {
	    // todo: the spec does not allow for multi-line comments, but I do. Should I
	    // stick to the spec?

	    // At this point we have a solidus which may or may not start a comment.
	    m_token.reset(Token::Kind::SOLIDUS);
	    get(); // eat the solidus

	    // return quickly if we don't have a comment
	    if (peek() != '*')
	      return;

	    // This is a comment: we must find the comment terminator or fail.
	    m_token.set_kind(Token::Kind::COMMENT);
	    get(); // eat the asterisk

	    while (true) {
	      const int_t next = peek();
	      if (is_eof(next))
	        unexpected(next, "while reading a comment");

	      if (next == '*') {
	        get();
	        if (peek() == '/') {
	          get();
	          return;
	        }
	        // A lone asterisk is part of the comment text.
	        m_token.extend('*');
	      } else {
	        append_next();
	      }
	    }
	  }

	  void make_asterisk_or_power() {
	    m_token.reset(Token::Kind::ASTERISK);
	    get(); // eat the asterisk
	    if (peek() == '*') {
	      get(); // eat the second asterisk and make a "power" (**)
	      m_token.set_kind(Token::Kind::POWER);
	    }
	  }

	  /// Scans text delimited by the quote character `c`: double quotes give
	  /// QUOTED_TEXT, anything else QUOTED_SYMBOL. The quotes themselves are not
	  /// stored in the lexeme.
	  void make_quoted(int_t c) {
	    get(); // eat the leading quote
	    m_token.reset(c == '"' ? Token::Kind::QUOTED_TEXT
	                           : Token::Kind::QUOTED_SYMBOL);
	    // find closing quote
	    while (peek() != c) {
	      if (is_eof(peek()))
	        unexpected(traits_t::eof(), "while expecting to find a closing quote");
	      append_next();
	    }
	    get(); // eat the trailing quote
	  }

	  void make_unsigned_int() {
	    m_token.reset(Token::Kind::UNSIGNED_INT);
	    while (std::isdigit(peek()))
	      append_next();
	  }

	  void append_exponent() {
	    append_next(); // eat the exponent marker

	    // eat the optional sign
	    if ((peek() == '+') or (peek() == '-')) {
	      append_next();
	    }

	    // now we must find the exponent value or fail
	    if (std::isdigit(peek())) {
	      while (std::isdigit(peek()))
	        append_next();
	    } else {
	      unexpected(peek(),
	                 "while expecting to find an exponent for a real number");
	    }
	  }

	  void make_unsigned_real_or_only_dot() {
	    // Works when we are given a dot and wonder whether it denotes a real (by
	    // being followed by digits and optional exponent) or it's really just a dot
	    m_token.reset(Token::Kind::DOT);
	    get(); // eat the dot

	    if (std::isdigit(peek())) {
	      // it is a real
	      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
	      m_token.extend('.');
	      while (std::isdigit(peek()))
	        append_next();
	      // add (optional) exponent
	      if ((peek() == 'e') or (peek() == 'E'))
	        append_exponent();
	    }
	  }

	  void make_number_from_digit() {
	    // Works on the case when we have an unsigned integer which may or may not
	    // turn out to be a real by virtue of containing either one or both a
	    // decimal part and exponent.

	    // start by assuming we are making an unsigned integer
	    make_unsigned_int();

	    // if the next character is a dot, we are now dealing with a real
	    if (peek() == '.') {
	      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
	      append_next();
	      // add (optional) decimals
	      while (std::isdigit(peek()))
	        append_next();
	      // add (optional) exponent
	      if (peek() == 'e' or peek() == 'E')
	        append_exponent();
	    } else if (peek() == 'e' or peek() == 'E') {
	      // the next character tells us we are dealing with a scaled real
	      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
	      append_exponent();
	    } else {
	      // leave the poor unsigned in peace...
	    }
	  }

	  void make_name_or_keyword() {
	    m_token.reset(Token::Kind::NAME);

	    // The following complexity stems from the syntactical name requirements:
	    // (1) they may contain underscores in the middle, but (2) they cannot be
	    // consecutive, and (3) they cannot be at the end.
	    while (std::isalnum(peek()) or peek() == '_') {
	      if (peek() == '_') {
	        get();
	        if (std::isalnum(peek())) {
	          m_token.extend('_');
	          continue;
	        } else {
	          // The underscore does not belong to the name: put it back so it is
	          // lexed on its own.
	          unget();
	          break;
	        }
	      } else {
	        append_next();
	      }
	    }
	    // Now we certainly have a name, but it may be a keyword (such as
	    // "end_object") --- we ask the token to adjust itself (that is: mark
	    // itself as keyword instead of name if it indeed contains a keyword)
	    m_token.maybe_keyword();
	  }

	  std::istream *m_is;
	  value_type m_token;
	};

	/// Lexes the file named by the single command-line argument.
	///
	/// Every token is scanned (and discarded); "Finished" is printed on success.
	/// @return 0 on success, 1 on a usage error, when the file cannot be opened,
	///         or when the lexer rejects the input.
	int main(int argc, char *argv[]) {
	  if (argc != 2) {
	    // argv[0] may be missing when argc is 0; never stream a null pointer.
	    std::cerr << "usage: " << (argc > 0 ? argv[0] : "pds") << " sample.txt\n";
	    return 1; // a usage error must not look like success to the caller
	  }

	  try {
	    std::ifstream fp(argv[1]);
	    if (not fp) {
	      throw std::runtime_error(std::string("cannot open '") + argv[1] + "'");
	    }
	    // Advancing the iterator is what drives the lexer; the tokens themselves
	    // are not used here.
	    for (auto it = token_iterator(fp); it != token_iterator(); ++it) {
	      // std::cout << *it << "\n";
	    }
	    std::cout << "Finished\n";
	  } catch (const std::exception &e) {
	    std::cerr << "[fatal] " << e.what() << "\n";
	    return 1;
	  }
	  return 0;
	}