Skip to content

Instantly share code, notes, and snippets.

@arrieta
Created October 27, 2018 15:56
Show Gist options
  • Save arrieta/ce2c9293fed5760a09fa026b480dbdce to your computer and use it in GitHub Desktop.
Save arrieta/ce2c9293fed5760a09fa026b480dbdce to your computer and use it in GitHub Desktop.
PDS OBJ Parser Exercise
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
/// Case-insensitively checks that the first `n` characters of `s` spell
/// `value`.
///
/// @param n      number of characters to compare.
/// @param value  expected text, already lower-case; must point to at least
///               `n` readable characters.
/// @param s      candidate text; each of its characters is lower-cased before
///               being compared.
/// @return true when all `n` positions match; false on the first mismatch or
///         when `s` holds fewer than `n` characters.
inline bool semantic_compare(std::size_t n, const char *value,
                             const std::string &s) {
  // Guard against reading past the end of `s`.
  if (s.size() < n)
    return false;
  for (std::size_t index = 0; index < n; ++index) {
    // std::tolower requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behaviour, so both sides are
    // widened through unsigned char before the comparison.
    const int expected = static_cast<unsigned char>(value[index]);
    const int actual = std::tolower(static_cast<unsigned char>(s[index]));
    if (expected != actual)
      return false;
  }
  return true;
}
/// A lexical token: a kind tag together with the text collected for it.
class Token {
public:
  /// All token categories. The enumerator order is significant: the numeric
  /// value of a kind is used as an index elsewhere in this file.
  enum class Kind {
    // stream terminator and identifiers
    END_OF_STREAM, NAME,
    // arithmetic and relational symbols
    PLUS, MINUS, ASTERISK, POWER, SOLIDUS, EQUAL, GT, LT,
    // other punctuation
    COLON, DOT, CIRCUMFLEX, POUND, UNDERSCORE,
    // brackets and separators
    LPAR, RPAR, LCURLY, RCURLY, COMMA,
    // keywords
    OPEN_OBJECT, CLOSE_OBJECT, OPEN_GROUP, CLOSE_GROUP, END_LABEL,
    // literals and comments
    QUOTED_TEXT, QUOTED_SYMBOL, UNSIGNED_INT, UNSIGNED_REAL, COMMENT
  };

  /// Builds a token of the given kind with an empty lexeme.
  Token(Kind kind = Token::Kind::END_OF_STREAM) : m_kind{kind}, m_lexeme{} {}

  /// The token's current kind.
  Kind kind() const { return m_kind; }

  /// True when the token's kind equals `kind`.
  bool is(Kind kind) const { return kind == m_kind; }

  /// The text collected so far.
  const std::string &lexeme() const { return m_lexeme; }

  /// Changes the kind and keeps the lexeme.
  void set_kind(Kind kind) { m_kind = kind; }

  /// Appends one character to the lexeme.
  void extend(char c) { m_lexeme += c; }

  /// Empties the lexeme and keeps the kind.
  void clear() { m_lexeme.clear(); }

  /// Changes the kind and empties the lexeme.
  void reset(Token::Kind kind) {
    m_kind = kind;
    m_lexeme.clear();
  }

  /// If the lexeme spells one of the keywords below (compared through
  /// semantic_compare, which lower-cases the lexeme), switches the token to
  /// the matching keyword kind and discards the lexeme; otherwise leaves the
  /// token untouched.
  void maybe_keyword() {
    struct Keyword {
      std::size_t length;
      const char *spelling;
      Kind kind;
    };
    // Every keyword has a distinct length, so at most one row can apply.
    static const Keyword keywords[] = {
        {3, "end", Kind::END_LABEL},
        {5, "group", Kind::OPEN_GROUP},
        {6, "object", Kind::OPEN_OBJECT},
        {9, "end_group", Kind::CLOSE_GROUP},
        {10, "end_object", Kind::CLOSE_OBJECT},
    };
    const std::size_t length = m_lexeme.size();
    for (const Keyword &candidate : keywords) {
      if (candidate.length != length)
        continue;
      if (semantic_compare(candidate.length, candidate.spelling, m_lexeme))
        reset(candidate.kind);
      return;
    }
  }

private:
  Kind m_kind;
  std::string m_lexeme;
};
/// Writes a token as "{KIND_NAME, lexeme}".
///
/// The label table is indexed by the numeric value of the token's kind, so
/// its entries must stay in the same order as the Token::Kind enumerators.
std::ostream &operator<<(std::ostream &os, const Token &t) {
  static constexpr const char *const kind_labels[]{
      "END_OF_STREAM", "NAME",
      "PLUS",          "MINUS",
      "ASTERISK",      "POWER",
      "SOLIDUS",       "EQUAL",
      "GT",            "LT",
      "COLON",         "DOT",
      "CIRCUMFLEX",    "POUND",
      "UNDERSCORE",    "LPAR",
      "RPAR",          "LCURLY",
      "RCURLY",        "COMMA",
      "OPEN_OBJECT",   "CLOSE_OBJECT",
      "OPEN_GROUP",    "CLOSE_GROUP",
      "END_LABEL",     "QUOTED_TEXT",
      "QUOTED_SYMBOL", "UNSIGNED_INT",
      "UNSIGNED_REAL", "COMMENT"};
  const int slot = static_cast<int>(t.kind());
  os << "{" << kind_labels[slot] << ", " << t.lexeme() << "}";
  return os;
}
/// Single-pass input iterator that lexes a character stream into Tokens.
///
/// Constructing the iterator from a stream reads the first token right away;
/// each increment reads the next one. When the end of the input is reached the
/// iterator drops its stream pointer, which makes it compare equal to a
/// default-constructed (end) iterator. Lexing errors are reported by throwing
/// std::runtime_error.
class token_iterator {
public:
  // The iterator member types are declared directly instead of being
  // inherited from std::iterator, which is deprecated since C++17.
  using iterator_category = std::input_iterator_tag;
  using difference_type = std::ptrdiff_t;
  using code_point_t = typename std::istream::char_type;
  using value_type = Token;
  using reference = const Token &;
  using pointer = const Token *;

  /// Starts lexing `is`; the stream must outlive the iterator.
  /// @throws std::runtime_error if the first token is malformed.
  token_iterator(std::istream &is) : m_is{std::addressof(is)}, m_token{} {
    find_next_token();
  }

  /// Builds the end-of-sequence iterator (no stream attached).
  token_iterator() : m_is{nullptr}, m_token{} {}

  /// The most recently lexed token.
  reference operator*() const { return m_token; }
  pointer operator->() const { return std::addressof(m_token); }

  /// Lexes the next token.
  /// @throws std::runtime_error on malformed input.
  token_iterator &operator++() {
    find_next_token();
    return *this;
  }

  /// Lexes the next token and returns a copy of the iterator as it was before.
  token_iterator operator++(int) {
    token_iterator temp = *this;
    ++(*this);
    return temp;
  }

  /// Iterators are equal when they are attached to the same stream, or both
  /// to none.
  friend bool operator==(const token_iterator &lhs, const token_iterator &rhs) {
    return lhs.m_is == rhs.m_is;
  }
  friend bool operator!=(const token_iterator &lhs, const token_iterator &rhs) {
    return not(lhs == rhs);
  }

private:
  using traits_t = typename std::istream::traits_type;
  // Results of get()/peek() are kept in the stream's int_type. Narrowing them
  // to char would make end-of-file indistinguishable from byte 0xFF where char
  // is signed (and undetectable where it is unsigned), and would feed negative
  // values to the <cctype> classifiers, which is undefined behaviour.
  using int_type = typename traits_t::int_type;

  static bool is_eof(int_type c) {
    return traits_t::eq_int_type(c, traits_t::eof());
  }

  /// Throws std::runtime_error describing the offending character (or end of
  /// file) and the context `when` in which it was found.
  [[noreturn]] void unexpected(int_type c,
                               const char *when = "while lexing input") {
    std::ostringstream os;
    if (is_eof(c)) {
      os << "found unexpected end of file ";
    } else {
      os << "found unexpected character '" << traits_t::to_char_type(c) << "' ";
    }
    os << when << ".";
    throw std::runtime_error(os.str());
  }

  int_type get() { return m_is->get(); }
  int_type peek() { return m_is->peek(); }
  void unget() { m_is->unget(); }

  /// Consumes the next character and appends it to the current lexeme.
  /// Callers only use it after peek() has shown that a character is there.
  void absorb() { m_token.extend(traits_t::to_char_type(get())); }

  /// Consumes one character and makes the current token a bare `kind`.
  void yield_atom(Token::Kind kind) {
    get();
    m_token.reset(kind);
  }

  /// Marks the end of input and detaches from the stream so that this
  /// iterator compares equal to the end iterator.
  void finalize() {
    yield_atom(Token::Kind::END_OF_STREAM);
    m_is = nullptr;
  }

  void skip_whitespace() {
    while (std::isspace(peek()))
      get();
  }

  /// Skips whitespace and lexes one token into m_token.
  void find_next_token() {
    // Incrementing an exhausted iterator is a no-op rather than a null
    // pointer dereference.
    if (m_is == nullptr)
      return;
    skip_whitespace();
    const int_type c = peek();
    switch (c) {
    case traits_t::eof():
      return finalize();
    case '+':
      return yield_atom(Token::Kind::PLUS);
    case '-':
      return yield_atom(Token::Kind::MINUS);
    case '=':
      return yield_atom(Token::Kind::EQUAL);
    case '^':
      return yield_atom(Token::Kind::CIRCUMFLEX);
    case '#':
      return yield_atom(Token::Kind::POUND);
    case '_':
      return yield_atom(Token::Kind::UNDERSCORE);
    case '>':
      return yield_atom(Token::Kind::GT);
    case '<':
      return yield_atom(Token::Kind::LT);
    case ':':
      return yield_atom(Token::Kind::COLON);
    case '(':
      return yield_atom(Token::Kind::LPAR);
    case ')':
      return yield_atom(Token::Kind::RPAR);
    case '{':
      return yield_atom(Token::Kind::LCURLY);
    case '}':
      return yield_atom(Token::Kind::RCURLY);
    case ',':
      return yield_atom(Token::Kind::COMMA);
    default:
      return make_nonterminal(c);
    }
  }

  /// Handles every token that needs more than one character of look-ahead.
  /// `c` is the (not yet consumed) first character of the token.
  void make_nonterminal(int_type c) {
    switch (c) {
    case '\'':
    case '"':
      return make_quoted(c);
    case '.':
      return make_unsigned_real_or_only_dot();
    case '*':
      return make_asterisk_or_power();
    case '/':
      return make_comment_or_solidus();
    default:
      if (std::isdigit(c))
        return make_number_from_digit();
      if (std::isalpha(c))
        return make_name_or_keyword();
    }
    unexpected(c);
  }

  void make_comment_or_solidus() {
    // TODO: the spec does not allow for multi-line comments, but this lexer
    // does. Should it stick to the spec?
    // At this point we have a solidus which may or may not start a comment.
    m_token.reset(Token::Kind::SOLIDUS);
    get(); // eat the solidus
    // return quickly if we don't have a comment
    if (peek() != '*')
      return;
    // This is a comment: find the comment terminator or fail.
    m_token.set_kind(Token::Kind::COMMENT);
    get(); // eat the asterisk
    while (true) {
      const int_type c = peek();
      if (c == '*') {
        get();
        if (peek() == '/') {
          get();
          return;
        }
        // A lone asterisk is part of the comment text.
        m_token.extend('*');
      } else if (is_eof(c)) {
        unexpected(c, "while reading a comment");
      } else {
        absorb();
      }
    }
  }

  void make_asterisk_or_power() {
    m_token.reset(Token::Kind::ASTERISK);
    get(); // eat the asterisk
    if (peek() == '*') {
      get(); // eat the second asterisk and make a "power" (**)
      m_token.set_kind(Token::Kind::POWER);
    }
  }

  /// Lexes text delimited by `quote` (either ' or "); the quotes themselves
  /// are not stored in the lexeme.
  void make_quoted(int_type quote) {
    get(); // eat the leading quote
    m_token.reset(quote == '"' ? Token::Kind::QUOTED_TEXT
                               : Token::Kind::QUOTED_SYMBOL);
    // find closing quote
    for (int_type c = peek(); c != quote; c = peek()) {
      if (is_eof(c))
        unexpected(c, "while expecting to find a closing quote");
      absorb();
    }
    get(); // eat the trailing quote
  }

  void make_unsigned_int() {
    m_token.reset(Token::Kind::UNSIGNED_INT);
    while (std::isdigit(peek()))
      absorb();
  }

  /// Appends an exponent ("e" or "E", optional sign, one or more digits) to
  /// the current lexeme; the caller has already peeked the exponent letter.
  void append_exponent() {
    absorb(); // eat the exponent letter
    // eat the optional sign
    if ((peek() == '+') or (peek() == '-')) {
      absorb();
    }
    // now we *must* find the exponent value or fail
    if (not std::isdigit(peek())) {
      unexpected(peek(),
                 "while expecting to find an exponent for a real number");
    }
    while (std::isdigit(peek()))
      absorb();
  }

  void make_unsigned_real_or_only_dot() {
    // Works when we are given a dot and wonder whether it denotes a real (by
    // being followed by digits and an optional exponent) or is just a dot.
    m_token.reset(Token::Kind::DOT);
    get(); // eat the dot
    if (std::isdigit(peek())) {
      // it is a real
      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
      m_token.extend('.');
      while (std::isdigit(peek()))
        absorb();
      // add (optional) exponent
      if ((peek() == 'e') or (peek() == 'E'))
        append_exponent();
    }
  }

  void make_number_from_digit() {
    // Works on the case when we have an unsigned integer which may or may not
    // turn out to be a real by virtue of containing a decimal part, an
    // exponent, or both.
    // start by assuming we are making an unsigned integer
    make_unsigned_int();
    // if the next character is a dot, we are now dealing with a real
    if (peek() == '.') {
      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
      absorb();
      // add (optional) decimals
      while (std::isdigit(peek()))
        absorb();
      // add (optional) exponent
      if (peek() == 'e' or peek() == 'E')
        append_exponent();
    } else if (peek() == 'e' or peek() == 'E') {
      // the next character tells us we are dealing with a scaled real
      m_token.set_kind(Token::Kind::UNSIGNED_REAL);
      append_exponent();
    }
    // otherwise the token stays an unsigned integer
  }

  void make_name_or_keyword() {
    m_token.reset(Token::Kind::NAME);
    // The following complexity stems from the syntactical name requirements:
    // (1) names may contain underscores in the middle, but (2) underscores
    // cannot be consecutive, and (3) they cannot be at the end.
    while (true) {
      const int_type c = peek();
      if (c == '_') {
        get(); // tentatively consume the underscore
        if (not std::isalnum(peek())) {
          // Not followed by a letter or digit: the underscore is not part of
          // this name, so put it back for the next token.
          unget();
          break;
        }
        m_token.extend('_');
      } else if (std::isalnum(c)) {
        absorb();
      } else {
        break;
      }
    }
    // Now we certainly have a name, but it may be a keyword (such as
    // "end_object"), so we ask the token to adjust itself (that is: mark
    // itself as a keyword instead of a name if it indeed contains a keyword).
    m_token.maybe_keyword();
  }

  std::istream *m_is; // stream being lexed; nullptr once exhausted / for end
  value_type m_token; // token most recently produced
};
/// Lexes the file named by the single command-line argument from start to
/// end, discarding the tokens, and prints "Finished" on success.
///
/// @return EXIT_SUCCESS when the whole file was lexed; EXIT_FAILURE on a usage
///         error, when the file cannot be opened, or when lexing throws.
int main(int argc, char *argv[]) {
  if (argc != 2) {
    // argv[0] may be null when argc is 0; do not stream a null pointer.
    const char *program = (argc > 0 and argv[0] != nullptr) ? argv[0]
                                                            : "<program>";
    std::cerr << "usage: " << program << " sample.txt\n";
    // A usage error is a failure and must not report success to the caller.
    return EXIT_FAILURE;
  }
  try {
    std::ifstream fp(argv[1]);
    if (not fp) {
      throw std::runtime_error(std::string("cannot open '") + argv[1] +
                               "' for reading");
    }
    for (auto it = token_iterator(fp); it != token_iterator(); ++it) {
      // std::cout << *it << "\n";
    }
    std::cout << "Finished\n";
  } catch (const std::exception &e) {
    std::cerr << "[fatal] " << e.what() << "\n";
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment