Created
October 27, 2018 15:56
-
-
Save arrieta/ce2c9293fed5760a09fa026b480dbdce to your computer and use it in GitHub Desktop.
PDS OBJ Parser Exercise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cctype>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
// Case-insensitively compares the first `n` characters of `s` with `value`.
//
// Each character of `s` is folded with std::tolower before the comparison;
// `value` is compared as-is, so it is expected to be spelled in lowercase and
// to hold at least `n` readable characters.
//
// Parameters:
//   n      number of leading characters to compare
//   value  reference spelling (not folded)
//   s      candidate text (folded to lowercase character by character)
//
// Returns true when all `n` characters match. Returns false when `s` holds
// fewer than `n` characters, instead of reading past its end.
inline bool semantic_compare(std::size_t n, const char *value,
                             const std::string &s) {
  if (s.size() < n)
    return false;
  for (std::size_t index = 0; index < n; ++index) {
    // std::tolower is only defined for EOF and values representable as
    // unsigned char, so a possibly-negative char must be converted first.
    const int folded = std::tolower(static_cast<unsigned char>(s[index]));
    if (static_cast<unsigned char>(value[index]) != folded)
      return false;
  }
  return true;
}
class Token { | |
public: | |
enum class Kind { | |
END_OF_STREAM, | |
NAME, | |
PLUS, | |
MINUS, | |
ASTERISK, | |
POWER, | |
SOLIDUS, | |
EQUAL, | |
GT, | |
LT, | |
COLON, | |
DOT, | |
CIRCUMFLEX, | |
POUND, | |
UNDERSCORE, | |
LPAR, | |
RPAR, | |
LCURLY, | |
RCURLY, | |
COMMA, | |
OPEN_OBJECT, | |
CLOSE_OBJECT, | |
OPEN_GROUP, | |
CLOSE_GROUP, | |
END_LABEL, | |
QUOTED_TEXT, | |
QUOTED_SYMBOL, | |
UNSIGNED_INT, | |
UNSIGNED_REAL, | |
COMMENT, | |
}; | |
Token(Kind kind = Token::Kind::END_OF_STREAM) : m_kind{kind}, m_lexeme{} {} | |
bool is(Kind kind) const { return m_kind == kind; } | |
Kind kind() const { return m_kind; } | |
const std::string &lexeme() const { return m_lexeme; } | |
void set_kind(Kind kind) { m_kind = kind; } | |
void extend(char c) { m_lexeme.push_back(c); } | |
void clear() { m_lexeme.clear(); } | |
void reset(Token::Kind kind) { | |
set_kind(kind); | |
clear(); | |
} | |
void maybe_keyword() { | |
switch (m_lexeme.size()) { | |
case 3: | |
if (semantic_compare(3, "end", m_lexeme)) | |
reset(Token::Kind::END_LABEL); | |
break; | |
case 5: | |
if (semantic_compare(5, "group", m_lexeme)) | |
reset(Token::Kind::OPEN_GROUP); | |
break; | |
case 6: | |
if (semantic_compare(6, "object", m_lexeme)) | |
reset(Token::Kind::OPEN_OBJECT); | |
break; | |
case 9: | |
if (semantic_compare(9, "end_group", m_lexeme)) | |
reset(Token::Kind::CLOSE_GROUP); | |
break; | |
case 10: | |
if (semantic_compare(10, "end_object", m_lexeme)) | |
reset(Token::Kind::CLOSE_OBJECT); | |
break; | |
default: | |
return; | |
} | |
} | |
private: | |
Kind m_kind; | |
std::string m_lexeme; | |
}; | |
std::ostream &operator<<(std::ostream &os, const Token &t) { | |
static constexpr const char *const TOKEN_KIND_NAMES[]{"END_OF_STREAM", | |
"NAME", | |
"PLUS", | |
"MINUS", | |
"ASTERISK", | |
"POWER", | |
"SOLIDUS", | |
"EQUAL", | |
"GT", | |
"LT", | |
"COLON", | |
"DOT", | |
"CIRCUMFLEX", | |
"POUND", | |
"UNDERSCORE", | |
"LPAR", | |
"RPAR", | |
"LCURLY", | |
"RCURLY", | |
"COMMA", | |
"OPEN_OBJECT", | |
"CLOSE_OBJECT", | |
"OPEN_GROUP", | |
"CLOSE_GROUP", | |
"END_LABEL", | |
"QUOTED_TEXT", | |
"QUOTED_SYMBOL", | |
"UNSIGNED_INT", | |
"UNSIGNED_REAL", | |
"COMMENT"}; | |
return os << "{" << TOKEN_KIND_NAMES[static_cast<int>(t.kind())] << ", " | |
<< t.lexeme() << "}"; | |
} | |
class token_iterator : public std::iterator<std::input_iterator_tag, Token> { | |
public: | |
using code_point_t = typename std::istream::char_type; | |
using value_type = Token; | |
using reference = const Token &; | |
using pointer = const Token *; | |
token_iterator(std::istream &is) : m_is{std::addressof(is)}, m_token{} { | |
find_next_token(); | |
} | |
token_iterator() : m_is{nullptr}, m_token{} {} | |
reference operator*() const { return m_token; } | |
pointer operator->() const { return &(operator*()); } | |
token_iterator &operator++() { | |
find_next_token(); | |
return *this; | |
} | |
token_iterator operator++(int) { | |
token_iterator temp = *this; | |
++(*this); | |
return temp; | |
} | |
friend bool operator==(const token_iterator &lhs, const token_iterator &rhs) { | |
return lhs.m_is == rhs.m_is; | |
} | |
friend bool operator!=(const token_iterator &lhs, const token_iterator &rhs) { | |
return not(lhs == rhs); | |
} | |
private: | |
void unexpected(code_point_t c, const char *when = "while lexing input") { | |
std::ostringstream os; | |
if (c == EOF) { | |
os << "found unexpected end of file "; | |
} else { | |
os << "found unexpected character '" << c << "' "; | |
} | |
os << when << "."; | |
throw std::runtime_error(os.str()); | |
} | |
code_point_t get() { return m_is->get(); } | |
code_point_t peek() { return m_is->peek(); } | |
void unget() { m_is->unget(); } | |
void yield_atom(Token::Kind kind) { | |
get(); | |
m_token.reset(kind); | |
} | |
void finalize() { | |
yield_atom(Token::Kind::END_OF_STREAM); | |
m_is = nullptr; | |
} | |
void skip_whitespace() { | |
while (std::isspace(peek())) | |
get(); | |
} | |
void find_next_token() { | |
skip_whitespace(); | |
switch (code_point_t c = peek()) { | |
case EOF: | |
return finalize(); | |
case '+': | |
return yield_atom(Token::Kind::PLUS); | |
case '-': | |
return yield_atom(Token::Kind::MINUS); | |
case '=': | |
return yield_atom(Token::Kind::EQUAL); | |
case '^': | |
return yield_atom(Token::Kind::CIRCUMFLEX); | |
case '#': | |
return yield_atom(Token::Kind::POUND); | |
case '_': | |
return yield_atom(Token::Kind::UNDERSCORE); | |
case '>': | |
return yield_atom(Token::Kind::GT); | |
case '<': | |
return yield_atom(Token::Kind::LT); | |
case ':': | |
return yield_atom(Token::Kind::COLON); | |
case '(': | |
return yield_atom(Token::Kind::LPAR); | |
case ')': | |
return yield_atom(Token::Kind::RPAR); | |
case '{': | |
return yield_atom(Token::Kind::LCURLY); | |
case '}': | |
return yield_atom(Token::Kind::RCURLY); | |
case ',': | |
return yield_atom(Token::Kind::COMMA); | |
default: | |
return make_nonterminal(c); | |
} | |
} | |
void make_nonterminal(code_point_t c) { | |
switch (c) { | |
case '\'': | |
return make_quoted(c); | |
case '"': | |
return make_quoted(c); | |
case '.': | |
return make_unsigned_real_or_only_dot(); | |
case '*': | |
return make_asterisk_or_power(); | |
case '/': | |
return make_comment_or_solidus(); | |
default: | |
if (std::isdigit(c)) | |
return make_number_from_digit(); | |
if (std::isalpha(c)) | |
return make_name_or_keyword(); | |
} | |
unexpected(c); | |
} | |
void make_comment_or_solidus() { | |
// todo: the spec does not allow for multi-line comments, but I do. Shoudl I | |
// stick to the spec? | |
// at this point we have a solidus which may or may not be a comment... | |
// let's see | |
m_token.reset(Token::Kind::SOLIDUS); | |
get(); // eat the solidus | |
// return quickly of we don't have a comment | |
if (peek() != '*') | |
return; | |
// ok.. this is a comment. We must find the comment terminator or fail | |
// miserably | |
m_token.set_kind(Token::Kind::COMMENT); | |
get(); // eat the asterisk | |
while (true) { | |
if (peek() == '*') { | |
get(); | |
if (peek() == '/') { | |
get(); | |
return; | |
} else { | |
m_token.extend('*'); | |
} | |
} else if (peek() == EOF) { | |
unexpected(EOF, "while reading a comment"); | |
} else { | |
m_token.extend(get()); | |
} | |
} | |
} | |
void make_asterisk_or_power() { | |
m_token.reset(Token::Kind::ASTERISK); | |
get(); // eat the asterisk | |
if (peek() == '*') { | |
get(); // eat the second asterisk and make a "power" (**) | |
m_token.set_kind(Token::Kind::POWER); | |
} | |
} | |
void make_quoted(code_point_t c) { | |
get(); // eat the leading quote | |
m_token.reset(c == '"' ? Token::Kind::QUOTED_TEXT | |
: Token::Kind::QUOTED_SYMBOL); | |
// find closing quote | |
while (peek() != c) { | |
if (peek() == EOF) | |
unexpected(EOF, "while expecting to find a closing quote"); | |
m_token.extend(get()); | |
} | |
get(); // eat the trailing quote | |
} | |
void make_unsigned_int() { | |
m_token.reset(Token::Kind::UNSIGNED_INT); | |
while (std::isdigit(peek())) | |
m_token.extend(get()); | |
} | |
void append_exponent() { | |
m_token.extend(get()); // eat the exponent | |
// eat the optional sign | |
if ((peek() == '+') or (peek() == '-')) { | |
m_token.extend(get()); | |
} | |
// now we *must* find the exponent value of fail miserably | |
if (std::isdigit(peek())) { | |
while (std::isdigit(peek())) | |
m_token.extend(get()); | |
} else { | |
unexpected(peek(), | |
"while expecting to find an exponent for a real number"); | |
} | |
} | |
void make_unsigned_real_or_only_dot() { | |
// Works when we are given a dot and wonder whether is denotes a real (by | |
// being followed by digits and optional exponent) or it's really just a dot | |
m_token.reset(Token::Kind::DOT); | |
get(); // eat the dot | |
if (std::isdigit(peek())) { | |
// it is a real | |
m_token.set_kind(Token::Kind::UNSIGNED_REAL); | |
m_token.extend('.'); | |
while (std::isdigit(peek())) | |
m_token.extend(get()); | |
// add (optional) exponent | |
if ((peek() == 'e') or (peek() == 'E')) | |
append_exponent(); | |
} | |
} | |
void make_number_from_digit() { | |
// Works on the case when we have an unsigned integer which may or may not | |
// turn out to be a real by virtue of containing either one or both a | |
// decimal part and exponent. | |
// start by assuming we are making an unsigned integer | |
make_unsigned_int(); | |
// if the next character is a dot, we are now dealing with a real | |
if (peek() == '.') { | |
m_token.set_kind(Token::Kind::UNSIGNED_REAL); | |
m_token.extend(get()); | |
// add (optional) decimals | |
while (std::isdigit(peek())) | |
m_token.extend(get()); | |
// add (optional) exponent | |
if (peek() == 'e' or peek() == 'E') | |
append_exponent(); | |
} else if (peek() == 'e' or peek() == 'E') { | |
// the next character tells us we are dealing with an scaled real | |
m_token.set_kind(Token::Kind::UNSIGNED_REAL); | |
append_exponent(); | |
} else { | |
// leave the poor unsigned in peace... | |
} | |
} | |
void make_name_or_keyword() { | |
m_token.reset(Token::Kind::NAME); | |
// The following complexity stems from the syntactical name requirements: | |
// (1) they may contain underscores in the middle, but (2) they cannot be | |
// consecutive, and (3) they cannot be at the end. | |
while (std::isalnum(peek()) or peek() == '_') { | |
if (peek() == '_') { | |
get(); | |
if (std::isalnum(peek())) { | |
m_token.extend('_'); | |
continue; | |
} else { | |
unget(); | |
break; | |
} | |
} else { | |
m_token.extend(get()); | |
} | |
} | |
// Now we certainly have a name, but it may be a keyword (such as | |
// "end_object") --- we ask the token can adjust itself (that is: mark | |
// itself as keyword instead of name if it indeed contains a keyword) | |
m_token.maybe_keyword(); | |
} | |
std::istream *m_is; | |
value_type m_token; | |
}; | |
int main(int argc, char *argv[]) { | |
if (argc != 2) { | |
std::cerr << "usage: " << argv[0] << " sample.txt\n"; | |
std::exit(0); | |
} | |
try { | |
std::ifstream fp(argv[1]); | |
if (not fp) { | |
throw std::runtime_error("error"); | |
} | |
for (auto it = token_iterator(fp); it != token_iterator(); ++it) { | |
// std::cout << *it << "\n"; | |
} | |
std::cout << "Finished\n"; | |
} catch (const std::exception &e) { | |
std::cerr << "[fatal] " << e.what() << "\n"; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment