Created
October 24, 2024 01:54
-
-
Save jamesgregson/640b694ce8414aff57df9ffa6ab35aef to your computer and use it in GitHub Desktop.
C++20 PEG parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "lex.h" | |
#include <iostream> | |
struct ParserState { | |
lex::UserFnRegistry<const char*> user_fns; | |
std::vector<double> stack; | |
void push( const double & v ){ | |
stack.push_back( v ); | |
} | |
double pop(){ | |
double tmp = stack.back(); | |
stack.pop_back(); | |
return tmp; | |
} | |
}; | |
auto expr_parser( ParserState& S ){ | |
// ================================================== | |
// = Expression stack operations ==================== | |
// ================================================== | |
auto push = [&S]( const std::string& val ){ S.push( std::stod(val) ); }; | |
auto add = [&S](){ S.push(S.pop()+S.pop()); }; | |
auto sub = [&S](){ double tmp=S.pop(); S.push(S.pop()-tmp); }; | |
auto mult = [&S](){ S.push(S.pop()*S.pop()); }; | |
auto div = [&S](){ double tmp=S.pop(); S.push(S.pop()/tmp); }; | |
// ================================================== | |
// = Grammar definitions ============================ | |
// ================================================== | |
// whitespace | |
auto ws = lex::ZeroPlus( lex::whitespace() ); | |
// numbers get pushed onto the expression stack | |
auto number = lex::cb(lex::real() | lex::integer(), push) & ws; | |
// parenthesized expressions bind back to the user_fns table | |
auto paren = '(' & ws & lex::User(S.user_fns.cb("paren")) & ws & ')' & ws; | |
// factors | |
auto factor = paren | number; | |
// terms | |
auto term = factor & lex::Maybe( lex::OnePlus( | |
lex::cb( lex::Char('*') & ws & factor, mult ) | | |
lex::cb( lex::Char('/') & ws & factor, div ) | |
)); | |
// sums | |
auto sum = term & lex::Maybe( lex::OnePlus( | |
lex::cb( lex::Char('+') & ws & term, add ) | | |
lex::cb( lex::Char('-') & ws & term, sub ) | |
)); | |
// expressions are sums and must match end of string | |
auto expr = sum & ws & lex::eof(); | |
// ================================================== | |
// = Wire up recursive definitions ================== | |
// ================================================== | |
S.user_fns.bind("paren",sum); | |
return expr; | |
} | |
int main( int argc, char** argv ){ | |
ParserState S; | |
auto parser = expr_parser(S); | |
// ================================================== | |
// = Wire up recursive definitions ================== | |
// ================================================== | |
const char *src = "(3+3)*(4+(2+1))"; | |
if( auto ret = parser.match(src) ){ | |
std::cout << "Result: " << S.stack.back() << std::endl; | |
} | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <algorithm> | |
#include <concepts> | |
#include <functional> | |
#include <iostream> | |
#include <map> | |
#include <optional> | |
#include <string> | |
namespace lex { | |
/** | |
* @brief Top level pure virtual pattern class only defines match method | |
*/ | |
struct Pattern { | |
/** | |
* @brief Returns advanced source pointer on success or std::nullopt on failure | |
* @param src Source pointer to parse | |
* @return std::optional<const char*> | |
*/ | |
virtual std::optional<const char*> match( const char* src ) const = 0; | |
}; | |
/** | |
* @brief Epsilon, always matches but does not advance source | |
* | |
*/ | |
struct Eps : public Pattern { | |
Eps(){} | |
std::optional<const char*> match( const char* src ) const override { | |
return {src}; | |
} | |
}; | |
/** | |
* @brief Any, matches any single character | |
*/ | |
struct Any : public Pattern { | |
Any(){} | |
std::optional<const char*> match( const char* src ) const override { | |
if( src ){ | |
return {src+1}; | |
} | |
return std::nullopt; | |
} | |
}; | |
/** | |
* @brief Char, matches a single character exactly | |
*/ | |
struct Char : public Pattern { | |
Char( char c ) : _c{c} {} | |
std::optional<const char*> match( const char* src ) const override { | |
if( src && *src == _c ){ | |
return {src+1}; | |
} | |
return std::nullopt; | |
} | |
const char _c; | |
}; | |
/** | |
* @brief Range, matches a single character within a character range | |
*/ | |
struct Range : public Pattern { | |
Range( char lo, char hi ) : _lo{lo}, _hi{hi} {} | |
std::optional<const char*> match( const char* src ) const override { | |
if( src && *src >= _lo && *src <= _hi ){ | |
return {src+1}; | |
} | |
return std::nullopt; | |
} | |
const char _lo, _hi; | |
}; | |
/** | |
* @brief Str, matches an exact string character by character | |
*/ | |
struct Str : public Pattern { | |
Str( const char* seq ) : _seq{seq} {} | |
std::optional<const char*> match( const char* src ) const override { | |
const char* sptr = _seq; | |
while( *sptr && *src ){ | |
if( *sptr != *src ){ | |
return std::nullopt; | |
} | |
++sptr; | |
++src; | |
} | |
return src; | |
} | |
const char* _seq; | |
}; | |
/** | |
* @brief Zero-plus, matches provided expression zero or more times, greedily | |
*/ | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
struct ZeroPlus : public Pattern { | |
ZeroPlus( const Expr& expr ) : _expr{expr} {} | |
std::optional<const char*> match( const char* src ) const override { | |
while( src ){ | |
if( auto tmp = _expr.match( src ) ){ | |
src = *tmp; | |
} else { | |
return {src}; | |
} | |
} | |
return std::nullopt; | |
} | |
const Expr _expr; | |
}; | |
/** | |
* @brief Matches the left expression greedily, on failure rewinds and matches the right | |
*/ | |
template< typename Left, typename Right > | |
requires std::derived_from<Left,Pattern> && std::derived_from<Right,Pattern> | |
struct Or : public Pattern { | |
Or( const Left& left, const Right& right ) : _left{left}, _right{right} {} | |
std::optional<const char*> match( const char* src ) const override { | |
if( auto res = _left.match(src) ){ | |
return res; | |
} | |
return _right.match(src); | |
} | |
const Left _left; | |
const Right _right; | |
}; | |
/** | |
* @brief And, matches the left expression and, on success, matches the right | |
*/ | |
template< typename Left, typename Right > | |
requires std::derived_from<Left,Pattern> && std::derived_from<Right,Pattern> | |
struct And : public Pattern { | |
And( const Left& left, const Right& right ) : _left{left}, _right{right} {} | |
std::optional<const char*> match( const char* src ) const override { | |
if( auto tmp = _left.match(src) ){ | |
return _right.match(*tmp); | |
} | |
return std::nullopt; | |
} | |
const Left _left; | |
const Right _right; | |
}; | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
Or<Expr,Eps> Maybe( const Expr& expr ){ | |
// Eps must be last so that expr is tried | |
return Or<Expr,Eps>(expr,Eps()); | |
} | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
auto OnePlus( const Expr& expr ){ | |
return And<Expr,ZeroPlus<Expr>>(expr,ZeroPlus(expr)); | |
} | |
// lexer/parser hooks | |
// Wrapper for a user-defined parseing function, also used to | |
// implement recursive grammars | |
using UserFn = std::function<std::optional<const char*>(const char*)>; | |
struct User : public Pattern { | |
User( UserFn fn ) : _fn(fn) {} | |
std::optional<const char*> match( const char* src ) const override { | |
return _fn(src); | |
} | |
UserFn _fn; | |
}; | |
inline User cb( UserFn fn ){ | |
return User(fn); | |
} | |
// called whenever a given token matches | |
using ExistCallbackFn = std::function<void()>; | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
struct ExistCallback : public Pattern { | |
ExistCallback( const Expr& expr, ExistCallbackFn fn ) : _expr{expr}, _fn{fn} {} | |
std::optional<const char*> match( const char* src ) const override { | |
if( auto ret = _expr.match(src) ){ | |
_fn(); | |
return ret; | |
} | |
return std::nullopt; | |
} | |
const Expr _expr; | |
ExistCallbackFn _fn; | |
}; | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
ExistCallback<Expr> cb( const Expr& expr, ExistCallbackFn fn ){ | |
return ExistCallback<Expr>( expr, fn ); | |
} | |
// zero-allocation callback that also provides the matching string | |
using RangeCallbackFn = std::function<void(const char*,const char*)>; | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
struct RangeCallback : public Pattern { | |
RangeCallback( const Expr& expr, RangeCallbackFn fn ) : _expr{expr}, _fn{fn} {} | |
std::optional<const char*> match( const char* src ) const override { | |
if( auto ret = _expr.match(src) ){ | |
_fn(src,*ret+1); | |
return ret; | |
} | |
return std::nullopt; | |
} | |
const Expr _expr; | |
RangeCallbackFn _fn; | |
}; | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
RangeCallback<Expr> cb( const Expr& expr, RangeCallbackFn fn ){ | |
return RangeCallback( expr, fn ); | |
} | |
// callback that provides the matching string | |
using StringCallbackFn = std::function<void(const std::string&)>; | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
struct StringCallback : public Pattern { | |
StringCallback( const Expr& expr, StringCallbackFn fn ) : _expr{expr}, _fn{fn} {} | |
std::optional<const char*> match( const char* src ) const override { | |
if( auto ret = _expr.match(src) ){ | |
std::string tmp; | |
std::copy( src, *ret, std::back_inserter(tmp) ); | |
_fn(tmp); | |
return ret; | |
} | |
return std::nullopt; | |
} | |
const Expr _expr; | |
StringCallbackFn _fn; | |
}; | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
StringCallback<Expr> cb( const Expr& expr, StringCallbackFn fn ){ | |
return StringCallback<Expr>( expr, fn ); | |
} | |
template< typename Key > | |
struct UserFnRegistry { | |
UserFnRegistry(){} | |
template< typename Expr > | |
requires std::derived_from<Expr,Pattern> | |
void bind( const Key& key, Expr& expr ){ | |
set( key, std::bind(&Expr::match,expr,std::placeholders::_1) ); | |
} | |
UserFn cb( const Key& key ) const { | |
return std::bind(&UserFnRegistry<Key>::match,this,key,std::placeholders::_1); | |
} | |
std::optional<const char*> match( const Key& key, const char* src ) const { | |
return get(key)(src); | |
} | |
void set( const Key& key, UserFn fn ){ | |
if( _registry.find(key) != _registry.end() ){ | |
throw std::runtime_error("Error: tried to add duplicate UserFn for key."); | |
} | |
_registry[key] = fn; | |
} | |
UserFn get( const Key& key ) const { | |
if( auto it = _registry.find(key) ; it != _registry.end() ){ | |
return it->second; | |
} | |
throw std::runtime_error("Error: UserFn not registered."); | |
return {}; | |
} | |
std::map<Key,UserFn> _registry; | |
}; | |
// Overloaded pattern building operators | |
// pattern | pattern | |
template< typename Left, typename Right > | |
requires std::derived_from<Left,Pattern> && std::derived_from<Right,Pattern> | |
Or<Left,Right> operator|( const Left& L, const Right& R ){ | |
return Or<Left,Right>(L,R); | |
} | |
// '<char>' | pattern | |
template< typename Right > | |
requires std::derived_from<Right,Pattern> | |
Or<Char,Right> operator|( const char L, const Right& R ){ | |
return Or<Char,Right>(Char(L),R); | |
} | |
// pattern | '<char>' | |
template< typename Left > | |
requires std::derived_from<Left,Pattern> | |
Or<Left,Char> operator|( const Left& L, const char R ){ | |
return Or<Left,Char>(L,Char(R)); | |
} | |
// "<string>" | pattern | |
template< typename Right > | |
requires std::derived_from<Right,Pattern> | |
Or<Str,Right> operator|( const char *L, const Right& R ){ | |
return Or<Str,Right>(Str(L),R); | |
} | |
// pattern | "<string>" | |
template< typename Left > | |
requires std::derived_from<Left,Pattern> | |
Or<Left,Str> operator|( const Left& L, const char* R ){ | |
return Or<Left,Str>(L,Str(R)); | |
} | |
// pattern & pattern | |
template< typename Left, typename Right > | |
requires std::derived_from<Left,Pattern> && std::derived_from<Right,Pattern> | |
And<Left,Right> operator&( const Left& L, const Right& R ){ | |
return And<Left,Right>(L,R); | |
} | |
// '<char>' & pattern | |
template< typename Right > | |
requires std::derived_from<Right,Pattern> | |
And<Char,Right> operator&( const char L, const Right& R ){ | |
return And<Char,Right>(Char(L),R); | |
} | |
// pattern & '<char>' | |
template< typename Left > | |
requires std::derived_from<Left,Pattern> | |
And<Left,Char> operator&( const Left& L, const char R ){ | |
return And<Left,Char>(L,Char(R)); | |
} | |
// "<string>" & pattern | |
template< typename Right > | |
requires std::derived_from<Right,Pattern> | |
And<Str,Right> operator&( const char *L, const Right& R ){ | |
return And<Str,Right>(Str(L),R); | |
} | |
// pattern & "<string>" | |
template< typename Left > | |
requires std::derived_from<Left,Pattern> | |
And<Left,Str> operator&( const Left& L, const char* R ){ | |
return And<Left,Str>(L,Str(R)); | |
} | |
// convenience definitions | |
inline Char eof(){ return Char('\0'); } | |
inline Char space(){ return Char(' '); } | |
inline Char tab(){ return Char('\t'); } | |
inline Char carriage_return(){ return Char('\r'); } | |
inline Char newline(){ return Char('\n'); } | |
inline auto whitespace(){ return space() | tab() | carriage_return() | newline(); } | |
inline Range digit(){ return {'0','9'}; } | |
inline Range lower(){ return {'a','z'}; } | |
inline Range upper(){ return {'A','Z'}; } | |
inline auto alpha(){ return lower() | upper(); } | |
inline auto alphanum(){ return alpha() | digit(); } | |
inline auto digits(){ return OnePlus( digit() ); } | |
inline auto pm(){ return Char('+') | Char('-'); } | |
inline auto integer(){ return Maybe(pm()) & digits(); } | |
inline auto real(){ return Maybe(pm()) & digits() & Char('.') & Maybe(digits()) & Maybe( (Char('e')|Char('E')) & Maybe(pm()) & digits() ); } | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment