Skip to content

Instantly share code, notes, and snippets.

@jamesgregson
Created October 24, 2024 01:54
Show Gist options
  • Save jamesgregson/640b694ce8414aff57df9ffa6ab35aef to your computer and use it in GitHub Desktop.
Save jamesgregson/640b694ce8414aff57df9ffa6ab35aef to your computer and use it in GitHub Desktop.
C++20 PEG parser
#include "lex.h"
#include <iostream>
struct ParserState {
lex::UserFnRegistry<const char*> user_fns;
std::vector<double> stack;
void push( const double & v ){
stack.push_back( v );
}
double pop(){
double tmp = stack.back();
stack.pop_back();
return tmp;
}
};
auto expr_parser( ParserState& S ){
// ==================================================
// = Expression stack operations ====================
// ==================================================
auto push = [&S]( const std::string& val ){ S.push( std::stod(val) ); };
auto add = [&S](){ S.push(S.pop()+S.pop()); };
auto sub = [&S](){ double tmp=S.pop(); S.push(S.pop()-tmp); };
auto mult = [&S](){ S.push(S.pop()*S.pop()); };
auto div = [&S](){ double tmp=S.pop(); S.push(S.pop()/tmp); };
// ==================================================
// = Grammar definitions ============================
// ==================================================
// whitespace
auto ws = lex::ZeroPlus( lex::whitespace() );
// numbers get pushed onto the expression stack
auto number = lex::cb(lex::real() | lex::integer(), push) & ws;
// parenthesized expressions bind back to the user_fns table
auto paren = '(' & ws & lex::User(S.user_fns.cb("paren")) & ws & ')' & ws;
// factors
auto factor = paren | number;
// terms
auto term = factor & lex::Maybe( lex::OnePlus(
lex::cb( lex::Char('*') & ws & factor, mult ) |
lex::cb( lex::Char('/') & ws & factor, div )
));
// sums
auto sum = term & lex::Maybe( lex::OnePlus(
lex::cb( lex::Char('+') & ws & term, add ) |
lex::cb( lex::Char('-') & ws & term, sub )
));
// expressions are sums and must match end of string
auto expr = sum & ws & lex::eof();
// ==================================================
// = Wire up recursive definitions ==================
// ==================================================
S.user_fns.bind("paren",sum);
return expr;
}
int main( int argc, char** argv ){
ParserState S;
auto parser = expr_parser(S);
// ==================================================
// = Wire up recursive definitions ==================
// ==================================================
const char *src = "(3+3)*(4+(2+1))";
if( auto ret = parser.match(src) ){
std::cout << "Result: " << S.stack.back() << std::endl;
}
return 0;
}
#pragma once
#include <algorithm>
#include <concepts>
#include <functional>
#include <iostream>
#include <map>
#include <optional>
#include <string>
namespace lex {
/**
* @brief Top level pure virtual pattern class only defines match method
*/
struct Pattern {
/**
* @brief Returns advanced source pointer on success or std::nullopt on failure
* @param src Source pointer to parse
* @return std::optional<const char*>
*/
virtual std::optional<const char*> match( const char* src ) const = 0;
};
/**
* @brief Epsilon, always matches but does not advance source
*
*/
struct Eps : public Pattern {
Eps(){}
std::optional<const char*> match( const char* src ) const override {
return {src};
}
};
/**
* @brief Any, matches any single character
*/
struct Any : public Pattern {
Any(){}
std::optional<const char*> match( const char* src ) const override {
if( src ){
return {src+1};
}
return std::nullopt;
}
};
/**
* @brief Char, matches a single character exactly
*/
struct Char : public Pattern {
Char( char c ) : _c{c} {}
std::optional<const char*> match( const char* src ) const override {
if( src && *src == _c ){
return {src+1};
}
return std::nullopt;
}
const char _c;
};
/**
* @brief Range, matches a single character within a character range
*/
struct Range : public Pattern {
Range( char lo, char hi ) : _lo{lo}, _hi{hi} {}
std::optional<const char*> match( const char* src ) const override {
if( src && *src >= _lo && *src <= _hi ){
return {src+1};
}
return std::nullopt;
}
const char _lo, _hi;
};
/**
* @brief Str, matches an exact string character by character
*/
struct Str : public Pattern {
Str( const char* seq ) : _seq{seq} {}
std::optional<const char*> match( const char* src ) const override {
const char* sptr = _seq;
while( *sptr && *src ){
if( *sptr != *src ){
return std::nullopt;
}
++sptr;
++src;
}
return src;
}
const char* _seq;
};
/**
* @brief Zero-plus, matches provided expression zero or more times, greedily
*/
template< typename Expr >
requires std::derived_from<Expr,Pattern>
struct ZeroPlus : public Pattern {
ZeroPlus( const Expr& expr ) : _expr{expr} {}
std::optional<const char*> match( const char* src ) const override {
while( src ){
if( auto tmp = _expr.match( src ) ){
src = *tmp;
} else {
return {src};
}
}
return std::nullopt;
}
const Expr _expr;
};
/**
* @brief Matches the left expression greedily, on failure rewinds and matches the right
*/
template< typename Left, typename Right >
requires std::derived_from<Left,Pattern> && std::derived_from<Right,Pattern>
struct Or : public Pattern {
Or( const Left& left, const Right& right ) : _left{left}, _right{right} {}
std::optional<const char*> match( const char* src ) const override {
if( auto res = _left.match(src) ){
return res;
}
return _right.match(src);
}
const Left _left;
const Right _right;
};
/**
* @brief And, matches the left expression and, on success, matches the right
*/
template< typename Left, typename Right >
requires std::derived_from<Left,Pattern> && std::derived_from<Right,Pattern>
struct And : public Pattern {
And( const Left& left, const Right& right ) : _left{left}, _right{right} {}
std::optional<const char*> match( const char* src ) const override {
if( auto tmp = _left.match(src) ){
return _right.match(*tmp);
}
return std::nullopt;
}
const Left _left;
const Right _right;
};
template< typename Expr >
requires std::derived_from<Expr,Pattern>
Or<Expr,Eps> Maybe( const Expr& expr ){
// Eps must be last so that expr is tried
return Or<Expr,Eps>(expr,Eps());
}
template< typename Expr >
requires std::derived_from<Expr,Pattern>
auto OnePlus( const Expr& expr ){
return And<Expr,ZeroPlus<Expr>>(expr,ZeroPlus(expr));
}
// lexer/parser hooks
// Wrapper for a user-defined parseing function, also used to
// implement recursive grammars
using UserFn = std::function<std::optional<const char*>(const char*)>;
struct User : public Pattern {
User( UserFn fn ) : _fn(fn) {}
std::optional<const char*> match( const char* src ) const override {
return _fn(src);
}
UserFn _fn;
};
inline User cb( UserFn fn ){
return User(fn);
}
// called whenever a given token matches
using ExistCallbackFn = std::function<void()>;
template< typename Expr >
requires std::derived_from<Expr,Pattern>
struct ExistCallback : public Pattern {
ExistCallback( const Expr& expr, ExistCallbackFn fn ) : _expr{expr}, _fn{fn} {}
std::optional<const char*> match( const char* src ) const override {
if( auto ret = _expr.match(src) ){
_fn();
return ret;
}
return std::nullopt;
}
const Expr _expr;
ExistCallbackFn _fn;
};
template< typename Expr >
requires std::derived_from<Expr,Pattern>
ExistCallback<Expr> cb( const Expr& expr, ExistCallbackFn fn ){
return ExistCallback<Expr>( expr, fn );
}
// zero-allocation callback that also provides the matching string
using RangeCallbackFn = std::function<void(const char*,const char*)>;
template< typename Expr >
requires std::derived_from<Expr,Pattern>
struct RangeCallback : public Pattern {
RangeCallback( const Expr& expr, RangeCallbackFn fn ) : _expr{expr}, _fn{fn} {}
std::optional<const char*> match( const char* src ) const override {
if( auto ret = _expr.match(src) ){
_fn(src,*ret+1);
return ret;
}
return std::nullopt;
}
const Expr _expr;
RangeCallbackFn _fn;
};
template< typename Expr >
requires std::derived_from<Expr,Pattern>
RangeCallback<Expr> cb( const Expr& expr, RangeCallbackFn fn ){
return RangeCallback( expr, fn );
}
// callback that provides the matching string
using StringCallbackFn = std::function<void(const std::string&)>;
template< typename Expr >
requires std::derived_from<Expr,Pattern>
struct StringCallback : public Pattern {
StringCallback( const Expr& expr, StringCallbackFn fn ) : _expr{expr}, _fn{fn} {}
std::optional<const char*> match( const char* src ) const override {
if( auto ret = _expr.match(src) ){
std::string tmp;
std::copy( src, *ret, std::back_inserter(tmp) );
_fn(tmp);
return ret;
}
return std::nullopt;
}
const Expr _expr;
StringCallbackFn _fn;
};
template< typename Expr >
requires std::derived_from<Expr,Pattern>
StringCallback<Expr> cb( const Expr& expr, StringCallbackFn fn ){
return StringCallback<Expr>( expr, fn );
}
template< typename Key >
struct UserFnRegistry {
UserFnRegistry(){}
template< typename Expr >
requires std::derived_from<Expr,Pattern>
void bind( const Key& key, Expr& expr ){
set( key, std::bind(&Expr::match,expr,std::placeholders::_1) );
}
UserFn cb( const Key& key ) const {
return std::bind(&UserFnRegistry<Key>::match,this,key,std::placeholders::_1);
}
std::optional<const char*> match( const Key& key, const char* src ) const {
return get(key)(src);
}
void set( const Key& key, UserFn fn ){
if( _registry.find(key) != _registry.end() ){
throw std::runtime_error("Error: tried to add duplicate UserFn for key.");
}
_registry[key] = fn;
}
UserFn get( const Key& key ) const {
if( auto it = _registry.find(key) ; it != _registry.end() ){
return it->second;
}
throw std::runtime_error("Error: UserFn not registered.");
return {};
}
std::map<Key,UserFn> _registry;
};
// Overloaded pattern building operators
// pattern | pattern
template< typename Left, typename Right >
requires std::derived_from<Left,Pattern> && std::derived_from<Right,Pattern>
Or<Left,Right> operator|( const Left& L, const Right& R ){
return Or<Left,Right>(L,R);
}
// '<char>' | pattern
template< typename Right >
requires std::derived_from<Right,Pattern>
Or<Char,Right> operator|( const char L, const Right& R ){
return Or<Char,Right>(Char(L),R);
}
// pattern | '<char>'
template< typename Left >
requires std::derived_from<Left,Pattern>
Or<Left,Char> operator|( const Left& L, const char R ){
return Or<Left,Char>(L,Char(R));
}
// "<string>" | pattern
template< typename Right >
requires std::derived_from<Right,Pattern>
Or<Str,Right> operator|( const char *L, const Right& R ){
return Or<Str,Right>(Str(L),R);
}
// pattern | "<string>"
template< typename Left >
requires std::derived_from<Left,Pattern>
Or<Left,Str> operator|( const Left& L, const char* R ){
return Or<Left,Str>(L,Str(R));
}
// pattern & pattern
template< typename Left, typename Right >
requires std::derived_from<Left,Pattern> && std::derived_from<Right,Pattern>
And<Left,Right> operator&( const Left& L, const Right& R ){
return And<Left,Right>(L,R);
}
// '<char>' & pattern
template< typename Right >
requires std::derived_from<Right,Pattern>
And<Char,Right> operator&( const char L, const Right& R ){
return And<Char,Right>(Char(L),R);
}
// pattern & '<char>'
template< typename Left >
requires std::derived_from<Left,Pattern>
And<Left,Char> operator&( const Left& L, const char R ){
return And<Left,Char>(L,Char(R));
}
// "<string>" & pattern
template< typename Right >
requires std::derived_from<Right,Pattern>
And<Str,Right> operator&( const char *L, const Right& R ){
return And<Str,Right>(Str(L),R);
}
// pattern & "<string>"
template< typename Left >
requires std::derived_from<Left,Pattern>
And<Left,Str> operator&( const Left& L, const char* R ){
return And<Left,Str>(L,Str(R));
}
// convenience definitions
inline Char eof(){ return Char('\0'); }
inline Char space(){ return Char(' '); }
inline Char tab(){ return Char('\t'); }
inline Char carriage_return(){ return Char('\r'); }
inline Char newline(){ return Char('\n'); }
inline auto whitespace(){ return space() | tab() | carriage_return() | newline(); }
inline Range digit(){ return {'0','9'}; }
inline Range lower(){ return {'a','z'}; }
inline Range upper(){ return {'A','Z'}; }
inline auto alpha(){ return lower() | upper(); }
inline auto alphanum(){ return alpha() | digit(); }
inline auto digits(){ return OnePlus( digit() ); }
inline auto pm(){ return Char('+') | Char('-'); }
inline auto integer(){ return Maybe(pm()) & digits(); }
inline auto real(){ return Maybe(pm()) & digits() & Char('.') & Maybe(digits()) & Maybe( (Char('e')|Char('E')) & Maybe(pm()) & digits() ); }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment