Skip to content

Instantly share code, notes, and snippets.

@DarinM223
Last active May 25, 2017 09:39
Show Gist options
  • Save DarinM223/cce714954ce675da28e18c0807da380d to your computer and use it in GitHub Desktop.
Save DarinM223/cce714954ce675da28e18c0807da380d to your computer and use it in GitHub Desktop.
Simple parser for arithmetic expressions using precedence climbing
#[macro_use]
extern crate lazy_static;
extern crate regex;
use regex::{CaptureMatches, Regex};
use std::collections::HashMap;
use std::num::ParseIntError;
use std::result;
use std::str::FromStr;
#[derive(Debug)]
pub enum TokenErr {
InvalidToken,
ParseIntError(ParseIntError),
}
impl From<ParseIntError> for TokenErr {
fn from(err: ParseIntError) -> TokenErr {
TokenErr::ParseIntError(err)
}
}
#[derive(Debug)]
pub enum ParseErr {
UnexpectedEnd,
ExpectedValue,
UnmatchedParen,
TokenErr(TokenErr),
}
impl From<TokenErr> for ParseErr {
fn from(err: TokenErr) -> ParseErr {
ParseErr::TokenErr(err)
}
}
pub type TokenResult<T> = result::Result<T, TokenErr>;
pub type ParseResult<T> = result::Result<T, ParseErr>;
/// Determines whether similar precedence operators
/// are grouped from left to right or right to left.
///
/// Left association example:
/// 1 + 2 + 3 => (1 + 2) + 3
///
/// Right association example:
/// 2 ^ 3 ^ 4 => 2 ^ (3 ^ 4)
#[derive(Clone, Copy, PartialEq, Eq)]
enum Assoc {
Left,
Right,
}
/// A parsable operator. Right now restricted
/// to basic math operations and exponent.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Op {
Plus,
Minus,
Times,
Divide,
Exponent,
}
impl Op {
pub fn from_str<'a>(s: &'a str) -> TokenResult<Op> {
match s {
"+" => Ok(Op::Plus),
"-" => Ok(Op::Minus),
"*" => Ok(Op::Times),
"/" => Ok(Op::Divide),
"^" => Ok(Op::Exponent),
_ => Err(TokenErr::InvalidToken),
}
}
}
/// A token returned from the tokenizer.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Token {
BinOp(Op),
Num(i32),
LeftParen,
RightParen,
}
/// An expression returned from the parser.
/// Because the parser is only for simple number operations,
/// an expression can only be an integer number or a binary operation.
#[derive(Debug, PartialEq)]
pub enum Expr {
Value(i32),
BinOp(Op, Box<Expr>, Box<Expr>),
}
struct OpInfo {
prec: i32,
assoc: Assoc,
}
/// Global "constants".
lazy_static! {
/// Regex used by the tokenizer to parse numbers and operators.
static ref REGEX: Regex = Regex::new(r###"\s*(?:(\d+)|(.))"###).unwrap();
/// Table of operator to the operator's precedence and association.
static ref OP_INFO: HashMap<Op, OpInfo> = {
let mut map = HashMap::new();
map.insert(Op::Plus, OpInfo { prec: 1, assoc: Assoc::Left });
map.insert(Op::Minus, OpInfo { prec: 1, assoc: Assoc::Left });
map.insert(Op::Times, OpInfo { prec: 2, assoc: Assoc::Left });
map.insert(Op::Divide, OpInfo { prec: 2, assoc: Assoc::Left });
map.insert(Op::Exponent, OpInfo { prec: 3, assoc: Assoc::Right });
map
};
}
/// A cursor through valid tokens in a string that moves from left to right.
/// Example: 1 + (2 * 3)
///
/// Gets split into tokens and the current position moves to the right
/// starting from an empty invalid position.
///
/// [1] [+] [(] [2] [*] [3] [)]
/// ^
/// |
/// curr -------------------------->
pub struct Tokenizer<'a> {
curr_token: Option<Token>,
tokens: CaptureMatches<'static, 'a>,
}
impl<'a> Tokenizer<'a> {
pub fn new(text: &'a str) -> Tokenizer<'a> {
Tokenizer {
curr_token: None,
tokens: REGEX.captures_iter(text),
}
}
/// Returns the token the tokenizer is currently on.
pub fn curr_token(&self) -> Option<Token> {
self.curr_token
}
/// Moves the position to the next token.
pub fn get_next_token(&mut self) -> TokenResult<()> {
if let Some(token) = self.tokens.next() {
let token = match (token.get(1), token.get(2)) {
(Some(num), _) => Token::Num(i32::from_str(num.as_str())?),
(_, Some(op)) if op.as_str() == "(" => Token::LeftParen,
(_, Some(op)) if op.as_str() == ")" => Token::RightParen,
(_, Some(op)) if Op::from_str(op.as_str()).is_ok() => {
let op = Op::from_str(op.as_str())?;
Token::BinOp(op)
}
_ => return Err(TokenErr::InvalidToken),
};
self.curr_token = Some(token);
}
Ok(())
}
}
/// A precedence climbing parser for
/// parsing simple mathmatical expressions.
pub struct Parser<'a> {
tokenizer: Tokenizer<'a>,
}
impl<'a> Parser<'a> {
pub fn new(text: &'a str) -> ParseResult<Parser<'a>> {
let mut tokenizer = Tokenizer::new(text);
tokenizer.get_next_token()?;
Ok(Parser { tokenizer: tokenizer })
}
/// Parses a value which can be on either side of an operator expression,
/// and moves the tokenizer position to the token after the value.
///
/// Examples:
///
/// 1 * 2 + 3 => 1 * 2 + 3 , returns Expr::value(1)
/// ^ ^
/// | |
/// token pos token pos
///
/// 1 + (2 * 3) + 5 => 1 + (2 * 3) + 5 , returns Expr::BinOp(
/// ^ ^ Op::Times,
/// | | Box::new(Expr::Value(2)),
/// token pos token pos Box::new(Expr::Value(3))
/// )
pub fn parse_value(&mut self) -> ParseResult<Expr> {
match self.tokenizer.curr_token() {
Some(Token::LeftParen) => {
self.tokenizer.get_next_token()?;
let value_expr = self.parse_expr(1)?;
if self.tokenizer.curr_token() != Some(Token::RightParen) {
return Err(ParseErr::UnmatchedParen);
}
self.tokenizer.get_next_token()?;
Ok(value_expr)
}
Some(Token::Num(num)) => {
self.tokenizer.get_next_token()?;
Ok(Expr::Value(num))
}
Some(_) => Err(ParseErr::ExpectedValue),
None => Err(ParseErr::UnexpectedEnd),
}
}
/// Parses an binary operator expression.
///
/// One reason why parsing operators with precedence looks tricky is
/// because at a certain position you can't know if there's an operator
/// further ahead with a higher precedence.
///
/// For example, parsing 1 + 2 * 3 + 5 while being at the '+' position:
/// ```
/// 1 + 2 * 3 + 5 <--- how to know if theres an operator further ahead
/// ^ of the '+' that has higher precedence?
/// ```
///
/// Recursion is the natural solution to this problem. At the '+' you don't
/// care about the right hand side, you expect the recursive call to properly
/// handle the precedence.
///
/// ```
/// 1 + (recursive function yielding an expression) ......(extra stuff)
/// ^
/// ```
///
/// In this case the recursive function should yield (2 * 3) because (2 * 3) has higher
/// precedence and '+' is left associative so 1 has to be added to (2 * 3) before 5 is added.
///
/// The new expression 1 + (2 * 3) has proper precedence but there is still the + 5 left.
/// So (1 + (2 * 3)) is made the new left value and '+' the new operator and it repeats:
///
/// ```
/// (1 + (2 * 3)) + (recursive function yielding an expression) ....(extra stuff)
/// ```
///
/// This repeats until there is no more valid binary operators.
///
/// This function needs to take the minimum precedence so that the recursive calls can
/// know when to return early. The minimum precedence starts at 1 in the base call and
/// increases in the recursive calls based on the operator encountered.
pub fn parse_expr(&mut self, min_prec: i32) -> ParseResult<Expr> {
let mut left_value = self.parse_value()?;
// Continually link the left hand side to the recursively computed
// right hand side until there is a precedence conflict or an invalid token.
loop {
let token = self.tokenizer.curr_token();
let op = match token {
Some(Token::BinOp(ref op)) if OP_INFO[op].prec < min_prec => break,
Some(Token::BinOp(op)) => op,
_ => break,
};
let OpInfo { prec, assoc } = OP_INFO[&op];
// Left associative operators will pass in precedence + 1 in order
// to prevent same precedence operators on the right from beating them.
// Example: Precedence for '+' is 1. 1 + 1 = 2 which only allows '*', '/', and '^'
// to beat it since they are >= 2.
let next_min_prec = if assoc == Assoc::Left { prec + 1 } else { prec };
// Recursively compute the right hand expression and finish the binop expression.
self.tokenizer.get_next_token()?;
let right_value = self.parse_expr(next_min_prec)?;
left_value = Expr::BinOp(op, Box::new(left_value), Box::new(right_value));
}
Ok(left_value)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment