Last active
May 25, 2017 09:39
-
-
Save DarinM223/cce714954ce675da28e18c0807da380d to your computer and use it in GitHub Desktop.
Simple parser for arithmetic expressions using precedence climbing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#[macro_use] | |
extern crate lazy_static; | |
extern crate regex; | |
use regex::{CaptureMatches, Regex}; | |
use std::collections::HashMap; | |
use std::num::ParseIntError; | |
use std::result; | |
use std::str::FromStr; | |
#[derive(Debug)] | |
pub enum TokenErr { | |
InvalidToken, | |
ParseIntError(ParseIntError), | |
} | |
impl From<ParseIntError> for TokenErr { | |
fn from(err: ParseIntError) -> TokenErr { | |
TokenErr::ParseIntError(err) | |
} | |
} | |
#[derive(Debug)] | |
pub enum ParseErr { | |
UnexpectedEnd, | |
ExpectedValue, | |
UnmatchedParen, | |
TokenErr(TokenErr), | |
} | |
impl From<TokenErr> for ParseErr { | |
fn from(err: TokenErr) -> ParseErr { | |
ParseErr::TokenErr(err) | |
} | |
} | |
pub type TokenResult<T> = result::Result<T, TokenErr>; | |
pub type ParseResult<T> = result::Result<T, ParseErr>; | |
/// Determines whether similar precedence operators | |
/// are grouped from left to right or right to left. | |
/// | |
/// Left association example: | |
/// 1 + 2 + 3 => (1 + 2) + 3 | |
/// | |
/// Right association example: | |
/// 2 ^ 3 ^ 4 => 2 ^ (3 ^ 4) | |
#[derive(Clone, Copy, PartialEq, Eq)] | |
enum Assoc { | |
Left, | |
Right, | |
} | |
/// A parsable operator. Right now restricted | |
/// to basic math operations and exponent. | |
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] | |
pub enum Op { | |
Plus, | |
Minus, | |
Times, | |
Divide, | |
Exponent, | |
} | |
impl Op { | |
pub fn from_str<'a>(s: &'a str) -> TokenResult<Op> { | |
match s { | |
"+" => Ok(Op::Plus), | |
"-" => Ok(Op::Minus), | |
"*" => Ok(Op::Times), | |
"/" => Ok(Op::Divide), | |
"^" => Ok(Op::Exponent), | |
_ => Err(TokenErr::InvalidToken), | |
} | |
} | |
} | |
/// A token returned from the tokenizer. | |
#[derive(Clone, Copy, Debug, PartialEq)] | |
pub enum Token { | |
BinOp(Op), | |
Num(i32), | |
LeftParen, | |
RightParen, | |
} | |
/// An expression returned from the parser. | |
/// Because the parser is only for simple number operations, | |
/// an expression can only be an integer number or a binary operation. | |
#[derive(Debug, PartialEq)] | |
pub enum Expr { | |
Value(i32), | |
BinOp(Op, Box<Expr>, Box<Expr>), | |
} | |
struct OpInfo { | |
prec: i32, | |
assoc: Assoc, | |
} | |
/// Global "constants". | |
lazy_static! { | |
/// Regex used by the tokenizer to parse numbers and operators. | |
static ref REGEX: Regex = Regex::new(r###"\s*(?:(\d+)|(.))"###).unwrap(); | |
/// Table of operator to the operator's precedence and association. | |
static ref OP_INFO: HashMap<Op, OpInfo> = { | |
let mut map = HashMap::new(); | |
map.insert(Op::Plus, OpInfo { prec: 1, assoc: Assoc::Left }); | |
map.insert(Op::Minus, OpInfo { prec: 1, assoc: Assoc::Left }); | |
map.insert(Op::Times, OpInfo { prec: 2, assoc: Assoc::Left }); | |
map.insert(Op::Divide, OpInfo { prec: 2, assoc: Assoc::Left }); | |
map.insert(Op::Exponent, OpInfo { prec: 3, assoc: Assoc::Right }); | |
map | |
}; | |
} | |
/// A cursor through valid tokens in a string that moves from left to right. | |
/// Example: 1 + (2 * 3) | |
/// | |
/// Gets split into tokens and the current position moves to the right | |
/// starting from an empty invalid position. | |
/// | |
/// [1] [+] [(] [2] [*] [3] [)] | |
/// ^ | |
/// | | |
/// curr --------------------------> | |
pub struct Tokenizer<'a> { | |
curr_token: Option<Token>, | |
tokens: CaptureMatches<'static, 'a>, | |
} | |
impl<'a> Tokenizer<'a> { | |
pub fn new(text: &'a str) -> Tokenizer<'a> { | |
Tokenizer { | |
curr_token: None, | |
tokens: REGEX.captures_iter(text), | |
} | |
} | |
/// Returns the token the tokenizer is currently on. | |
pub fn curr_token(&self) -> Option<Token> { | |
self.curr_token | |
} | |
/// Moves the position to the next token. | |
pub fn get_next_token(&mut self) -> TokenResult<()> { | |
if let Some(token) = self.tokens.next() { | |
let token = match (token.get(1), token.get(2)) { | |
(Some(num), _) => Token::Num(i32::from_str(num.as_str())?), | |
(_, Some(op)) if op.as_str() == "(" => Token::LeftParen, | |
(_, Some(op)) if op.as_str() == ")" => Token::RightParen, | |
(_, Some(op)) if Op::from_str(op.as_str()).is_ok() => { | |
let op = Op::from_str(op.as_str())?; | |
Token::BinOp(op) | |
} | |
_ => return Err(TokenErr::InvalidToken), | |
}; | |
self.curr_token = Some(token); | |
} | |
Ok(()) | |
} | |
} | |
/// A precedence climbing parser for | |
/// parsing simple mathmatical expressions. | |
pub struct Parser<'a> { | |
tokenizer: Tokenizer<'a>, | |
} | |
impl<'a> Parser<'a> { | |
pub fn new(text: &'a str) -> ParseResult<Parser<'a>> { | |
let mut tokenizer = Tokenizer::new(text); | |
tokenizer.get_next_token()?; | |
Ok(Parser { tokenizer: tokenizer }) | |
} | |
/// Parses a value which can be on either side of an operator expression, | |
/// and moves the tokenizer position to the token after the value. | |
/// | |
/// Examples: | |
/// | |
/// 1 * 2 + 3 => 1 * 2 + 3 , returns Expr::value(1) | |
/// ^ ^ | |
/// | | | |
/// token pos token pos | |
/// | |
/// 1 + (2 * 3) + 5 => 1 + (2 * 3) + 5 , returns Expr::BinOp( | |
/// ^ ^ Op::Times, | |
/// | | Box::new(Expr::Value(2)), | |
/// token pos token pos Box::new(Expr::Value(3)) | |
/// ) | |
pub fn parse_value(&mut self) -> ParseResult<Expr> { | |
match self.tokenizer.curr_token() { | |
Some(Token::LeftParen) => { | |
self.tokenizer.get_next_token()?; | |
let value_expr = self.parse_expr(1)?; | |
if self.tokenizer.curr_token() != Some(Token::RightParen) { | |
return Err(ParseErr::UnmatchedParen); | |
} | |
self.tokenizer.get_next_token()?; | |
Ok(value_expr) | |
} | |
Some(Token::Num(num)) => { | |
self.tokenizer.get_next_token()?; | |
Ok(Expr::Value(num)) | |
} | |
Some(_) => Err(ParseErr::ExpectedValue), | |
None => Err(ParseErr::UnexpectedEnd), | |
} | |
} | |
/// Parses an binary operator expression. | |
/// | |
/// One reason why parsing operators with precedence looks tricky is | |
/// because at a certain position you can't know if there's an operator | |
/// further ahead with a higher precedence. | |
/// | |
/// For example, parsing 1 + 2 * 3 + 5 while being at the '+' position: | |
/// ``` | |
/// 1 + 2 * 3 + 5 <--- how to know if theres an operator further ahead | |
/// ^ of the '+' that has higher precedence? | |
/// ``` | |
/// | |
/// Recursion is the natural solution to this problem. At the '+' you don't | |
/// care about the right hand side, you expect the recursive call to properly | |
/// handle the precedence. | |
/// | |
/// ``` | |
/// 1 + (recursive function yielding an expression) ......(extra stuff) | |
/// ^ | |
/// ``` | |
/// | |
/// In this case the recursive function should yield (2 * 3) because (2 * 3) has higher | |
/// precedence and '+' is left associative so 1 has to be added to (2 * 3) before 5 is added. | |
/// | |
/// The new expression 1 + (2 * 3) has proper precedence but there is still the + 5 left. | |
/// So (1 + (2 * 3)) is made the new left value and '+' the new operator and it repeats: | |
/// | |
/// ``` | |
/// (1 + (2 * 3)) + (recursive function yielding an expression) ....(extra stuff) | |
/// ``` | |
/// | |
/// This repeats until there is no more valid binary operators. | |
/// | |
/// This function needs to take the minimum precedence so that the recursive calls can | |
/// know when to return early. The minimum precedence starts at 1 in the base call and | |
/// increases in the recursive calls based on the operator encountered. | |
pub fn parse_expr(&mut self, min_prec: i32) -> ParseResult<Expr> { | |
let mut left_value = self.parse_value()?; | |
// Continually link the left hand side to the recursively computed | |
// right hand side until there is a precedence conflict or an invalid token. | |
loop { | |
let token = self.tokenizer.curr_token(); | |
let op = match token { | |
Some(Token::BinOp(ref op)) if OP_INFO[op].prec < min_prec => break, | |
Some(Token::BinOp(op)) => op, | |
_ => break, | |
}; | |
let OpInfo { prec, assoc } = OP_INFO[&op]; | |
// Left associative operators will pass in precedence + 1 in order | |
// to prevent same precedence operators on the right from beating them. | |
// Example: Precedence for '+' is 1. 1 + 1 = 2 which only allows '*', '/', and '^' | |
// to beat it since they are >= 2. | |
let next_min_prec = if assoc == Assoc::Left { prec + 1 } else { prec }; | |
// Recursively compute the right hand expression and finish the binop expression. | |
self.tokenizer.get_next_token()?; | |
let right_value = self.parse_expr(next_min_prec)?; | |
left_value = Expr::BinOp(op, Box::new(left_value), Box::new(right_value)); | |
} | |
Ok(left_value) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment