Last active
April 28, 2017 21:20
-
-
Save DarinM223/15b168245107dbc2c9d10b57d3ac08f4 to your computer and use it in GitHub Desktop.
Regex
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#[macro_use] | |
extern crate log; | |
extern crate env_logger; | |
use std::borrow::Cow; | |
use std::collections::HashSet; | |
use std::mem; | |
use std::ptr; | |
use std::result; | |
use std::string::FromUtf8Error; | |
/// Error states that can happen while evaluating regex. | |
#[derive(Debug)] | |
pub enum RegexError { | |
Utf8Error(FromUtf8Error), | |
/// Error when attempting to pop a value from an empty stack. | |
StackEmpty, | |
/// Error when a stack is not empty after popping | |
/// what should be the last element. | |
StackNotEmptyAtEnd, | |
ParenthesisOverflow, | |
EndParenthesisWithoutBeginning, | |
NAtomIsZero, // TODO(DarinM223): make this more understandable | |
DanglingParenthesisAtEnd, | |
} | |
impl From<FromUtf8Error> for RegexError { | |
fn from(err: FromUtf8Error) -> RegexError { | |
RegexError::Utf8Error(err) | |
} | |
} | |
/// Result type for evaluating regex. | |
pub type Result<T> = result::Result<T, RegexError>; | |
/// NFA graph implementation. | |
#[derive(Debug)] | |
pub enum State { | |
/// A single transition to another state given a character input c. | |
/// c | |
/// () ---> out | |
Link { ch: u8, out: *mut State }, | |
/// A split with a choice between out1 or out2. | |
/// Both transitions are epsilon so the NFA will have to perfectly guess which route | |
/// to take (which can be done by backtracking). | |
/// ---> out1 | |
/// / | |
/// () | |
/// \ | |
/// ---> out2 | |
Split { out1: *mut State, out2: *mut State }, | |
/// A matching or accepting state. | |
/// (0) | |
Match, | |
} | |
impl State { | |
/// Allocates a state in memory and returns a new pointer to it. | |
pub unsafe fn new(state: State) -> *mut State { | |
let boxed_state = Box::new(state); | |
mem::transmute::<Box<State>, *mut State>(boxed_state) | |
} | |
/// Frees the state graph given a pointer to the start node. | |
pub unsafe fn free(state: *mut State) { | |
State::free_rec(state, &mut HashSet::new()) | |
} | |
/// Helper function for State::free() that tracks already freed nodes to prevent | |
/// accidentally freeing the same pointer multiple times. | |
unsafe fn free_rec(state: *mut State, freed: &mut HashSet<*mut State>) { | |
if freed.contains(&state) { | |
return; | |
} | |
// Mark state as freed before recursing so that the child nodes | |
// won't try to free the parent node prematurely. | |
freed.insert(state); | |
match *state { | |
State::Link { out, .. } if !out.is_null() => State::free_rec(out, freed), | |
State::Split { out1, out2 } => { | |
if !out1.is_null() { | |
State::free_rec(out1, freed); | |
} | |
if !out1.is_null() { | |
State::free_rec(out2, freed); | |
} | |
} | |
_ => {} | |
} | |
mem::transmute::<*mut State, Box<State>>(state); | |
} | |
} | |
/// An incomplete graph of NFA states that contains dangling pointers that | |
/// need to be connected to another state. | |
pub struct Fragment { | |
/// The start state for the fragment. | |
start: *mut State, | |
/// A list of pointers to State pointers that are | |
/// not yet connected to anything. | |
unconnected_states: Vec<*mut *mut State>, | |
} | |
impl Fragment { | |
pub fn new(start: *mut State, out: Vec<*mut *mut State>) -> Fragment { | |
Fragment { | |
start: start, | |
unconnected_states: out, | |
} | |
} | |
/// Sets all the unconnected state pointers to point to the given state. | |
pub unsafe fn patch(&mut self, s: *mut State) { | |
for &state_ptr in self.unconnected_states.iter() { | |
(*state_ptr) = s; | |
} | |
} | |
} | |
/// Matches a string to a regular expression string and | |
/// returns a Result<bool> indicating whether the | |
/// string matched, didn't match, or caused an error. | |
pub fn regex_match<'a, 'b, S1, S2>(regex: S1, match_str: S2) -> Result<bool> | |
where S1: Into<Cow<'a, str>>, | |
S2: Into<Cow<'b, str>> | |
{ | |
unsafe { | |
let postfix = re2post(regex)?; | |
debug!("Postfix: {:?}", postfix); | |
let start_state = post2nfa(postfix)?; | |
let result = simulate_nfa(start_state, match_str); | |
State::free(start_state); | |
Ok(result) | |
} | |
} | |
/// Converts a regular expression into postfix form. | |
pub fn re2post<'a, S>(regex: S) -> Result<String> | |
where S: Into<Cow<'a, str>> | |
{ | |
#[derive(Clone, Copy)] | |
struct Paren { | |
nalt: i32, | |
natom: i32, | |
} | |
let regex = regex.into().into_owned(); | |
let mut postfix = Vec::with_capacity(8000); | |
const PAREN_TOTAL_SIZE: usize = 100; | |
let mut parens = [Paren { nalt: 0, natom: 0 }; PAREN_TOTAL_SIZE]; | |
let mut paren_idx = 0; | |
// TODO(DarinM223): What the hell are these things? | |
let mut nalt = 0; | |
let mut natom = 0; | |
for byte in regex.bytes() { | |
match byte { | |
b'(' => { | |
if natom > 1 { | |
natom -= 1; | |
postfix.push(b'.'); | |
} | |
if paren_idx >= PAREN_TOTAL_SIZE { | |
return Err(RegexError::ParenthesisOverflow); | |
} | |
parens[paren_idx].nalt = nalt; | |
parens[paren_idx].natom = natom; | |
paren_idx += 1; | |
nalt = 0; | |
natom = 0; | |
} | |
b'|' => { | |
if natom == 0 { | |
return Err(RegexError::NAtomIsZero); | |
} | |
natom -= 1; | |
while natom > 0 { | |
postfix.push(b'.'); | |
natom -= 1; | |
} | |
nalt += 1; | |
} | |
b')' => { | |
if paren_idx == 0 { | |
return Err(RegexError::EndParenthesisWithoutBeginning); | |
} | |
if natom == 0 { | |
return Err(RegexError::NAtomIsZero); | |
} | |
natom -= 1; | |
while natom > 0 { | |
postfix.push(b'.'); | |
natom -= 1; | |
} | |
while nalt > 0 { | |
postfix.push(b'|'); | |
nalt -= 1; | |
} | |
paren_idx -= 1; | |
nalt = parens[paren_idx].nalt; | |
natom = parens[paren_idx].natom; | |
natom += 1; | |
} | |
b'*' | b'+' | b'?' => { | |
if natom == 0 { | |
return Err(RegexError::NAtomIsZero); | |
} | |
postfix.push(byte); | |
} | |
_ => { | |
if natom > 1 { | |
natom -= 1; | |
postfix.push(b'.'); | |
} | |
postfix.push(byte); | |
natom += 1; | |
} | |
} | |
} | |
if paren_idx != 0 { | |
return Err(RegexError::DanglingParenthesisAtEnd); | |
} | |
natom -= 1; | |
while natom > 0 { | |
postfix.push(b'.'); | |
natom -= 1; | |
} | |
while nalt > 0 { | |
postfix.push(b'|'); | |
nalt -= 1; | |
} | |
Ok(String::from_utf8(postfix)?) | |
} | |
/// Converts a regular expression in postfix form into an NFA state graph. | |
pub unsafe fn post2nfa<'a, S>(regex_postfix: S) -> Result<*mut State> | |
where S: Into<Cow<'a, str>> | |
{ | |
let postfix = regex_postfix.into().into_owned(); | |
// A stack of computed NFA fragments. | |
// Literals push new NFA fragments onto the stack, | |
// while operators pop fragments off the stack | |
// and then push a new fragment. | |
let mut stack: Vec<Fragment> = Vec::with_capacity(1000); | |
for byte in postfix.bytes() { | |
match byte { | |
// | |
// >(e1)--->(e2)---> ? | |
// | |
b'.' => { | |
debug!("Handling '.'"); | |
let mut e2 = pop(&mut stack)?; | |
let mut e1 = pop(&mut stack)?; | |
e1.patch(e2.start); | |
// Unconnected states = e2's unconnected states | |
let unconnected_states = mem::replace(&mut e2.unconnected_states, vec![]); | |
stack.push(Fragment::new(e1.start, unconnected_states)); | |
} | |
// | |
// --->(e1)---> ? | |
// / | |
// >() | |
// \ | |
// --->(e2)---> ? | |
// | |
b'|' => { | |
debug!("Handling '|'"); | |
let mut e2 = pop(&mut stack)?; | |
let mut e1 = pop(&mut stack)?; | |
let s = State::new(State::Split { | |
out1: e1.start, | |
out2: e2.start, | |
}); | |
// Unconnected states = e1's unconnected states + e2's unconnected states | |
let mut unconnected_states = mem::replace(&mut e1.unconnected_states, vec![]); | |
unconnected_states.append(&mut e2.unconnected_states); | |
stack.push(Fragment::new(s, unconnected_states)); | |
} | |
// | |
// --->(e)---> ? | |
// / | |
// >() | |
// \ | |
// ----------> ? | |
// | |
b'?' => { | |
debug!("Handling '?'"); | |
let mut e = pop(&mut stack)?; | |
let s = State::new(State::Split { | |
out1: e.start, | |
out2: ptr::null_mut(), | |
}); | |
// Unconnected states = e's unconnected states + out2 | |
let mut unconnected_states = mem::replace(&mut e.unconnected_states, vec![]); | |
if let State::Split { ref mut out2, .. } = *s { | |
unconnected_states.push(out2 as *mut *mut State); | |
} | |
stack.push(Fragment::new(s, unconnected_states)); | |
} | |
// | |
// --->(e)--- | |
// / \ | |
// / / | |
// >()<----------- | |
// \ | |
// ------------> ? | |
// | |
b'*' => { | |
debug!("Handling '*'"); | |
let mut e = pop(&mut stack)?; | |
let s = State::new(State::Split { | |
out1: e.start, | |
out2: ptr::null_mut(), | |
}); | |
e.patch(s); | |
// Unconnected state = out2 | |
let mut unconnected_states = vec![]; | |
if let State::Split { ref mut out2, .. } = *s { | |
unconnected_states.push(out2 as *mut *mut State); | |
} | |
stack.push(Fragment::new(s, unconnected_states)); | |
} | |
// | |
// -------- | |
// / \ | |
// | | | |
// v / | |
// >(e)---->()---> ? | |
// | |
b'+' => { | |
debug!("Handling '+'"); | |
let mut e = pop(&mut stack)?; | |
let s = State::new(State::Split { | |
out1: e.start, | |
out2: ptr::null_mut(), | |
}); | |
e.patch(s); | |
// Unconnected state = out2 | |
let mut unconnected_states = vec![]; | |
if let State::Split { ref mut out2, .. } = *s { | |
unconnected_states.push(out2 as *mut *mut State); | |
} | |
stack.push(Fragment::new(e.start, unconnected_states)); | |
} | |
// | |
// ch | |
// >()---> ? | |
// | |
ch => { | |
debug!("Handling '{}'", ch as char); | |
let s = State::new(State::Link { | |
ch: ch, | |
out: ptr::null_mut(), | |
}); | |
// Unconnected state = out | |
let mut unconnected_states = vec![]; | |
if let State::Link { ref mut out, .. } = *s { | |
unconnected_states.push(out as *mut *mut State); | |
} | |
stack.push(Fragment::new(s, unconnected_states)); | |
} | |
} | |
} | |
let mut e = pop(&mut stack)?; | |
if !stack.is_empty() { | |
return Err(RegexError::StackNotEmptyAtEnd); | |
} | |
// Connect final unconnected states to match state. | |
let match_state = State::new(State::Match); | |
e.patch(match_state); | |
Ok(e.start) | |
} | |
/// Simulates an NFA on a given string. | |
pub unsafe fn simulate_nfa<'a, S>(start: *mut State, s: S) -> bool | |
where S: Into<Cow<'a, str>> | |
{ | |
let s = s.into().into_owned(); | |
let mut curr_states = Vec::new(); | |
let mut next_states = Vec::new(); | |
debug!("Start state: {:?}", *start); | |
add_state(&mut curr_states, start); | |
for byte in s.bytes() { | |
let c = byte & 0xFF; | |
debug!("Current states: {:?}", curr_states); | |
debug!("Next states: {:?}", next_states); | |
debug!("Char: {}", c as char); | |
step_nfa(c, &mut curr_states, &mut next_states); | |
mem::swap(&mut curr_states, &mut next_states); | |
} | |
// Return true if there is a Match state in the current states. | |
for state in curr_states { | |
if let State::Match = *state { | |
return true; | |
} | |
} | |
false | |
} | |
/// Recursively adds all Link or Match states for a given start state to the list of states. | |
/// It doesn't add null state pointers or states that already exist in the list. | |
unsafe fn add_state(states: &mut Vec<*mut State>, state: *mut State) { | |
if state.is_null() || states.contains(&state) { | |
return; | |
} | |
match *state { | |
State::Link { .. } | | |
State::Match => { | |
debug!("Adding state {:?}", *state); | |
states.push(state); | |
} | |
State::Split { out1, out2 } => { | |
debug!("Adding split state"); | |
add_state(states, out1); | |
add_state(states, out2); | |
} | |
} | |
} | |
/// Populates the next list of states given the current list of states and a character input. | |
unsafe fn step_nfa(c: u8, curr_states: &mut Vec<*mut State>, next_states: &mut Vec<*mut State>) { | |
next_states.clear(); | |
for &state in curr_states.iter() { | |
if let State::Link { ch, out, .. } = *state { | |
if ch == c { | |
add_state(next_states, out); | |
} | |
} | |
} | |
} | |
/// Helper function for popping a value from a Vec stack that | |
/// returns a regex Result instead of an Option. | |
fn pop(stack: &mut Vec<Fragment>) -> Result<Fragment> { | |
match stack.pop() { | |
Some(frag) => Ok(frag), | |
None => Err(RegexError::StackEmpty), | |
} | |
} | |
fn main() { | |
env_logger::init().unwrap(); | |
assert_eq!(regex_match("a?a?a?aaa", "aaa").unwrap(), true); | |
assert_eq!(regex_match("a?a?a?aaa", "aab").unwrap(), false); | |
assert_eq!(regex_match("a?a?a?aaa", "aaaaa").unwrap(), true); | |
println!("Tests finished successfully!"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment