Skip to content

Instantly share code, notes, and snippets.

@DarinM223
Last active April 28, 2017 21:20
Show Gist options
  • Save DarinM223/15b168245107dbc2c9d10b57d3ac08f4 to your computer and use it in GitHub Desktop.
Save DarinM223/15b168245107dbc2c9d10b57d3ac08f4 to your computer and use it in GitHub Desktop.
Regex
#[macro_use]
extern crate log;
extern crate env_logger;
use std::borrow::Cow;
use std::collections::HashSet;
use std::mem;
use std::ptr;
use std::result;
use std::string::FromUtf8Error;
/// Error states that can happen while evaluating regex.
#[derive(Debug)]
pub enum RegexError {
Utf8Error(FromUtf8Error),
/// Error when attempting to pop a value from an empty stack.
StackEmpty,
/// Error when a stack is not empty after popping
/// what should be the last element.
StackNotEmptyAtEnd,
ParenthesisOverflow,
EndParenthesisWithoutBeginning,
NAtomIsZero, // TODO(DarinM223): make this more understandable
DanglingParenthesisAtEnd,
}
impl From<FromUtf8Error> for RegexError {
fn from(err: FromUtf8Error) -> RegexError {
RegexError::Utf8Error(err)
}
}
/// Result type for evaluating regex.
pub type Result<T> = result::Result<T, RegexError>;
/// NFA graph implementation.
#[derive(Debug)]
pub enum State {
/// A single transition to another state given a character input c.
/// c
/// () ---> out
Link { ch: u8, out: *mut State },
/// A split with a choice between out1 or out2.
/// Both transitions are epsilon so the NFA will have to perfectly guess which route
/// to take (which can be done by backtracking).
/// ---> out1
/// /
/// ()
/// \
/// ---> out2
Split { out1: *mut State, out2: *mut State },
/// A matching or accepting state.
/// (0)
Match,
}
impl State {
/// Allocates a state in memory and returns a new pointer to it.
pub unsafe fn new(state: State) -> *mut State {
let boxed_state = Box::new(state);
mem::transmute::<Box<State>, *mut State>(boxed_state)
}
/// Frees the state graph given a pointer to the start node.
pub unsafe fn free(state: *mut State) {
State::free_rec(state, &mut HashSet::new())
}
/// Helper function for State::free() that tracks already freed nodes to prevent
/// accidentally freeing the same pointer multiple times.
unsafe fn free_rec(state: *mut State, freed: &mut HashSet<*mut State>) {
if freed.contains(&state) {
return;
}
// Mark state as freed before recursing so that the child nodes
// won't try to free the parent node prematurely.
freed.insert(state);
match *state {
State::Link { out, .. } if !out.is_null() => State::free_rec(out, freed),
State::Split { out1, out2 } => {
if !out1.is_null() {
State::free_rec(out1, freed);
}
if !out1.is_null() {
State::free_rec(out2, freed);
}
}
_ => {}
}
mem::transmute::<*mut State, Box<State>>(state);
}
}
/// An incomplete graph of NFA states that contains dangling pointers that
/// need to be connected to another state.
pub struct Fragment {
/// The start state for the fragment.
start: *mut State,
/// A list of pointers to State pointers that are
/// not yet connected to anything.
unconnected_states: Vec<*mut *mut State>,
}
impl Fragment {
pub fn new(start: *mut State, out: Vec<*mut *mut State>) -> Fragment {
Fragment {
start: start,
unconnected_states: out,
}
}
/// Sets all the unconnected state pointers to point to the given state.
pub unsafe fn patch(&mut self, s: *mut State) {
for &state_ptr in self.unconnected_states.iter() {
(*state_ptr) = s;
}
}
}
/// Matches a string to a regular expression string and
/// returns a Result<bool> indicating whether the
/// string matched, didn't match, or caused an error.
pub fn regex_match<'a, 'b, S1, S2>(regex: S1, match_str: S2) -> Result<bool>
where S1: Into<Cow<'a, str>>,
S2: Into<Cow<'b, str>>
{
unsafe {
let postfix = re2post(regex)?;
debug!("Postfix: {:?}", postfix);
let start_state = post2nfa(postfix)?;
let result = simulate_nfa(start_state, match_str);
State::free(start_state);
Ok(result)
}
}
/// Converts a regular expression into postfix form.
pub fn re2post<'a, S>(regex: S) -> Result<String>
where S: Into<Cow<'a, str>>
{
#[derive(Clone, Copy)]
struct Paren {
nalt: i32,
natom: i32,
}
let regex = regex.into().into_owned();
let mut postfix = Vec::with_capacity(8000);
const PAREN_TOTAL_SIZE: usize = 100;
let mut parens = [Paren { nalt: 0, natom: 0 }; PAREN_TOTAL_SIZE];
let mut paren_idx = 0;
// TODO(DarinM223): What the hell are these things?
let mut nalt = 0;
let mut natom = 0;
for byte in regex.bytes() {
match byte {
b'(' => {
if natom > 1 {
natom -= 1;
postfix.push(b'.');
}
if paren_idx >= PAREN_TOTAL_SIZE {
return Err(RegexError::ParenthesisOverflow);
}
parens[paren_idx].nalt = nalt;
parens[paren_idx].natom = natom;
paren_idx += 1;
nalt = 0;
natom = 0;
}
b'|' => {
if natom == 0 {
return Err(RegexError::NAtomIsZero);
}
natom -= 1;
while natom > 0 {
postfix.push(b'.');
natom -= 1;
}
nalt += 1;
}
b')' => {
if paren_idx == 0 {
return Err(RegexError::EndParenthesisWithoutBeginning);
}
if natom == 0 {
return Err(RegexError::NAtomIsZero);
}
natom -= 1;
while natom > 0 {
postfix.push(b'.');
natom -= 1;
}
while nalt > 0 {
postfix.push(b'|');
nalt -= 1;
}
paren_idx -= 1;
nalt = parens[paren_idx].nalt;
natom = parens[paren_idx].natom;
natom += 1;
}
b'*' | b'+' | b'?' => {
if natom == 0 {
return Err(RegexError::NAtomIsZero);
}
postfix.push(byte);
}
_ => {
if natom > 1 {
natom -= 1;
postfix.push(b'.');
}
postfix.push(byte);
natom += 1;
}
}
}
if paren_idx != 0 {
return Err(RegexError::DanglingParenthesisAtEnd);
}
natom -= 1;
while natom > 0 {
postfix.push(b'.');
natom -= 1;
}
while nalt > 0 {
postfix.push(b'|');
nalt -= 1;
}
Ok(String::from_utf8(postfix)?)
}
/// Converts a regular expression in postfix form into an NFA state graph.
pub unsafe fn post2nfa<'a, S>(regex_postfix: S) -> Result<*mut State>
where S: Into<Cow<'a, str>>
{
let postfix = regex_postfix.into().into_owned();
// A stack of computed NFA fragments.
// Literals push new NFA fragments onto the stack,
// while operators pop fragments off the stack
// and then push a new fragment.
let mut stack: Vec<Fragment> = Vec::with_capacity(1000);
for byte in postfix.bytes() {
match byte {
//
// >(e1)--->(e2)---> ?
//
b'.' => {
debug!("Handling '.'");
let mut e2 = pop(&mut stack)?;
let mut e1 = pop(&mut stack)?;
e1.patch(e2.start);
// Unconnected states = e2's unconnected states
let unconnected_states = mem::replace(&mut e2.unconnected_states, vec![]);
stack.push(Fragment::new(e1.start, unconnected_states));
}
//
// --->(e1)---> ?
// /
// >()
// \
// --->(e2)---> ?
//
b'|' => {
debug!("Handling '|'");
let mut e2 = pop(&mut stack)?;
let mut e1 = pop(&mut stack)?;
let s = State::new(State::Split {
out1: e1.start,
out2: e2.start,
});
// Unconnected states = e1's unconnected states + e2's unconnected states
let mut unconnected_states = mem::replace(&mut e1.unconnected_states, vec![]);
unconnected_states.append(&mut e2.unconnected_states);
stack.push(Fragment::new(s, unconnected_states));
}
//
// --->(e)---> ?
// /
// >()
// \
// ----------> ?
//
b'?' => {
debug!("Handling '?'");
let mut e = pop(&mut stack)?;
let s = State::new(State::Split {
out1: e.start,
out2: ptr::null_mut(),
});
// Unconnected states = e's unconnected states + out2
let mut unconnected_states = mem::replace(&mut e.unconnected_states, vec![]);
if let State::Split { ref mut out2, .. } = *s {
unconnected_states.push(out2 as *mut *mut State);
}
stack.push(Fragment::new(s, unconnected_states));
}
//
// --->(e)---
// / \
// / /
// >()<-----------
// \
// ------------> ?
//
b'*' => {
debug!("Handling '*'");
let mut e = pop(&mut stack)?;
let s = State::new(State::Split {
out1: e.start,
out2: ptr::null_mut(),
});
e.patch(s);
// Unconnected state = out2
let mut unconnected_states = vec![];
if let State::Split { ref mut out2, .. } = *s {
unconnected_states.push(out2 as *mut *mut State);
}
stack.push(Fragment::new(s, unconnected_states));
}
//
// --------
// / \
// | |
// v /
// >(e)---->()---> ?
//
b'+' => {
debug!("Handling '+'");
let mut e = pop(&mut stack)?;
let s = State::new(State::Split {
out1: e.start,
out2: ptr::null_mut(),
});
e.patch(s);
// Unconnected state = out2
let mut unconnected_states = vec![];
if let State::Split { ref mut out2, .. } = *s {
unconnected_states.push(out2 as *mut *mut State);
}
stack.push(Fragment::new(e.start, unconnected_states));
}
//
// ch
// >()---> ?
//
ch => {
debug!("Handling '{}'", ch as char);
let s = State::new(State::Link {
ch: ch,
out: ptr::null_mut(),
});
// Unconnected state = out
let mut unconnected_states = vec![];
if let State::Link { ref mut out, .. } = *s {
unconnected_states.push(out as *mut *mut State);
}
stack.push(Fragment::new(s, unconnected_states));
}
}
}
let mut e = pop(&mut stack)?;
if !stack.is_empty() {
return Err(RegexError::StackNotEmptyAtEnd);
}
// Connect final unconnected states to match state.
let match_state = State::new(State::Match);
e.patch(match_state);
Ok(e.start)
}
/// Simulates an NFA on a given string.
pub unsafe fn simulate_nfa<'a, S>(start: *mut State, s: S) -> bool
where S: Into<Cow<'a, str>>
{
let s = s.into().into_owned();
let mut curr_states = Vec::new();
let mut next_states = Vec::new();
debug!("Start state: {:?}", *start);
add_state(&mut curr_states, start);
for byte in s.bytes() {
let c = byte & 0xFF;
debug!("Current states: {:?}", curr_states);
debug!("Next states: {:?}", next_states);
debug!("Char: {}", c as char);
step_nfa(c, &mut curr_states, &mut next_states);
mem::swap(&mut curr_states, &mut next_states);
}
// Return true if there is a Match state in the current states.
for state in curr_states {
if let State::Match = *state {
return true;
}
}
false
}
/// Recursively adds all Link or Match states for a given start state to the list of states.
/// It doesn't add null state pointers or states that already exist in the list.
unsafe fn add_state(states: &mut Vec<*mut State>, state: *mut State) {
if state.is_null() || states.contains(&state) {
return;
}
match *state {
State::Link { .. } |
State::Match => {
debug!("Adding state {:?}", *state);
states.push(state);
}
State::Split { out1, out2 } => {
debug!("Adding split state");
add_state(states, out1);
add_state(states, out2);
}
}
}
/// Populates the next list of states given the current list of states and a character input.
unsafe fn step_nfa(c: u8, curr_states: &mut Vec<*mut State>, next_states: &mut Vec<*mut State>) {
next_states.clear();
for &state in curr_states.iter() {
if let State::Link { ch, out, .. } = *state {
if ch == c {
add_state(next_states, out);
}
}
}
}
/// Helper function for popping a value from a Vec stack that
/// returns a regex Result instead of an Option.
fn pop(stack: &mut Vec<Fragment>) -> Result<Fragment> {
match stack.pop() {
Some(frag) => Ok(frag),
None => Err(RegexError::StackEmpty),
}
}
fn main() {
env_logger::init().unwrap();
assert_eq!(regex_match("a?a?a?aaa", "aaa").unwrap(), true);
assert_eq!(regex_match("a?a?a?aaa", "aab").unwrap(), false);
assert_eq!(regex_match("a?a?a?aaa", "aaaaa").unwrap(), true);
println!("Tests finished successfully!");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment