Created
October 27, 2022 14:17
-
-
Save mbillingr/3964d8f80d3a4311f443b125794353f8 to your computer and use it in GitHub Desktop.
Table based scanner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::collections::HashMap; | |
use std::hash::Hash; | |
pub type Result<T> = std::result::Result<T, Span>; | |
pub struct Scanner<'i, T: Clone> { | |
classifier_table: DefaultTable<char, usize>, | |
transition_table: Table<usize, Vec<usize>>, | |
accepting_states: Table<usize, T>, | |
input: &'i str, | |
cursor: usize, | |
} | |
impl<'i, T: Clone> Scanner<'i, T> { | |
pub fn new( | |
input: &'i str, | |
classifier_table: DefaultTable<char, usize>, | |
transition_table: Table<usize, Vec<usize>>, | |
accepting_states: Table<usize, T>, | |
) -> Self { | |
assert!(transition_table.contains(0)); | |
Scanner { | |
classifier_table, | |
transition_table, | |
accepting_states, | |
input, | |
cursor: 0, | |
} | |
} | |
pub fn reuse<'a>(self, input: &'a str) -> Scanner<'a, T> { | |
Scanner { input, cursor: 0, ..self } | |
} | |
pub fn next_word(&mut self) -> Result<Token<T>> { | |
let start = self.cursor; | |
let mut state = 0; | |
let mut stack = vec![]; | |
let end_state = self.last_state(); | |
while state != end_state { | |
let ch = match self.next_char() { | |
Some(ch) => ch, | |
None => break, | |
}; | |
if self.accepting_states.contains(state) { | |
stack.clear(); | |
} | |
stack.push(state); | |
print!("{state}"); | |
let cls = self.classifier_table.lookup(ch); | |
state = self.transition_table.lookup(state)[*cls]; | |
println!(" ==({ch})==> {state}"); | |
} | |
let final_pos = self.cursor; | |
while !self.accepting_states.contains(state) { | |
print!("{state}"); | |
state = stack | |
.pop() | |
.ok_or(Span::from_start_end(self.cursor, final_pos))?; | |
println!(" ==xxx==> {state}"); | |
self.rollback(); | |
} | |
Ok(Token { | |
span: Span::from_start_end(start, self.cursor), | |
kind: self.accepting_states.lookup(state).clone(), | |
}) | |
} | |
fn next_char(&mut self) -> Option<char> { | |
let ch = self.input[self.cursor..].chars().next()?; | |
self.cursor += ch.len_utf8(); | |
Some(ch) | |
} | |
fn rollback(&mut self) { | |
let (idx, _) = self.input[..self.cursor].char_indices().last().unwrap(); | |
self.cursor = idx; | |
} | |
fn last_state(&self) -> usize { | |
*self.transition_table.keys().max().unwrap() | |
} | |
} | |
#[derive(Debug, PartialEq)] | |
pub struct Token<T> { | |
pub span: Span, | |
pub kind: T, | |
} | |
#[derive(Debug, PartialEq)] | |
pub struct Span { | |
start: usize, | |
end: usize, | |
} | |
impl Span { | |
pub fn from_start_end(start: usize, end: usize) -> Self { | |
Span { start, end } | |
} | |
} | |
pub struct Table<K, V>(std::collections::HashMap<K, V>); | |
impl<K: Eq + Hash, V> Table<K, V> { | |
pub fn new(entries: impl IntoIterator<Item = (K, V)>) -> Self { | |
Table(entries.into_iter().collect()) | |
} | |
pub fn lookup(&self, key: K) -> &V { | |
&self.0[&key] | |
} | |
pub fn contains(&self, key: K) -> bool { | |
self.0.contains_key(&key) | |
} | |
pub fn keys(&self) -> impl Iterator<Item = &K> { | |
self.0.keys() | |
} | |
} | |
pub struct DefaultTable<K, V>(std::collections::HashMap<K, V>, V); | |
impl<K: Eq + Hash, V> DefaultTable<K, V> { | |
pub fn new(entries: impl IntoIterator<Item = (K, V)>, other: V) -> Self { | |
DefaultTable(entries.into_iter().collect(), other) | |
} | |
pub fn lookup(&self, key: K) -> &V { | |
self.0.get(&key).unwrap_or(&self.1) | |
} | |
pub fn contains(&self, key: K) -> bool { | |
self.0.contains_key(&key) | |
} | |
} | |
#[test] | |
fn degenerate_scanner() { | |
let classes = DefaultTable::new([], 0); | |
let transitions = Table::new([(0, vec![0])]); | |
let accepting_states = Table::new([(0, ())]); | |
let mut scanner = Scanner::new("", classes, transitions, accepting_states); | |
let result = scanner.next_word(); | |
assert_eq!( | |
result, | |
Ok(Token { | |
span: Span::from_start_end(0, 0), | |
kind: () | |
}) | |
); | |
} | |
#[test] | |
fn simple_scan() { | |
let classes = DefaultTable::new([('r', 0), ('1', 1)], 2); | |
let transitions = Table::new([ | |
(0, vec![1, 3, 3]), | |
(1, vec![3, 2, 3]), | |
(2, vec![3, 3, 3]), | |
(3, vec![3, 3, 3]), | |
]); | |
let accepting_states = Table::new([(2, ())]); | |
let mut scanner = Scanner::new("r1", classes, transitions, accepting_states); | |
let result = scanner.next_word(); | |
assert_eq!( | |
result, | |
Ok(Token { | |
span: Span::from_start_end(0, 2), | |
kind: () | |
}) | |
); | |
} | |
#[test] | |
fn roll_back_to_valid_token() { | |
let classes = DefaultTable::new([('r', 0), ('1', 1)], 2); | |
let transitions = Table::new([ | |
(0, vec![1, 3, 3]), | |
(1, vec![3, 2, 3]), | |
(2, vec![3, 3, 3]), | |
(3, vec![3, 3, 3]), | |
]); | |
let accepting_states = Table::new([(2, ())]); | |
let mut scanner = Scanner::new("r1r1", classes, transitions, accepting_states); | |
let results = [scanner.next_word(), scanner.next_word()]; | |
assert_eq!( | |
results, | |
[ | |
Ok(Token { | |
span: Span::from_start_end(0, 2), | |
kind: () | |
}), | |
Ok(Token { | |
span: Span::from_start_end(2, 4), | |
kind: () | |
}) | |
] | |
); | |
} | |
#[test] | |
fn invalid_token() { | |
let classes = DefaultTable::new([('r', 0), ('1', 1)], 2); | |
let transitions = Table::new([ | |
(0, vec![1, 3, 3]), | |
(1, vec![3, 2, 3]), | |
(2, vec![3, 3, 3]), | |
(3, vec![3, 3, 3]), | |
]); | |
let accepting_states = Table::new([(2, ())]); | |
let mut scanner = Scanner::new("rr", classes, transitions, accepting_states); | |
let result = scanner.next_word(); | |
assert_eq!(result, Err(Span::from_start_end(0, 2))); | |
} | |
#[test] | |
fn invalid_token_later_in_stream() { | |
let classes = DefaultTable::new([('r', 0), ('1', 1), ('2', 1)], 2); | |
let transitions = Table::new([ | |
(0, vec![1, 3, 3]), | |
(1, vec![3, 2, 3]), | |
(2, vec![3, 3, 3]), | |
(3, vec![3, 3, 3]), | |
]); | |
let accepting_states = Table::new([(2, ())]); | |
let mut scanner = Scanner::new("r1r2rr", classes, transitions, accepting_states); | |
let _ = scanner.next_word(); | |
let _ = scanner.next_word(); | |
let result = scanner.next_word(); | |
assert_eq!(result, Err(Span::from_start_end(4, 6))); | |
} | |
#[test] | |
fn multiple_accepting_tokens() { | |
let classes = DefaultTable::new([('f', 0), ('o', 1), ('b', 2), ('a', 3), ('r', 4)], 5); | |
let transitions = Table::new([ | |
(0, vec![1, 7, 4, 7, 7, 7]), // start | |
(1, vec![7, 2, 7, 7, 7, 7]), // f | |
(2, vec![7, 3, 7, 7, 7, 7]), // fo | |
(3, vec![7, 7, 7, 7, 7, 7]), // foo | |
(4, vec![7, 7, 7, 5, 7, 7]), // b | |
(5, vec![7, 7, 7, 7, 6, 7]), // ba | |
(6, vec![7, 7, 7, 7, 7, 7]), // bar | |
]); | |
let accepting_states = Table::new([(3, "FOO"), (6, "BAR")]); | |
let mut scanner = Scanner::new("foo", classes, transitions, accepting_states); | |
let result = scanner.next_word(); | |
assert_eq!( | |
result, | |
Ok(Token { | |
span: Span::from_start_end(0, 3), | |
kind: "FOO", | |
}) | |
); | |
let mut scanner = scanner.reuse("bar"); | |
let result = scanner.next_word(); | |
assert_eq!( | |
result, | |
Ok(Token { | |
span: Span::from_start_end(0, 3), | |
kind: "BAR", | |
}) | |
); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment