Skip to content

Instantly share code, notes, and snippets.

@mbillingr
Created October 27, 2022 14:17
Show Gist options
  • Save mbillingr/3964d8f80d3a4311f443b125794353f8 to your computer and use it in GitHub Desktop.
Save mbillingr/3964d8f80d3a4311f443b125794353f8 to your computer and use it in GitHub Desktop.
Table based scanner
use std::collections::HashMap;
use std::hash::Hash;
pub type Result<T> = std::result::Result<T, Span>;
pub struct Scanner<'i, T: Clone> {
classifier_table: DefaultTable<char, usize>,
transition_table: Table<usize, Vec<usize>>,
accepting_states: Table<usize, T>,
input: &'i str,
cursor: usize,
}
impl<'i, T: Clone> Scanner<'i, T> {
pub fn new(
input: &'i str,
classifier_table: DefaultTable<char, usize>,
transition_table: Table<usize, Vec<usize>>,
accepting_states: Table<usize, T>,
) -> Self {
assert!(transition_table.contains(0));
Scanner {
classifier_table,
transition_table,
accepting_states,
input,
cursor: 0,
}
}
pub fn reuse<'a>(self, input: &'a str) -> Scanner<'a, T> {
Scanner { input, cursor: 0, ..self }
}
pub fn next_word(&mut self) -> Result<Token<T>> {
let start = self.cursor;
let mut state = 0;
let mut stack = vec![];
let end_state = self.last_state();
while state != end_state {
let ch = match self.next_char() {
Some(ch) => ch,
None => break,
};
if self.accepting_states.contains(state) {
stack.clear();
}
stack.push(state);
print!("{state}");
let cls = self.classifier_table.lookup(ch);
state = self.transition_table.lookup(state)[*cls];
println!(" ==({ch})==> {state}");
}
let final_pos = self.cursor;
while !self.accepting_states.contains(state) {
print!("{state}");
state = stack
.pop()
.ok_or(Span::from_start_end(self.cursor, final_pos))?;
println!(" ==xxx==> {state}");
self.rollback();
}
Ok(Token {
span: Span::from_start_end(start, self.cursor),
kind: self.accepting_states.lookup(state).clone(),
})
}
fn next_char(&mut self) -> Option<char> {
let ch = self.input[self.cursor..].chars().next()?;
self.cursor += ch.len_utf8();
Some(ch)
}
fn rollback(&mut self) {
let (idx, _) = self.input[..self.cursor].char_indices().last().unwrap();
self.cursor = idx;
}
fn last_state(&self) -> usize {
*self.transition_table.keys().max().unwrap()
}
}
#[derive(Debug, PartialEq)]
pub struct Token<T> {
pub span: Span,
pub kind: T,
}
#[derive(Debug, PartialEq)]
pub struct Span {
start: usize,
end: usize,
}
impl Span {
pub fn from_start_end(start: usize, end: usize) -> Self {
Span { start, end }
}
}
pub struct Table<K, V>(std::collections::HashMap<K, V>);
impl<K: Eq + Hash, V> Table<K, V> {
pub fn new(entries: impl IntoIterator<Item = (K, V)>) -> Self {
Table(entries.into_iter().collect())
}
pub fn lookup(&self, key: K) -> &V {
&self.0[&key]
}
pub fn contains(&self, key: K) -> bool {
self.0.contains_key(&key)
}
pub fn keys(&self) -> impl Iterator<Item = &K> {
self.0.keys()
}
}
pub struct DefaultTable<K, V>(std::collections::HashMap<K, V>, V);
impl<K: Eq + Hash, V> DefaultTable<K, V> {
pub fn new(entries: impl IntoIterator<Item = (K, V)>, other: V) -> Self {
DefaultTable(entries.into_iter().collect(), other)
}
pub fn lookup(&self, key: K) -> &V {
self.0.get(&key).unwrap_or(&self.1)
}
pub fn contains(&self, key: K) -> bool {
self.0.contains_key(&key)
}
}
#[test]
fn degenerate_scanner() {
let classes = DefaultTable::new([], 0);
let transitions = Table::new([(0, vec![0])]);
let accepting_states = Table::new([(0, ())]);
let mut scanner = Scanner::new("", classes, transitions, accepting_states);
let result = scanner.next_word();
assert_eq!(
result,
Ok(Token {
span: Span::from_start_end(0, 0),
kind: ()
})
);
}
#[test]
fn simple_scan() {
let classes = DefaultTable::new([('r', 0), ('1', 1)], 2);
let transitions = Table::new([
(0, vec![1, 3, 3]),
(1, vec![3, 2, 3]),
(2, vec![3, 3, 3]),
(3, vec![3, 3, 3]),
]);
let accepting_states = Table::new([(2, ())]);
let mut scanner = Scanner::new("r1", classes, transitions, accepting_states);
let result = scanner.next_word();
assert_eq!(
result,
Ok(Token {
span: Span::from_start_end(0, 2),
kind: ()
})
);
}
#[test]
fn roll_back_to_valid_token() {
let classes = DefaultTable::new([('r', 0), ('1', 1)], 2);
let transitions = Table::new([
(0, vec![1, 3, 3]),
(1, vec![3, 2, 3]),
(2, vec![3, 3, 3]),
(3, vec![3, 3, 3]),
]);
let accepting_states = Table::new([(2, ())]);
let mut scanner = Scanner::new("r1r1", classes, transitions, accepting_states);
let results = [scanner.next_word(), scanner.next_word()];
assert_eq!(
results,
[
Ok(Token {
span: Span::from_start_end(0, 2),
kind: ()
}),
Ok(Token {
span: Span::from_start_end(2, 4),
kind: ()
})
]
);
}
#[test]
fn invalid_token() {
let classes = DefaultTable::new([('r', 0), ('1', 1)], 2);
let transitions = Table::new([
(0, vec![1, 3, 3]),
(1, vec![3, 2, 3]),
(2, vec![3, 3, 3]),
(3, vec![3, 3, 3]),
]);
let accepting_states = Table::new([(2, ())]);
let mut scanner = Scanner::new("rr", classes, transitions, accepting_states);
let result = scanner.next_word();
assert_eq!(result, Err(Span::from_start_end(0, 2)));
}
#[test]
fn invalid_token_later_in_stream() {
let classes = DefaultTable::new([('r', 0), ('1', 1), ('2', 1)], 2);
let transitions = Table::new([
(0, vec![1, 3, 3]),
(1, vec![3, 2, 3]),
(2, vec![3, 3, 3]),
(3, vec![3, 3, 3]),
]);
let accepting_states = Table::new([(2, ())]);
let mut scanner = Scanner::new("r1r2rr", classes, transitions, accepting_states);
let _ = scanner.next_word();
let _ = scanner.next_word();
let result = scanner.next_word();
assert_eq!(result, Err(Span::from_start_end(4, 6)));
}
#[test]
fn multiple_accepting_tokens() {
let classes = DefaultTable::new([('f', 0), ('o', 1), ('b', 2), ('a', 3), ('r', 4)], 5);
let transitions = Table::new([
(0, vec![1, 7, 4, 7, 7, 7]), // start
(1, vec![7, 2, 7, 7, 7, 7]), // f
(2, vec![7, 3, 7, 7, 7, 7]), // fo
(3, vec![7, 7, 7, 7, 7, 7]), // foo
(4, vec![7, 7, 7, 5, 7, 7]), // b
(5, vec![7, 7, 7, 7, 6, 7]), // ba
(6, vec![7, 7, 7, 7, 7, 7]), // bar
]);
let accepting_states = Table::new([(3, "FOO"), (6, "BAR")]);
let mut scanner = Scanner::new("foo", classes, transitions, accepting_states);
let result = scanner.next_word();
assert_eq!(
result,
Ok(Token {
span: Span::from_start_end(0, 3),
kind: "FOO",
})
);
let mut scanner = scanner.reuse("bar");
let result = scanner.next_word();
assert_eq!(
result,
Ok(Token {
span: Span::from_start_end(0, 3),
kind: "BAR",
})
);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment