Last active
February 15, 2022 22:46
-
-
Save worldOneo/838a56c70840f9cdfaacaee416805477 to your computer and use it in GitHub Desktop.
FuzzySearch engine using BK-Tree written in Rust.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::fmt::Debug; | |
use std::io::{stdout, Write}; | |
use std::{collections::HashMap, env, fs, io, time::Instant}; | |
#[derive(Debug)] | |
struct FuzzyNode<T> | |
where | |
T: Clone, | |
{ | |
value: Option<(String, T)>, | |
children: HashMap<i64, FuzzyNode<T>>, | |
} | |
impl<T: Clone> FuzzyNode<T> { | |
pub fn new() -> FuzzyNode<T> { | |
FuzzyNode { | |
value: None, | |
children: HashMap::new(), | |
} | |
} | |
pub fn insert(&mut self, key: String, item: T) { | |
match &self.value { | |
Some((val, _)) => { | |
let dist = strsim::levenshtein(val, &key); | |
if dist == 0 { | |
return; | |
}; | |
self.insert_dist(key, item, dist as i64) | |
} | |
None => self.value = Some((key, item)), | |
} | |
} | |
pub fn search(&self, query: &String, results: &mut Vec<T>, max_results: i64, span: i64) { | |
let mut new_len = results.len() as i64; | |
let mut depth = 0; | |
let mut exhausted = false; | |
while new_len < max_results && new_len < max_results && !exhausted { | |
exhausted = self.search_depth(query, results, max_results, depth, span); | |
depth += 1; | |
new_len = results.len() as i64; | |
} | |
} | |
fn search_depth( | |
&self, | |
query: &String, | |
results: &mut Vec<T>, | |
max_results: i64, | |
depth: i64, | |
span: i64, | |
) -> bool { | |
match &self.value { | |
None => panic!("Search in invalid state"), | |
Some((value, item)) => { | |
let dist = strsim::levenshtein(value, query) as i64; | |
if depth == 0 { | |
if dist <= span { | |
results.push(item.clone()); | |
} | |
return false; | |
} | |
let mut scratched = 0; | |
let mut diff = 0; | |
loop { | |
let i = dist + diff; | |
if results.len() as i64 >= max_results { | |
return true; | |
} | |
let child = self.children.get(&i); | |
if child.is_some() { | |
if child | |
.unwrap() | |
.search_depth(query, results, max_results, depth - 1, span) | |
{ | |
scratched += 1; | |
} | |
} else { | |
scratched += 1; | |
} | |
if diff < 0 { | |
diff = -diff + 1; | |
} else if diff == 0 { | |
diff = 1; | |
} else { | |
diff = -diff; | |
} | |
if diff > span { | |
break; | |
} | |
} | |
return scratched >= span * 2; | |
} | |
} | |
} | |
fn insert_dist(&mut self, key: String, item: T, dist: i64) { | |
let entry = self.children.entry(dist); | |
let child = entry.or_insert_with(|| FuzzyNode::new()); | |
child.insert(key, item) | |
} | |
} | |
fn main() { | |
let mut args = env::args(); | |
if args.len() < 2 { | |
print!("File to index must be entered"); | |
return; | |
} | |
let mut tree = FuzzyNode::new(); | |
let mut now = Instant::now(); | |
let file = fs::read_to_string(args.nth(1).unwrap()).expect("Couldn't open file"); | |
let lines: Vec<&str> = file.split("\n").map(|s| s.trim()).collect(); | |
let line_len = lines.len(); | |
for (i, line) in lines.iter().enumerate() { | |
if i % 10_000 == 0 { | |
println!("{} = {}", i, line); | |
} | |
tree.insert(line.to_string(), line); | |
} | |
let milllis = now.elapsed().as_millis(); | |
println!( | |
"Indexed {} items in {}ms ({}/ms)", | |
line_len, | |
milllis, | |
line_len as f64 / milllis as f64 | |
); | |
loop { | |
print!("> "); | |
stdout().flush().unwrap(); | |
let mut query = String::new(); | |
io::stdin().read_line(&mut query).unwrap(); | |
let mut results = vec![]; | |
now = Instant::now(); | |
tree.search(&query.trim().to_string(), &mut results, 20, 2); | |
println!( | |
"Found {} results in {}ms", | |
results.len(), | |
now.elapsed().as_micros() as f64 / 1000.0 | |
); | |
for (i, res) in results.iter().enumerate() { | |
println!("{i:0>2}) {res}", i = &i, res = res) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment