Skip to content

Instantly share code, notes, and snippets.

@worldOneo
Last active February 15, 2022 22:46
Show Gist options
  • Save worldOneo/838a56c70840f9cdfaacaee416805477 to your computer and use it in GitHub Desktop.
Save worldOneo/838a56c70840f9cdfaacaee416805477 to your computer and use it in GitHub Desktop.
FuzzySearch engine using BK-Tree written in Rust.
use std::fmt::Debug;
use std::io::{stdout, Write};
use std::{collections::HashMap, env, fs, io, time::Instant};
#[derive(Debug)]
struct FuzzyNode<T>
where
T: Clone,
{
value: Option<(String, T)>,
children: HashMap<i64, FuzzyNode<T>>,
}
impl<T: Clone> FuzzyNode<T> {
pub fn new() -> FuzzyNode<T> {
FuzzyNode {
value: None,
children: HashMap::new(),
}
}
pub fn insert(&mut self, key: String, item: T) {
match &self.value {
Some((val, _)) => {
let dist = strsim::levenshtein(val, &key);
if dist == 0 {
return;
};
self.insert_dist(key, item, dist as i64)
}
None => self.value = Some((key, item)),
}
}
pub fn search(&self, query: &String, results: &mut Vec<T>, max_results: i64, span: i64) {
let mut new_len = results.len() as i64;
let mut depth = 0;
let mut exhausted = false;
while new_len < max_results && new_len < max_results && !exhausted {
exhausted = self.search_depth(query, results, max_results, depth, span);
depth += 1;
new_len = results.len() as i64;
}
}
fn search_depth(
&self,
query: &String,
results: &mut Vec<T>,
max_results: i64,
depth: i64,
span: i64,
) -> bool {
match &self.value {
None => panic!("Search in invalid state"),
Some((value, item)) => {
let dist = strsim::levenshtein(value, query) as i64;
if depth == 0 {
if dist <= span {
results.push(item.clone());
}
return false;
}
let mut scratched = 0;
let mut diff = 0;
loop {
let i = dist + diff;
if results.len() as i64 >= max_results {
return true;
}
let child = self.children.get(&i);
if child.is_some() {
if child
.unwrap()
.search_depth(query, results, max_results, depth - 1, span)
{
scratched += 1;
}
} else {
scratched += 1;
}
if diff < 0 {
diff = -diff + 1;
} else if diff == 0 {
diff = 1;
} else {
diff = -diff;
}
if diff > span {
break;
}
}
return scratched >= span * 2;
}
}
}
fn insert_dist(&mut self, key: String, item: T, dist: i64) {
let entry = self.children.entry(dist);
let child = entry.or_insert_with(|| FuzzyNode::new());
child.insert(key, item)
}
}
fn main() {
let mut args = env::args();
if args.len() < 2 {
print!("File to index must be entered");
return;
}
let mut tree = FuzzyNode::new();
let mut now = Instant::now();
let file = fs::read_to_string(args.nth(1).unwrap()).expect("Couldn't open file");
let lines: Vec<&str> = file.split("\n").map(|s| s.trim()).collect();
let line_len = lines.len();
for (i, line) in lines.iter().enumerate() {
if i % 10_000 == 0 {
println!("{} = {}", i, line);
}
tree.insert(line.to_string(), line);
}
let milllis = now.elapsed().as_millis();
println!(
"Indexed {} items in {}ms ({}/ms)",
line_len,
milllis,
line_len as f64 / milllis as f64
);
loop {
print!("> ");
stdout().flush().unwrap();
let mut query = String::new();
io::stdin().read_line(&mut query).unwrap();
let mut results = vec![];
now = Instant::now();
tree.search(&query.trim().to_string(), &mut results, 20, 2);
println!(
"Found {} results in {}ms",
results.len(),
now.elapsed().as_micros() as f64 / 1000.0
);
for (i, res) in results.iter().enumerate() {
println!("{i:0>2}) {res}", i = &i, res = res)
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment