Last active
August 12, 2023 18:20
-
-
Save matthewjberger/830fb8540bc3e74286fb673b550f8b58 to your computer and use it in GitHub Desktop.
Fuzzy search string lists in rust using Levehnstein distance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fn levenshtein(a: &str, b: &str) -> usize { | |
let (a_len, b_len) = (a.chars().count(), b.chars().count()); | |
let mut matrix = vec![vec![0; b_len + 1]; a_len + 1]; | |
for i in 0..=a_len { | |
matrix[i][0] = i; | |
} | |
for j in 0..=b_len { | |
matrix[0][j] = j; | |
} | |
for (i, ca) in a.chars().enumerate() { | |
for (j, cb) in b.chars().enumerate() { | |
let cost = if ca == cb { 0 } else { 1 }; | |
let delete = matrix[i][j + 1] + 1; | |
let insert = matrix[i + 1][j] + 1; | |
let substitute = matrix[i][j] + cost; | |
matrix[i + 1][j + 1] = std::cmp::min(delete, std::cmp::min(insert, substitute)); | |
} | |
} | |
matrix[a_len][b_len] | |
} | |
fn fuzzy_search<'a>(query: &str, data: &'a [&'a str], max_distance: usize) -> Vec<&'a str> { | |
data.iter() | |
.filter(|&&candidate| levenshtein(query, candidate) <= max_distance) | |
.cloned() | |
.collect() | |
} | |
#[test] | |
fn test_fuzzy_search() { | |
let data = [ | |
"apple", | |
"banana", | |
"cherry", | |
"date", | |
"grape", | |
"blueberry", | |
"apricot", | |
]; | |
let query = "aple"; | |
let results = fuzzy_search(query, &data, 1); | |
assert_eq!(results, vec!["apple"]); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment