Skip to content

Instantly share code, notes, and snippets.

@Nekrolm
Created December 21, 2024 22:41
Show Gist options
  • Save Nekrolm/d08566836b0e342bca39a218d60c2a58 to your computer and use it in GitHub Desktop.
Save Nekrolm/d08566836b0e342bca39a218d60c2a58 to your computer and use it in GitHub Desktop.
Rust Levenshtein otimizations?
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb cccccccccccccccccccccccccccccccccccccccccccccccccccc ddddddddddddddd eeeeeeeeeeeeeeeee ffffffffffffff ggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhh iiiiiiiiiiiiiiiiiiiiiiiiiiiii jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk llllllllllllllllllllllllllllllllllllllllllllll mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn ooooooooooooooooooooooooooooooooo pppppppppppppppppppppppppppppppppppppppppp qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq rrrrrrrrrrrrrrrrrrrrrrrrrrrrr ssssssssssssssssssssssssssssss tttttttttttttttttttttttttt uuuuuuuuuuuuuuuuuuuuuuuuu vvvvvvvvvvvvvvvvvvvvvvvvvvvvv wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm 1234567890 QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm1234567890 mnxbckfjquwhduywetdftfqrcfgzbxmcsdlsppweuouhftvjcsmnbsdvuqtfvcjbxmcnbkaoqwerpweiurigjhdfjbmsbjgqiuyeiruyoisbnsvuyfguweygrhoaiosjdkjhfbsjhdvfkqjnoi gaywgeudgaewytfvajsdvfjahsdvfjhsdfjhgsfjhqowiueryiuweytruiyytfyqvhgvzxncbsidfjdflsdfpwoeiriuwheuyfvydtvqyuygbhskdjfbkajbuywgduywegfvsnmwehriygwuerygwerdfjhsdfsdkfhbsjdhfjuavufywefhuwlihfoqwehqrwheiruyuyqgruwieqrhsdjhfbjwhvdfisdufhikquhweifgbwfisdhfksjdhfiwuh ASDFHWERFIWSHHSDFJWERPWFOAHDSDFXMCVNWMERKJQEOQIWRUQIHJSBFSDFKJSDFOWIERIOU werkjhiauwryisydgcfkjsdfkjsjdhfguiyqgtfsdfSDGFDSFGwsrhqwigdeyiwefDFBdkqedfgdFHGHKJiFHTYRGFsefjhwhsgdfjhhjsdfDWEfsdfWEFfbgFGjYuioIOpvbnVBNSDFadfsSDFwegsdgfAWFDFGDFghjTyIGHJREGFsddqwdsdfweaWQAZZSDGgnlpHmHJMgkOYUTDFGSFSDFSDFDHGDFGSDFDGRFjhbjshdgfjhgsdfSDGDFG kjsdfkhgwieyfguwsyefgsdfSDFGJSDAKFDSAFIRFYUDSFHSBVXCVBNMSDKFJWOYSDFHKADSDnfsdfjbjshdbfkwjbfksdbfhwbkyguygyfshbcdmnxbvcmnsdkfsdfsdflspflwoekorwueiruygwuygjshbvnbvzcjsuhfdiuhsdfghkwjhdfiwegfjdhsgfbksjhfksdhgfhsdhfghfgfgfsjdjfhkwjehoueuq abcdefghijklmnopqrstuvwxyz thistringisnottoolong howcanwemakeabetterstringforcomparing absfeiihdkkeiurmnslkspoiqwrjlnjna jsfjlqpowiuewriugbsdmfasgdfqsfwwiewurmxbcaslkdjp qwerdsdftffygfuhbuhiok sfhdgafsdhafsdjaweqwueytuqwyefgagsdgfasdfajfgsieufowerpoafposdfmnxbcvjlkhhgdfkjasfguysdfuayfgjhvf ygiuwoqupqowei sjxvkzjha hfgauyiuyrwer sdf dsfgsdfdfg dfgdswefdsdgd ffhfghsdfghsdfeuryiwuyeriudhfmnsbmnbvsgdfkuweifgwefgisdgf sdfskdfhgksugfweriuwersdkfjbsewoir sfdqrieyft sdfnueyrtuwyerowierpoipmvmcxnvmnbsdnfbajeygruywgefugsdbaqwriuweuywiuyer dsfjgfuywegfugsufgqfgdjsdhfjhsdgfhsgdfiweoruoweruwer sdfhyairuywoerfuetuopiufudsvxcbvmsdflksfwpeoriiueief dsfygweurytwueyr sdfshfgwegfisfisdgfowuyeridfs sdfhsdfgsudyfguwyef sdfjsgdfysgdfuwgueyfs dfsgdfusgdfiweoriuwqoiru ygdifuwuoerupweurpquerowehmnxcbvmnxbvjcquqyerypeiproweur fhsdgfygsdfuygsudfgufrpesifgsdfshdfbuwyveugfud fefgiuyegsiufygdfigreoquyarcfdbnmnsbfdlsdfoiweprisdfqbyefg sdfugwuefgusdygfuygwdufytwuqytueryt efsgdvfuysgfdusydfuywguyeryoqurwreiueieieiiii
#![no_main]
/// Calculates the Levenshtein distance between two strings using Wagner-Fischer algorithm
/// Space Complexity: O(min(m,n)) - only uses two rows instead of full matrix
/// Time Complexity: O(m*n) where m and n are the lengths of the input strings
fn levenshtein_distance(s1: &[u8], s2: &[u8], buffer: &mut Vec<usize>) -> usize {
let (s1, s2) = if s1.len() > s2.len() {
(s2, s1)
} else {
(s1, s2)
};
buffer.resize(2 * s1.len() + 2, 0);
let (prev_row, curr_row) = unsafe { buffer.split_at_mut_unchecked(s1.len() + 1) };
for (i, x) in prev_row.iter_mut().enumerate() {
*x = i;
}
s2.iter()
.enumerate()
.fold(
(0, curr_row, prev_row),
|(_, curr_row, prev_row), (j, &s2_c)| {
let result = {
let (init, curr_row) = unsafe { curr_row.split_first_mut().unwrap_unchecked() };
*init = j + 1;
let prevs = prev_row.windows(2);
prevs
.enumerate()
.fold(*init, |curr_row_i, (i, prevs)| {
let prev_row_i = unsafe { *prevs.get_unchecked(0) };
let prev_row_i_1 = unsafe { *prevs.get_unchecked(1) };
let s1_c = unsafe { *s1.get_unchecked(i) };
let cost = (s1_c != s2_c) as usize;
let curr_row_i_1 = (curr_row_i + 1)
.min(prev_row_i_1 + 1)
.min(prev_row_i + cost);
unsafe {
*curr_row.get_unchecked_mut(i) = curr_row_i_1;
}
curr_row_i_1
})
};
(result, prev_row, curr_row)
},
)
.0
}
#[unsafe(no_mangle)]
extern "C" fn main(argc: i32, args: *const *const i8) -> i32 {
let timer = std::time::Instant::now();
let Some((_, args)) = unsafe { std::slice::from_raw_parts(args, argc as usize) }.split_first()
else {
return 0;
};
if args.len() < 2 {
println!("Please provide at least two strings as arguments.");
return 0;
}
let mut min_distance = None;
let mut times = 0;
let args: Vec<&[u8]> = args
.into_iter()
.map(|&s| unsafe { std::ffi::CStr::from_ptr(s) }.to_bytes())
.collect();
let mut buffer = Vec::with_capacity(256);
// Compare all pairs of strings
for i in 0..args.len() {
for j in 0..args.len() {
if i != j {
let distance = levenshtein_distance(args[i], args[j], &mut buffer);
if let Some(current_min) = min_distance {
if distance < current_min {
min_distance = Some(distance);
}
} else {
min_distance = Some(distance);
}
times += 1;
}
}
}
println!("elapsed: {}ms", timer.elapsed().as_millis());
println!("times: {}", times);
println!("min_distance: {}", min_distance.unwrap_or(usize::MAX));
0
}
// Original code
// use std::env;
// /// Calculates the Levenshtein distance between two strings using Wagner-Fischer algorithm
// /// Space Complexity: O(min(m,n)) - only uses two rows instead of full matrix
// /// Time Complexity: O(m*n) where m and n are the lengths of the input strings
// fn levenshtein_distance(s1: &str, s2: &str) -> usize {
// // Early termination checks
// if s1 == s2 {
// return 0;
// }
// if s1.is_empty() {
// return s2.len();
// }
// if s2.is_empty() {
// return s1.len();
// }
// // Convert to bytes for faster access
// let s1_bytes = s1.as_bytes();
// let s2_bytes = s2.as_bytes();
// // Make s1 the shorter string for space optimization
// let (s1_bytes, s2_bytes) = if s1_bytes.len() > s2_bytes.len() {
// (s2_bytes, s1_bytes)
// } else {
// (s1_bytes, s2_bytes)
// };
// let m = s1_bytes.len();
// let n = s2_bytes.len();
// // Use two rows instead of full matrix for space optimization
// let mut prev_row = Vec::with_capacity(m + 1);
// let mut curr_row = Vec::with_capacity(m + 1);
// // Initialize first row
// prev_row.extend(0..=m);
// curr_row.resize(m + 1, 0);
// // Main computation loop
// for j in 1..=n {
// curr_row[0] = j;
// for i in 1..=m {
// let cost = if s1_bytes[i - 1] == s2_bytes[j - 1] { 0 } else { 1 };
// // Calculate minimum of three operations
// curr_row[i] = std::cmp::min(
// std::cmp::min(
// prev_row[i] + 1, // deletion
// curr_row[i - 1] + 1, // insertion
// ),
// prev_row[i - 1] + cost // substitution
// );
// }
// // Swap rows
// std::mem::swap(&mut prev_row, &mut curr_row);
// }
// prev_row[m]
// }
// fn main() {
// let timer = std::time::Instant::now();
// let args: Vec<String> = env::args().skip(1).collect();
// if args.len() < 2 {
// println!("Please provide at least two strings as arguments.");
// return;
// }
// let mut min_distance = None;
// let mut times = 0;
// // Compare all pairs of strings
// for i in 0..args.len() {
// for j in 0..args.len() {
// if i != j {
// let distance = levenshtein_distance(&args[i], &args[j]);
// if let Some(current_min) = min_distance {
// if distance < current_min {
// min_distance = Some(distance);
// }
// } else {
// min_distance = Some(distance);
// }
// times += 1;
// }
// }
// }
// println!("elapsed: {}ms", timer.elapsed().as_millis());
// println!("times: {}", times);
// println!("min_distance: {}", min_distance.unwrap_or(usize::MAX));
// }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment