Created
December 21, 2024 22:41
-
-
Save Nekrolm/d08566836b0e342bca39a218d60c2a58 to your computer and use it in GitHub Desktop.
Rust Levenshtein otimizations?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb cccccccccccccccccccccccccccccccccccccccccccccccccccc ddddddddddddddd eeeeeeeeeeeeeeeee ffffffffffffff ggggggggggg hhhhhhhhhhhhhhhhhhhhhhhhhhhhh iiiiiiiiiiiiiiiiiiiiiiiiiiiii jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk llllllllllllllllllllllllllllllllllllllllllllll mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn ooooooooooooooooooooooooooooooooo pppppppppppppppppppppppppppppppppppppppppp qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq rrrrrrrrrrrrrrrrrrrrrrrrrrrrr ssssssssssssssssssssssssssssss tttttttttttttttttttttttttt uuuuuuuuuuuuuuuuuuuuuuuuu vvvvvvvvvvvvvvvvvvvvvvvvvvvvv wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm 1234567890 QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm1234567890 mnxbckfjquwhduywetdftfqrcfgzbxmcsdlsppweuouhftvjcsmnbsdvuqtfvcjbxmcnbkaoqwerpweiurigjhdfjbmsbjgqiuyeiruyoisbnsvuyfguweygrhoaiosjdkjhfbsjhdvfkqjnoi gaywgeudgaewytfvajsdvfjahsdvfjhsdfjhgsfjhqowiueryiuweytruiyytfyqvhgvzxncbsidfjdflsdfpwoeiriuwheuyfvydtvqyuygbhskdjfbkajbuywgduywegfvsnmwehriygwuerygwerdfjhsdfsdkfhbsjdhfjuavufywefhuwlihfoqwehqrwheiruyuyqgruwieqrhsdjhfbjwhvdfisdufhikquhweifgbwfisdhfksjdhfiwuh ASDFHWERFIWSHHSDFJWERPWFOAHDSDFXMCVNWMERKJQEOQIWRUQIHJSBFSDFKJSDFOWIERIOU werkjhiauwryisydgcfkjsdfkjsjdhfguiyqgtfsdfSDGFDSFGwsrhqwigdeyiwefDFBdkqedfgdFHGHKJiFHTYRGFsefjhwhsgdfjhhjsdfDWEfsdfWEFfbgFGjYuioIOpvbnVBNSDFadfsSDFwegsdgfAWFDFGDFghjTyIGHJREGFsddqwdsdfweaWQAZZSDGgnlpHmHJMgkOYUTDFGSFSDFSDFDHGDFGSDFDGRFjhbjshdgfjhgsdfSDGDFG kjsdfkhgwieyfguwsyefgsdfSDFGJSDAKFDSAFIRFYUDSFHSBVXCVBNMSDKFJWOYSDFHKADSDnfsdfjbjshdbfkwjbfksdbfhwbkyguygyfshbcdmnxbvcmnsdkfsdfsdflspflwoekorwueiruygwuygjshbvnbvzcjsuhfdiuhsdfghkwjhdfiwegfjdhsgfbksjhfksdhgfhsdhfghfgfgfsjdjfhkwjehoueuq abcdefghijklmnopqrstuvwxyz thistringisnottoolong howcanwemakeabetterstringforcomparing absfeiihdkkeiurmnslkspoiqwrjlnjna jsfjlqpowiuewriugbsdmfasgdfqsfwwiewurmxbcaslkdjp qwerdsdftffygfuhbuhiok sfhdgafsdhafsdjaweqwueytuqwyefgagsdgfasdfajfgsieufowerpoafposdfmnxbcvjlkhhgdfkjasfguysdfuayfgjhvf ygiuwoqupqowei sjxvkzjha hfgauyiuyrwer sdf dsfgsdfdfg dfgdswefdsdgd ffhfghsdfghsdfeuryiwuyeriudhfmnsbmnbvsgdfkuweifgwefgisdgf sdfskdfhgksugfweriuwersdkfjbsewoir sfdqrieyft sdfnueyrtuwyerowierpoipmvmcxnvmnbsdnfbajeygruywgefugsdbaqwriuweuywiuyer dsfjgfuywegfugsufgqfgdjsdhfjhsdgfhsgdfiweoruoweruwer sdfhyairuywoerfuetuopiufudsvxcbvmsdflksfwpeoriiueief dsfygweurytwueyr sdfshfgwegfisfisdgfowuyeridfs sdfhsdfgsudyfguwyef sdfjsgdfysgdfuwgueyfs dfsgdfusgdfiweoriuwqoiru ygdifuwuoerupweurpquerowehmnxcbvmnxbvjcquqyerypeiproweur fhsdgfygsdfuygsudfgufrpesifgsdfshdfbuwyveugfud fefgiuyegsiufygdfigreoquyarcfdbnmnsbfdlsdfoiweprisdfqbyefg sdfugwuefgusdygfuygwdufytwuqytueryt efsgdvfuysgfdusydfuywguyeryoqurwreiueieieiiii |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![no_main] | |
/// Calculates the Levenshtein distance between two strings using Wagner-Fischer algorithm | |
/// Space Complexity: O(min(m,n)) - only uses two rows instead of full matrix | |
/// Time Complexity: O(m*n) where m and n are the lengths of the input strings | |
fn levenshtein_distance(s1: &[u8], s2: &[u8], buffer: &mut Vec<usize>) -> usize { | |
let (s1, s2) = if s1.len() > s2.len() { | |
(s2, s1) | |
} else { | |
(s1, s2) | |
}; | |
buffer.resize(2 * s1.len() + 2, 0); | |
let (prev_row, curr_row) = unsafe { buffer.split_at_mut_unchecked(s1.len() + 1) }; | |
for (i, x) in prev_row.iter_mut().enumerate() { | |
*x = i; | |
} | |
s2.iter() | |
.enumerate() | |
.fold( | |
(0, curr_row, prev_row), | |
|(_, curr_row, prev_row), (j, &s2_c)| { | |
let result = { | |
let (init, curr_row) = unsafe { curr_row.split_first_mut().unwrap_unchecked() }; | |
*init = j + 1; | |
let prevs = prev_row.windows(2); | |
prevs | |
.enumerate() | |
.fold(*init, |curr_row_i, (i, prevs)| { | |
let prev_row_i = unsafe { *prevs.get_unchecked(0) }; | |
let prev_row_i_1 = unsafe { *prevs.get_unchecked(1) }; | |
let s1_c = unsafe { *s1.get_unchecked(i) }; | |
let cost = (s1_c != s2_c) as usize; | |
let curr_row_i_1 = (curr_row_i + 1) | |
.min(prev_row_i_1 + 1) | |
.min(prev_row_i + cost); | |
unsafe { | |
*curr_row.get_unchecked_mut(i) = curr_row_i_1; | |
} | |
curr_row_i_1 | |
}) | |
}; | |
(result, prev_row, curr_row) | |
}, | |
) | |
.0 | |
} | |
#[unsafe(no_mangle)] | |
extern "C" fn main(argc: i32, args: *const *const i8) -> i32 { | |
let timer = std::time::Instant::now(); | |
let Some((_, args)) = unsafe { std::slice::from_raw_parts(args, argc as usize) }.split_first() | |
else { | |
return 0; | |
}; | |
if args.len() < 2 { | |
println!("Please provide at least two strings as arguments."); | |
return 0; | |
} | |
let mut min_distance = None; | |
let mut times = 0; | |
let args: Vec<&[u8]> = args | |
.into_iter() | |
.map(|&s| unsafe { std::ffi::CStr::from_ptr(s) }.to_bytes()) | |
.collect(); | |
let mut buffer = Vec::with_capacity(256); | |
// Compare all pairs of strings | |
for i in 0..args.len() { | |
for j in 0..args.len() { | |
if i != j { | |
let distance = levenshtein_distance(args[i], args[j], &mut buffer); | |
if let Some(current_min) = min_distance { | |
if distance < current_min { | |
min_distance = Some(distance); | |
} | |
} else { | |
min_distance = Some(distance); | |
} | |
times += 1; | |
} | |
} | |
} | |
println!("elapsed: {}ms", timer.elapsed().as_millis()); | |
println!("times: {}", times); | |
println!("min_distance: {}", min_distance.unwrap_or(usize::MAX)); | |
0 | |
} | |
// Original code | |
// use std::env; | |
// /// Calculates the Levenshtein distance between two strings using Wagner-Fischer algorithm | |
// /// Space Complexity: O(min(m,n)) - only uses two rows instead of full matrix | |
// /// Time Complexity: O(m*n) where m and n are the lengths of the input strings | |
// fn levenshtein_distance(s1: &str, s2: &str) -> usize { | |
// // Early termination checks | |
// if s1 == s2 { | |
// return 0; | |
// } | |
// if s1.is_empty() { | |
// return s2.len(); | |
// } | |
// if s2.is_empty() { | |
// return s1.len(); | |
// } | |
// // Convert to bytes for faster access | |
// let s1_bytes = s1.as_bytes(); | |
// let s2_bytes = s2.as_bytes(); | |
// // Make s1 the shorter string for space optimization | |
// let (s1_bytes, s2_bytes) = if s1_bytes.len() > s2_bytes.len() { | |
// (s2_bytes, s1_bytes) | |
// } else { | |
// (s1_bytes, s2_bytes) | |
// }; | |
// let m = s1_bytes.len(); | |
// let n = s2_bytes.len(); | |
// // Use two rows instead of full matrix for space optimization | |
// let mut prev_row = Vec::with_capacity(m + 1); | |
// let mut curr_row = Vec::with_capacity(m + 1); | |
// // Initialize first row | |
// prev_row.extend(0..=m); | |
// curr_row.resize(m + 1, 0); | |
// // Main computation loop | |
// for j in 1..=n { | |
// curr_row[0] = j; | |
// for i in 1..=m { | |
// let cost = if s1_bytes[i - 1] == s2_bytes[j - 1] { 0 } else { 1 }; | |
// // Calculate minimum of three operations | |
// curr_row[i] = std::cmp::min( | |
// std::cmp::min( | |
// prev_row[i] + 1, // deletion | |
// curr_row[i - 1] + 1, // insertion | |
// ), | |
// prev_row[i - 1] + cost // substitution | |
// ); | |
// } | |
// // Swap rows | |
// std::mem::swap(&mut prev_row, &mut curr_row); | |
// } | |
// prev_row[m] | |
// } | |
// fn main() { | |
// let timer = std::time::Instant::now(); | |
// let args: Vec<String> = env::args().skip(1).collect(); | |
// if args.len() < 2 { | |
// println!("Please provide at least two strings as arguments."); | |
// return; | |
// } | |
// let mut min_distance = None; | |
// let mut times = 0; | |
// // Compare all pairs of strings | |
// for i in 0..args.len() { | |
// for j in 0..args.len() { | |
// if i != j { | |
// let distance = levenshtein_distance(&args[i], &args[j]); | |
// if let Some(current_min) = min_distance { | |
// if distance < current_min { | |
// min_distance = Some(distance); | |
// } | |
// } else { | |
// min_distance = Some(distance); | |
// } | |
// times += 1; | |
// } | |
// } | |
// } | |
// println!("elapsed: {}ms", timer.elapsed().as_millis()); | |
// println!("times: {}", times); | |
// println!("min_distance: {}", min_distance.unwrap_or(usize::MAX)); | |
// } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment