/*
Ratcliff/Obershelp distance
SEE tokenizer

[NIST Ratcliff/Obershelp pattern recognition](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html):
Compute the similarity of two strings as twice the number of matching characters divided by the total number of characters in the two strings.
Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched regions on either side of the longest common subsequence.
A sketch of folding the matching blocks into this ratio follows matching_blocks below.
*/
/*
// use this to replace longest_common_substring
// Return start of common substring in s1, start of common substring in s2, and length of substring
// Indexes refer to character number, not index (differ for Unicode strings)
fn longest_common_substring() -> (usize, usize, usize) {
    //https://github.com/matthieugomez/StringDistances.jl/blob/2834265e96f18993a98a57d97e9f27a450a161d1/src/distances/RatcliffObershelp.jl#L3
    (0, 0, 0)
}
*/
fn longest_common_substring<'a>(
    shorter: &'a str,
    longer: &'a str,
    low1: usize,
    high1: usize,
    low2: usize,
    high2: usize,
) -> (usize, usize, usize) {
    // Offsets and lengths here are byte-based; slicing panics if they fall off UTF-8 boundaries.
    let longsub = &longer[low2..high2];
    let slen = high1 - low1;
    // Try the longest candidates first. The upper bound must be `slen`, not `slen + 1`,
    // otherwise `slen - size` underflows on the first iteration.
    for size in (1..=slen).rev() {
        for start in 0..=slen - size {
            let substr = &shorter[low1 + start..low1 + start + size];
            // The first occurrence of the candidate in the longer slice wins.
            if let Some((startb, matchstr)) = longsub.match_indices(substr).next() {
                return (low1 + start, low2 + startb, matchstr.len());
            }
        }
    }
    (low1, low2, 0)
}
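// Not part of the original gist: a small illustrative check of the helper above, assuming
// ASCII input so the byte-based offsets coincide with character positions. "dog" is the
// longest run shared by the two strings, starting at byte 0 of the shorter and byte 9 of the longer.
#[cfg(test)]
mod longest_common_substring_example {
    use super::*;

    #[test]
    fn finds_longest_shared_run() {
        assert_eq!(
            longest_common_substring("dog", "the lazy dog", 0, 3, 0, 12),
            (0, 9, 3)
        );
    }
}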
/*
// use this to replace matching_blocks
fn matching_blocks() -> Vec<(usize, usize, usize)> {
    //https://github.com/matthieugomez/StringDistances.jl/blob/2834265e96f18993a98a57d97e9f27a450a161d1/src/distances/RatcliffObershelp.jl#L31
    vec![(0, 0, 0)]
}
*/
pub fn matching_blocks<'a>(shorter: &'a str, longer: &'a str) -> Vec<(usize, usize, usize)> {
    let (len1, len2) = (shorter.len(), longer.len());
    let mut queue: Vec<(usize, usize, usize, usize)> = vec![(0, len1, 0, len2)];
    let mut matching_blocks = Vec::new();
    while let Some((low1, high1, low2, high2)) = queue.pop() {
        let (i, j, k) = longest_common_substring(shorter, longer, low1, high1, low2, high2);
        if k != 0 {
            matching_blocks.push((i, j, k));
            if low1 < i && low2 < j {
                queue.push((low1, i, low2, j));
            }
            if i + k < high1 && j + k < high2 {
                queue.push((i + k, high1, j + k, high2));
            }
        }
    }
    // The LIFO queue above discovers blocks out of order, so sort before merging;
    // the adjacency check below assumes blocks arrive in ascending order.
    matching_blocks.sort();
    let (mut i1, mut j1, mut k1) = (0, 0, 0);
    let mut non_adjacent = Vec::new();
    for (i2, j2, k2) in matching_blocks {
        if i1 + k1 == i2 && j1 + k1 == j2 {
            // Adjacent to the previous block in both strings: extend it.
            k1 += k2;
        } else {
            if k1 != 0 {
                non_adjacent.push((i1, j1, k1));
            }
            i1 = i2;
            j1 = j2;
            k1 = k2;
        }
    }
    if k1 != 0 {
        non_adjacent.push((i1, j1, k1));
    }
    // Terminal sentinel block (length 0) marking the end of both strings.
    non_adjacent.push((len1, len2, 0));
    non_adjacent
}
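// Not part of the original gist: a minimal sketch of how the blocks above could be folded
// into the Ratcliff/Obershelp ratio described at the top of this file, 2*M / (len1 + len2).
// The function name `ratcliff_obershelp` and the 0..=100 u8 scale are assumptions,
// not the original author's verified API.
pub fn ratcliff_obershelp(s1: &str, s2: &str) -> u8 {
    // Order the arguments so the shorter string is searched inside the longer one.
    let (shorter, longer) = if s1.len() <= s2.len() { (s1, s2) } else { (s2, s1) };
    let matched: usize = matching_blocks(shorter, longer)
        .iter()
        .map(|&(_, _, k)| k) // sum of matched block lengths; the (len1, len2, 0) sentinel adds nothing
        .sum();
    let total = shorter.len() + longer.len();
    if total == 0 {
        return 100; // treat two empty strings as identical
    }
    ((200 * matched) / total) as u8 // 2*M/T scaled to 0..=100
}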
/*
token_sort evaluates similarity of the longest substrings (Ratcliff/Obershelp) based on the sorted order of the tokens.
Pass in two strings, the sorter, and the similarity:
    token_sort(s1, s2, &TokenCmp::new_sort, &TokenCmp::partial_similarity)
    token_sort(s1, s2, &TokenCmp::new_sort_join, &TokenCmp::similarity)
'new_sort' concatenates by default (no whitespace in the evaluated strings); 'new_sort_join' joins with " ".
*/
pub fn token_sort<'a>(
    t1: &'a str,
    t2: &'a str,
    sorter: &dyn Fn(
        std::vec::Vec<std::borrow::Cow<'a, str>>,
        std::vec::Vec<std::borrow::Cow<'a, str>>,
    ) -> TokenCmp<'a>,
    rat: &dyn Fn(&TokenCmp<'a>) -> u8,
) -> u8 {
    let an = AlphaNumericTokenizer; // NOT INCLUDED IN THIS GIST
    rat(&sorter(an.sequencer(t1), an.sequencer(t2)))
}
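// Hypothetical usage, mirroring the doc comment above; TokenCmp and AlphaNumericTokenizer
// live outside this gist, so the score scale is not verified here:
//   let score = token_sort("fuzzy wuzzy was a bear", "wuzzy fuzzy bear was a",
//                          &TokenCmp::new_sort, &TokenCmp::similarity);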
/*
token_set evaluates similarity of the longest substrings (Ratcliff/Obershelp) based on the intersection and union of the token sets.
Pass in two strings and the similarity:
    token_set(s1, s2, &TokenCmp::partial_similarity)
    token_set(s1, s2, &TokenCmp::similarity)
*/
pub fn token_set<'a>(s1: &'a str, s2: &'a str, rat: &dyn Fn(&TokenCmp<'a>) -> u8) -> u8 {
    let an = AlphaNumericTokenizer;
    let (p1, p2) = (an.sequencer(s1), an.sequencer(s2));
    let mut s1_i_s2 = p1.intersect(p2.clone());
    let mut s1q = p1.uniq(p2.clone()); //diff1to2
    let mut s2q = p2.uniq(p1.clone()); //diff2to1
    s1_i_s2.sort();
    s1q.sort();
    s2q.sort();
    let s1_i_s2_u_s1q = s1_i_s2.union(s1q.clone()); //combined_1to2
    let s1_i_s2_u_s2q = s1_i_s2.union(s2q.clone()); //combined_2to1
    // Best of the three pairwise comparisons: intersection vs. each combined set, and the two combined sets.
    vec![
        rat(&TokenCmp::new_set(s1_i_s2.clone(), s1_i_s2_u_s1q.clone())),
        rat(&TokenCmp::new_set(s1_i_s2, s1_i_s2_u_s2q.clone())),
        rat(&TokenCmp::new_set(s1_i_s2_u_s1q, s1_i_s2_u_s2q)),
    ]
    .iter()
    .cloned()
    .u8_max()
}
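// Hypothetical usage, mirroring the doc comment above (TokenCmp is not included in this gist):
//   let score = token_set("fuzzy was a bear", "fuzzy fuzzy was a bear", &TokenCmp::similarity);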