Created
May 11, 2023 15:37
-
-
Save rohjay/248ab08f219f390528805f4fa76899bc to your computer and use it in GitHub Desktop.
PHP function to compare strings using the Ratcliff/Obershelp pattern-matching algorithm. Great for checking sameness of words / phrases.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// I PHP-ified Ben's work from here: https://github.com/ben-yocum/gestalt-pattern-matcher (thanks for sharing Ben!) | |
// made a few modifications and added comments to better understand how this works. Enjoy =] | |
function compare_strings(string $input1, string $input2, bool $case_insensitive = true): float { | |
if ( $input1 == '' || $input2 == '' ) { | |
return 0.0; | |
} | |
// Case insensitive is the preferred, but you do you | |
if ( $case_insensitive ) { | |
$input1 = strtolower($input1); | |
$input2 = strtolower($input2); | |
} | |
// Build the foundation of the stack with our comparison strings... | |
$stack = [ | |
$input1, | |
$input2 | |
]; | |
// Set the "score" at zero. | |
$score = 0; | |
// I added this in here due to a lack of understanding how this worked. Keeping it 😜 | |
$prevention_count = 1000; | |
while( count($stack) > 0 && $prevention_count > 0 ) { | |
// Pop the comparison strings off the top... | |
$string2 = array_pop($stack); | |
$string1 = array_pop($stack); | |
// Use a split array of the strings so I don't have to substr with offsets all over the place. | |
$string1_split = str_split($string1); | |
$string2_split = str_split($string2); | |
// Grab the string length once | |
$str1_len = strlen($string1); | |
$str2_len = strlen($string2); | |
// Set up the indicators for sameness | |
$sequence_length = 0; | |
$sequence_index1 = -1; | |
$sequence_index2 = -1; | |
// Loop the length of string 1 | |
for($i = 0; $i < $str1_len; $i++) { | |
// Loop the length of string 2 | |
for($j = 0; $j < $str2_len; $j++) { | |
// Our wildcard to track from our current position out to wherever the strings stop matching | |
$k = 0; | |
while(isset($string1_split[$i+$k]) && isset($string2_split[$j+$k]) && $string1_split[$i+$k] === $string2_split[$j+$k]) { | |
++$k; | |
} | |
// Now that we've seen how far out $k gets us... Let's see if this is streak-worthy ;) | |
if($k > $sequence_length) { | |
$sequence_length = $k; // Our wildcard that tracks how far out a same-streak goes | |
$sequence_index1 = $i; // Where the streak starts in string 1 | |
$sequence_index2 = $j; // Where the streak starts in string 2 | |
} | |
} | |
} | |
// We've looped through both strings independently and found ONE similarity, let's tally the score and add more | |
// things to look at around where this was. | |
if($sequence_length !== 0) { | |
// Tally the score based off the sequence length (remember our wildcard $k from before? This is the biggest | |
// we saw of that) | |
$score += $sequence_length * 2; | |
// If our matching sequence didn't start at the beginning of either string, there could be matches here yet. | |
if( $sequence_index1 !== 0 && $sequence_index2 !== 0 ) { | |
$stack[] = substr($string1, 0, $sequence_index1); | |
$stack[] = substr($string2, 0, $sequence_index2); | |
} | |
// If our matching sequence didn't butt up to the end of either string, there are still potential matches. | |
if( | |
$sequence_index1 + $sequence_length !== $str1_len | |
&& $sequence_index2 + $sequence_length !== $str2_len | |
) { | |
$stack[] = substr($string1, $sequence_index1 + $sequence_length, $str1_len); | |
$stack[] = substr($string2, $sequence_index2 + $sequence_length, $str2_len); | |
} | |
} | |
// Because I never trust this will stop. Never. | |
--$prevention_count; | |
} | |
// This gives us a relative score normalized by the length of the two strings. | |
$strlengths = strlen($input1) + strlen($input2); | |
$rating = $score / ($strlengths != 0 ? $strlengths : 1); // Habit. Never divide a goose egg. | |
// I multiply by 100 here and round to the 5th decimal to give me a Percentage figure. For instance: | |
// 12.34567% match is not a great match. 98.76543% match is pretty good! An exact match is 100. | |
return round($rating * 100, 5); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment