Created
May 11, 2011 21:16
-
-
Save fruit/967374 to your computer and use it in GitHub Desktop.
String comparing algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * This class implements a string comparison algorithm
 * based on character pair similarity.
 *
 * Each string is reduced to the list of adjacent character pairs of its
 * space-separated words; the score is twice the number of shared pairs
 * divided by the total number of pairs in both strings.
 *
 * @link http://www.catalysoft.com/articles/StrikeAMatch.html
 * @author Ilya Sabelnikov <[email protected]>
 */
class SimilarityTool
{
    /**
     * Compares the two strings based on letter pair matches.
     *
     * The comparison is case-sensitive. Pairs are matched as a multiset:
     * a pair occurring twice in one string and once in the other counts
     * as a single match.
     *
     * @param string $str1
     * @param string $str2
     * @return float Similarity in the range 0.0 .. 1.0. 1.0 is returned when
     *               both strings are empty or byte-identical; 0.0 when the
     *               strings differ and neither contains a word of at least
     *               two characters.
     */
    public static function compareStrings ($str1, $str2)
    {
        if (0 === strlen($str1) + strlen($str2) || 0 === strcmp($str1, $str2))
        {
            return 1.0;
        }

        $pairs1 = self::wordLetterPairs($str1);
        $pairs2 = self::wordLetterPairs($str2);

        $union = count($pairs1) + count($pairs2);
        if (0 === $union)
        {
            return 0.0;
        }

        // Count how many times each pair is available in the second string.
        // Array keys are matched by exact identity, so numeric-looking pairs
        // such as "00" and "-0" are never confused with each other (a loose
        // `==` comparison would treat them as equal numbers).
        $available = array_count_values($pairs2);

        $intersection = 0;
        foreach ($pairs1 as $pair)
        {
            if (isset($available[$pair]) && $available[$pair] > 0)
            {
                $intersection ++;
                // Consume the matched pair so it cannot be matched twice.
                $available[$pair] --;
            }
        }

        // Float arithmetic keeps the return type a float even when the
        // division is exact (integer division would yield int 0 or int 1).
        return (2.0 * $intersection) / $union;
    }

    /**
     * Gets all letter pairs for each individual word in the string.
     *
     * Words are separated by single space characters only. Words shorter
     * than two characters contribute no pairs. Characters are counted as
     * UTF-8 code points, not bytes.
     *
     * @param string $str
     * @return array List of two-character strings, in order of appearance
     */
    private static function wordLetterPairs ($str)
    {
        $allPairs = array();

        // Tokenize the string and put the tokens/words into an array
        $words = explode(' ', $str);

        foreach ($words as $word)
        {
            // Consecutive spaces produce empty tokens; skip them.
            if ('' === $word)
            {
                continue;
            }

            // A word of N characters has N - 1 adjacent pairs.
            $numPairs = mb_strlen($word, 'UTF-8') - 1;
            for ($i = 0; $i < $numPairs; $i ++)
            {
                $allPairs[] = mb_substr($word, $i, 2, 'UTF-8');
            }
        }

        return $allPairs;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment