Last active
May 29, 2016 08:54
-
-
Save esase/6370f3f692986af49431 to your computer and use it in GitHub Desktop.
Suffix array [algorithm]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* A data type that computes the suffix array of a string | |
*/ | |
class SuffixArray | |
{ | |
/** | |
* Text | |
* @var string | |
*/ | |
protected $text; | |
/** | |
* Text length | |
* @var integer | |
*/ | |
protected $textLength; | |
/** | |
* Suffix array positions | |
* @var array | |
*/ | |
protected $suffixArrayPositions = array(); | |
/** | |
* Class constructor | |
* | |
* @param string $text | |
* @param integer $maxSearchSize - limit for stored substrings | |
*/ | |
public function __construct($text, $maxSearchSize = 8) | |
{ | |
$this->text = $text; | |
$this->textLength = strlen($this->text); | |
// fill the suffix array | |
$index = 0; | |
$count = $this->textLength; | |
$suffixArray = array(); | |
for ($i = 1; $i <= $this->textLength; $i++) { | |
$suffixArray[$i] = substr($this->text, $index, $maxSearchSize); | |
$index++; | |
$count--; | |
} | |
// sort the suffix array | |
asort($suffixArray, SORT_STRING); | |
$this->suffixArrayPositions = array_keys($suffixArray); | |
unset($suffixArray); | |
} | |
/** | |
* Binary search | |
* | |
* @param string $string | |
* @param integer $left | |
* @param integer $right | |
* @param array $matches | |
* @return void | |
*/ | |
protected function binSearch($string, $left, $right, array &$matches) | |
{ | |
if ($left > $right) { | |
return; | |
} | |
$middle = ceil(($left + $right) / 2); | |
// get the first term letter | |
$firstStringLetter = $string{0}; | |
$stringLength = strlen($string); | |
// get the first suffix letter | |
$firstSuffixLetter = substr($this->text, $this->suffixArrayPositions[$middle] - 1, 1); | |
// compare the first term and suffix latters | |
if ($firstStringLetter == $firstSuffixLetter) { | |
// remember the found term's position | |
$offset = $this->suffixArrayPositions[$middle] - 1; | |
if ($string == substr($this->text, $offset, $stringLength)) { | |
$matches[] = array( | |
'start' => $offset, | |
'length' => $stringLength | |
); | |
} | |
// walking up or bottom within last borders | |
$this->binSearch($string, $left, $middle - 1, $matches); // walking up | |
$this->binSearch($string, $middle + 1, $right, $matches); // walking down | |
} | |
else { | |
$firstSuffixLetter < $firstStringLetter | |
? $this->binSearch($string, $middle + 1, $right, $matches) // walking down | |
: $this->binSearch($string , $left, $middle - 1, $matches); // walking up | |
} | |
} | |
/** | |
* Search string | |
* | |
* @param string $string | |
* @return array | |
*/ | |
public function searchString($string) | |
{ | |
$matches = array(); | |
if (!$this->textLength) { | |
return $matches; | |
} | |
$this->binSearch($string, 0, $this->textLength - 1, $matches); | |
return $matches; | |
} | |
/** | |
* Get longest common prefix of 2 strings. | |
* | |
* @param string $string1 | |
* @param string $string2 | |
* @return integer | |
*/ | |
public function getLCP($string1, $string2) | |
{ | |
$n = min(strlen($string1), strlen($string2)); | |
for ($i = 0; $i < $n; $i++) { | |
if ($string1[$i] != $string2[$i]) { | |
return $i; | |
} | |
} | |
return $n; | |
} | |
/** | |
* Get count of distinct substrings | |
* | |
* @return integer | |
*/ | |
public function getCountDistinctSubstrings() | |
{ | |
if (!$this->textLength) { | |
return 0; | |
} | |
// first suffix length | |
$count = strlen(substr($this->text, $this->suffixArrayPositions[0] - 1, $this->textLength)); | |
foreach($this->suffixArrayPositions as $index => $position) { | |
if (!isset($this->suffixArrayPositions[$index + 1])) { | |
break; | |
} | |
$currentSuffix = substr($this->text, | |
$this->suffixArrayPositions[$index] - 1, $this->textLength); | |
$nextSuffix = substr($this->text, | |
$this->suffixArrayPositions[$index + 1] - 1, $this->textLength); | |
$count += strlen($nextSuffix) - $this->getLCP($currentSuffix, $nextSuffix); | |
} | |
return $count; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment