Skip to content

Instantly share code, notes, and snippets.

@esase
Last active May 29, 2016 08:54
Show Gist options
  • Save esase/6370f3f692986af49431 to your computer and use it in GitHub Desktop.
Save esase/6370f3f692986af49431 to your computer and use it in GitHub Desktop.
Suffix array [algorithm]
<?php
/**
* A data type that computes the suffix array of a string
*/
class SuffixArray
{
/**
* Text
* @var string
*/
protected $text;
/**
* Text length
* @var integer
*/
protected $textLength;
/**
* Suffix array positions
* @var array
*/
protected $suffixArrayPositions = array();
/**
* Class constructor
*
* @param string $text
* @param integer $maxSearchSize - limit for stored substrings
*/
public function __construct($text, $maxSearchSize = 8)
{
$this->text = $text;
$this->textLength = strlen($this->text);
// fill the suffix array
$index = 0;
$count = $this->textLength;
$suffixArray = array();
for ($i = 1; $i <= $this->textLength; $i++) {
$suffixArray[$i] = substr($this->text, $index, $maxSearchSize);
$index++;
$count--;
}
// sort the suffix array
asort($suffixArray, SORT_STRING);
$this->suffixArrayPositions = array_keys($suffixArray);
unset($suffixArray);
}
/**
* Binary search
*
* @param string $string
* @param integer $left
* @param integer $right
* @param array $matches
* @return void
*/
protected function binSearch($string, $left, $right, array &$matches)
{
if ($left > $right) {
return;
}
$middle = ceil(($left + $right) / 2);
// get the first term letter
$firstStringLetter = $string{0};
$stringLength = strlen($string);
// get the first suffix letter
$firstSuffixLetter = substr($this->text, $this->suffixArrayPositions[$middle] - 1, 1);
// compare the first term and suffix latters
if ($firstStringLetter == $firstSuffixLetter) {
// remember the found term's position
$offset = $this->suffixArrayPositions[$middle] - 1;
if ($string == substr($this->text, $offset, $stringLength)) {
$matches[] = array(
'start' => $offset,
'length' => $stringLength
);
}
// walking up or bottom within last borders
$this->binSearch($string, $left, $middle - 1, $matches); // walking up
$this->binSearch($string, $middle + 1, $right, $matches); // walking down
}
else {
$firstSuffixLetter < $firstStringLetter
? $this->binSearch($string, $middle + 1, $right, $matches) // walking down
: $this->binSearch($string , $left, $middle - 1, $matches); // walking up
}
}
/**
* Search string
*
* @param string $string
* @return array
*/
public function searchString($string)
{
$matches = array();
if (!$this->textLength) {
return $matches;
}
$this->binSearch($string, 0, $this->textLength - 1, $matches);
return $matches;
}
/**
* Get longest common prefix of 2 strings.
*
* @param string $string1
* @param string $string2
* @return integer
*/
public function getLCP($string1, $string2)
{
$n = min(strlen($string1), strlen($string2));
for ($i = 0; $i < $n; $i++) {
if ($string1[$i] != $string2[$i]) {
return $i;
}
}
return $n;
}
/**
* Get count of distinct substrings
*
* @return integer
*/
public function getCountDistinctSubstrings()
{
if (!$this->textLength) {
return 0;
}
// first suffix length
$count = strlen(substr($this->text, $this->suffixArrayPositions[0] - 1, $this->textLength));
foreach($this->suffixArrayPositions as $index => $position) {
if (!isset($this->suffixArrayPositions[$index + 1])) {
break;
}
$currentSuffix = substr($this->text,
$this->suffixArrayPositions[$index] - 1, $this->textLength);
$nextSuffix = substr($this->text,
$this->suffixArrayPositions[$index + 1] - 1, $this->textLength);
$count += strlen($nextSuffix) - $this->getLCP($currentSuffix, $nextSuffix);
}
return $count;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment