Skip to content

Instantly share code, notes, and snippets.

@arnobaer
Last active August 16, 2018 16:11
Show Gist options
  • Save arnobaer/c8a77a6a3fb9893af263804eb5be86a5 to your computer and use it in GitHub Desktop.
Save arnobaer/c8a77a6a3fb9893af263804eb5be86a5 to your computer and use it in GitHub Desktop.
Simple PHP full text search engine
<?php
/* Command line demo for \TextSearch\Engine.
 *
 * Usage: php <this script> [needle]
 * Prints every match of the needle (default "öl") in the sample text below.
 */

// Load the library from this script's own directory and stop right away if it
// is missing, instead of continuing and failing later on an unknown class.
require_once __DIR__ . '/TextSearch.php';

// Get search needle: first command line argument, or the demo default.
$needle = ($argc > 1) ? $argv[1] : "öl";

// Text to search
$text = "Pöllölinnut (Strigiformes) on lintujen lahko, johon kuuluu yli 200 lintulajia noin 27 suvussa. Pöllölintuja tavataan kaikkialla maailmassa Antarktista lukuun ottamatta. Ne metsästävät pääasiassa pieniä nisäkkäitä, hyönteisiä ja muita lintuja, mutta muutamat lajit ovat erikoistuneet kalojen metsästämiseen. Suurin lahkon heimoista on pöllöt. -- https://fi.wikipedia.org/wiki/Pöllölinnut";

// Create search engine
$engine = new \TextSearch\Engine();

// Iterate over results and dump each one (left context, match, right context, position).
foreach ($engine->find($needle, $text) as $result) {
    print_r($result);
}
<?php namespace TextSearch;
/* Fallback if multibyte string plugin is not available.
 *
 * Each stand-in mirrors the parameter list of the mbstring function it
 * replaces and delegates to the byte-oriented counterpart. The optional
 * $encoding argument is accepted for call compatibility but ignored, because
 * the byte functions have no such parameter and reject surplus arguments.
 * Lengths and offsets are therefore counted in bytes, not characters.
 */
if (!extension_loaded('mbstring')) {
    function mb_strlen($string, $encoding = null) {
        return strlen($string);
    }
    function mb_strpos($haystack, $needle, $offset = 0, $encoding = null) {
        return strpos($haystack, $needle, $offset);
    }
    function mb_stripos($haystack, $needle, $offset = 0, $encoding = null) {
        return stripos($haystack, $needle, $offset);
    }
    function mb_substr($string, $start, $length = null, $encoding = null) {
        // A null length means "up to the end of the string"; older PHP versions
        // would treat an explicit null passed to substr() as a length of 0.
        if (null === $length)
            return substr($string, $start);
        return substr($string, $start, $length);
    }
}
/* Result container for \TextSearch\Engine::find()
 *
 * Holds a single match together with the text surrounding it. Printing the
 * object (string cast) yields left context, match and right context joined.
 */
class Result {
    /* Text preceding the match. */
    public $left;
    /* The matched text. */
    public $match;
    /* Text following the match. */
    public $right;
    /* Position of the match as supplied by the creator of the result. */
    public $pos;

    public function __construct($left, $match, $right, $pos) {
        $this->left = $left;
        $this->match = $match;
        $this->right = $right;
        $this->pos = $pos;
    }

    /* Wrap $content in a <span> carrying the given class attribute.
     * Neither argument is escaped here; pass HTML-safe values only.
     */
    protected function span($class, $content) {
        return "<span class=\"{$class}\">{$content}</span>";
    }

    /* Render the result as three adjacent <span> elements.
     *
     * Each span gets the classes "$class" and "$class-left", "$class-match"
     * or "$class-right". Context, match and class name are HTML-escaped, so
     * text containing <, >, & or quotes cannot break or inject markup.
     *
     * $result->toHtml();
     * // returns '<span class="search-result search-result-left">...</span>' .
     * //         '<span class="search-result search-result-match">...</span>' .
     * //         '<span class="search-result search-result-right">...</span>'
     */
    public function toHtml($class="search-result")
    {
        $flags = ENT_QUOTES | ENT_SUBSTITUTE;
        $class = htmlspecialchars($class, $flags, 'UTF-8');
        $left = htmlspecialchars($this->left, $flags, 'UTF-8');
        $match = htmlspecialchars($this->match, $flags, 'UTF-8');
        $right = htmlspecialchars($this->right, $flags, 'UTF-8');
        return $this->span("{$class} {$class}-left", $left).
            $this->span("{$class} {$class}-match", $match).
            $this->span("{$class} {$class}-right", $right);
    }

    /* Plain text form: left context, match and right context concatenated. */
    public function __toString()
    {
        return "{$this->left}{$this->match}{$this->right}";
    }
}
/* Simple full text search engine returning results embedded in surrounding text.
 *
 * $engine = new \TextSearch\Engine();
 * $results = $engine->find("span", "NO-body expects the Spanish Inquisition!");
 *
 * $results[0];
 * // [left] => 'NO-body expects the '
 * // [match] => 'Span'
 * // [right] => 'ish Inquisition!'
 * // [pos] => 20
 *
 */
class Engine {
    /* Number of context characters targeted on each side of a match. */
    public $width;

    public function __construct($width=32) {
        $this->width = $width;
    }

    /* Returns all results for $needle in $haystack.
     *
     * The haystack is whitespace-normalized first (see strip()), the search is
     * case-insensitive and matches do not overlap. $start and the reported
     * positions are offsets into the normalized text.
     * Returns an array of \TextSearch\Result; it is empty when nothing matches,
     * when $needle is empty or when $start lies beyond the normalized text.
     */
    public function find($needle, $haystack, $start=0) {
        $results = array();
        $needle_len = mb_strlen($needle);
        // An empty needle matches at every offset without ever advancing
        // $start, which would make the loop below run forever.
        if (0 === $needle_len)
            return $results;
        $haystack = $this->strip($haystack);
        // Searching from behind the end of the text cannot match anything and
        // is rejected by mb_stripos(), so answer with "no results" directly.
        if ($start > mb_strlen($haystack))
            return $results;
        while (FALSE !== ($pos = mb_stripos($haystack, $needle, $start))) {
            $results[] = $this->create_result($haystack, $needle, $pos);
            $start = $pos + $needle_len; // continue right behind this match
        }
        return $results;
    }

    /* Clean text, remove surrounding and multiple whitespaces.
     * $this->strip(" foo bar baz ");
     * // returns 'foo bar baz'
     */
    protected function strip($text) {
        return trim(preg_replace('/\s+/', ' ', $text));
    }

    /* Create a new search result at position $pos. */
    protected function create_result($haystack, $needle, $pos) {
        $needle_len = mb_strlen($needle);
        $left = $this->left_chunk($haystack, $pos);
        $right = $this->right_chunk($haystack, $pos + $needle_len);
        // Take the match from the haystack so it keeps the original casing.
        $match = mb_substr($haystack, $pos, $needle_len);
        return new \TextSearch\Result($left, $match, $right, $pos);
    }

    /* Extract text chunk to the left.
     * $this->width = 16;
     * $this->left_chunk("Et qui eligendi provident dolore labore quia.", 26);
     * //                                         ^
     * // returns 'provident '
     *
     * The chunk ends at $pos and spans at most $this->width characters. If the
     * window does not start at the beginning of the text, everything up to and
     * including the first space inside the window is dropped so the chunk does
     * not begin with a cut-off word. If there is no space between the window
     * start and $pos, the window is returned as it is.
     */
    protected function left_chunk($haystack, $pos) {
        $begin = max($pos - $this->width, 0);
        if ($begin > 0) {
            $space = mb_strpos($haystack, ' ', $begin);
            // Only move the start when the space lies before the match;
            // otherwise the chunk length would become negative.
            if (FALSE !== $space && $space < $pos)
                $begin = $space + 1; // skip leading whitespace
        }
        return mb_substr($haystack, $begin, $pos - $begin);
    }

    /* Extract text chunk to the right.
     * $this->width = 16;
     * $this->right_chunk("Et qui eligendi provident dolore labore quia.", 32);
     * //                                                 ^
     * // returns ' labore quia.'
     *
     * The chunk starts at $pos. If the window of $this->width characters ends
     * before the end of the text, the chunk is extended up to the next space
     * so the last word is kept whole; without a further space it runs to the
     * end of the text.
     */
    protected function right_chunk($haystack, $pos) {
        $max = mb_strlen($haystack);
        $end = min($pos + $this->width, $max);
        if ($end < $max) {
            $space = mb_strpos($haystack, ' ', $end);
            $end = (FALSE === $space) ? $max : $space;
        }
        return mb_substr($haystack, $pos, $end - $pos);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment