Skip to content

Instantly share code, notes, and snippets.

@sotarok
Created September 24, 2009 17:14
Show Gist options
  • Save sotarok/192879 to your computer and use it in GitHub Desktop.
Save sotarok/192879 to your computer and use it in GitHub Desktop.
rapid and stupid search engine script for PHP source code.
<?php
/**
* Hyper Pudding
* - rapid and stupid search engine script for PHP source code. -
*
* Command-line script. It prints a banner, then (further down the file)
* defines the index_builder class and runs an interactive keyword search
* over the PHP sources found under a base directory.
*
* @author sotarok
* @version 0.1.2
* @license The MIT License
* @requires PHP 5.3 or later (uses namespaces and closures)
*/
namespace pudding\search;
// A memory_limit of -1 removes PHP's memory cap for this process.
ini_set("memory_limit", -1);
// Script version; namespaced constant (pudding\search\VERSION).
const VERSION = "0.1.2";
// Heredoc interpolation can expand variables but not constants, so the
// version is copied into a variable before being embedded in the banner.
$version = VERSION;
// Start-up banner. The heredoc body is emitted verbatim and does not end
// with a newline, so the next output continues on the banner's last line.
echo <<<EEE
** --------------------------------------------------------------- **
Hyper Pudding
- rapid and stupid search engine script for PHP source code. -
pudding\index_builder
@license The MIT License
@author sotarok <sotaro.k /at/ gmail.com>
@version {$version}
** --------------------------------------------------------------- **
EEE;
/**
 * Builds, persists and queries an inverted index of PHP source tokens.
 *
 * The index maps a token's text (identifier, variable name, ...) to a list of
 * array(source file, line number) pairs. It can be built by crawling a
 * directory tree, serialized to a file, and loaded back later.
 *
 * Recognised options (constructor / set_options):
 *  - "tokenize_tokens": list of T_* token ids to index (read by the constructor)
 *  - "is_debug":        bool, enable progress output on STDERR (read by the constructor)
 *  - "ext":             list of file extensions to crawl (default: array("php"))
 */
class index_builder
{
    /** @var bool When true, info() writes its messages to STDERR. */
    public $is_debug;

    /** @var string Directory whose sources are indexed. */
    protected $_basedir = "";

    /** @var string Path of the serialized index file. */
    protected $_index_filename = "";

    /** @var \ArrayObject Option bag; read through get_options(). */
    protected $_options;

    /** @var array List of source file paths found by crawl_recursive(). */
    protected $_source_list = array();

    protected $_token_index = 1;

    /** @var array Every indexed token as array(source, text, line). */
    protected $_token_list = array();

    /** @var array Token ids (T_* constants) that are indexed. */
    protected $_tokenize_tokens = array(
        T_STRING_CAST,
        T_STRING_VARNAME,
        T_STRING,
        T_VARIABLE,
    );

    /** @var array Token text => list of array(source, line). */
    protected $_inverted_index = array();

    /**
     * @param string $basedir Directory to index.
     * @param array  $options Option map, see the class description.
     */
    public function __construct($basedir, array $options = array())
    {
        $this->_basedir = $basedir;
        // The index file name is derived from the base directory, so each
        // base directory gets its own index file in the working directory.
        $this->_index_filename = "._index_" . md5($this->_basedir);
        $this->_options = new \ArrayObject($options);
        $this->_tokenize_tokens = $this->get_options("tokenize_tokens", $this->_tokenize_tokens);
        $this->is_debug = $this->get_options("is_debug", false);
    }

    /**
     * Replaces the whole option bag.
     *
     * Only options read lazily through get_options() (e.g. "ext") are
     * affected; "tokenize_tokens" and "is_debug" were already copied into
     * properties by the constructor.
     *
     * @param \ArrayObject $option New option bag.
     */
    public function set_options(\ArrayObject $option)
    {
        // The hint must be fully qualified: inside this namespace a bare
        // "ArrayObject" would name a non-existent namespaced class.
        $this->_options = $option;
    }

    /**
     * Returns one option value.
     *
     * @param string $key     Option name.
     * @param mixed  $default Returned when the option is missing or null.
     * @return mixed
     */
    public function get_options($key, $default = "")
    {
        // Array access, not property access: an ArrayObject created without
        // ARRAY_AS_PROPS does not expose its entries as properties.
        return isset($this->_options[$key]) ? $this->_options[$key] : $default;
    }

    /**
     * Crawls the base directory and (re)builds the in-memory index,
     * printing the elapsed tokenizing time to standard output.
     */
    public function build_index()
    {
        // Start from a clean slate so a second call does not duplicate hits.
        $this->_token_list = array();
        $this->_inverted_index = array();

        $this->_source_list = $this->crawl_recursive($this->_basedir);
        $sec = microtime(true);
        $this->tokenizer();
        $esec = microtime(true);
        echo "Index Built: ", $esec - $sec, PHP_EOL;
    }

    /**
     * Looks up an exact token text in the index.
     *
     * @param string $keyword Token text, e.g. "foo" or "$foo".
     * @return array|false Result of scoring(), or false when the keyword is unknown.
     */
    public function search($keyword)
    {
        if (isset($this->_inverted_index[$keyword])) {
            return $this->scoring($this->_inverted_index[$keyword]);
        }
        return false;
    }

    /**
     * Groups hits by source file and orders files by hit count, highest first.
     *
     * @param array $ii List of array(source, line) pairs (not modified).
     * @return array source => array('count' => int, 'pos' => list of lines)
     */
    public function scoring(&$ii)
    {
        $scored = array();
        foreach ($ii as $hit) {
            list($file, $line) = $hit;
            if (!isset($scored[$file])) {
                $scored[$file] = array('count' => 0, 'pos' => array());
            }
            // Every hit counts, including the first one for a file.
            $scored[$file]['count']++;
            $scored[$file]['pos'][] = $line;
        }

        // The comparator must return an integer (<0, 0, >0), not a boolean.
        uasort($scored, function ($a, $b) {
            if ($a['count'] === $b['count']) {
                return 0;
            }
            return ($a['count'] < $b['count']) ? 1 : -1;
        });

        return $scored;
    }

    /**
     * Tokenizes every file in the source list and records the tokens whose
     * id is listed in $_tokenize_tokens.
     */
    public function tokenizer()
    {
        foreach ($this->_source_list as $source) {
            $this->info($source, PHP_EOL);

            $code = file_get_contents($source);
            if ($code === false) {
                $this->info("\t", "unreadable, skipped", PHP_EOL);
                continue;
            }

            foreach (token_get_all($code) as $token) {
                // Single-character tokens come back as plain strings and
                // carry neither an id nor a line number.
                if (!is_array($token)) {
                    continue;
                }
                list($id, $text, $line) = $token;
                if (!in_array($id, $this->_tokenize_tokens, true)) {
                    continue;
                }
                $this->_token_list[] = array($source, $text, $line);
                $this->_inverted_index[$text][] = array($source, $line);
            }

            $this->info("\t", memory_get_usage()/1024/1024, " MB ", PHP_EOL);
        }
    }

    /**
     * Serializes the index to the index file.
     *
     * @return $this
     * @throws \Exception When the file cannot be written.
     */
    public function save()
    {
        if (file_put_contents($this->filename(), serialize($this->_inverted_index)) === false) {
            throw new \Exception(sprintf("Cannot write index file (%s).", $this->filename()));
        }
        return $this;
    }

    /**
     * Replaces the in-memory index with the content of the index file.
     *
     * @return $this
     * @throws \Exception When the file cannot be read or does not hold an index.
     */
    public function load()
    {
        $raw = file_get_contents($this->filename());
        if ($raw === false) {
            throw new \Exception(sprintf("Cannot read index file (%s).", $this->filename()));
        }

        // NOTE(review): unserialize() must only be fed trusted data; the
        // index file is expected to be one written by save().
        $index = unserialize($raw);
        if (!is_array($index)) {
            throw new \Exception(sprintf("Invalid index file (%s).", $this->filename()));
        }

        $this->_inverted_index = $index;
        return $this;
    }

    /**
     * Gets or sets the index file path.
     *
     * @param string|null $filename New path, or null to read the current one.
     * @return string|$this Current path when called without argument, $this otherwise.
     */
    public function filename($filename = null)
    {
        if ($filename === null) {
            return $this->_index_filename;
        }
        $this->_index_filename = $filename;
        return $this;
    }

    /**
     * @return bool Whether the index file exists.
     */
    public function index_file_exists()
    {
        return file_exists($this->filename());
    }

    /**
     * Removes the index file.
     *
     * @return $this
     * @throws \Exception When the file cannot be deleted.
     */
    public function delete_index_file()
    {
        if (!unlink($this->filename())) {
            throw new \Exception(sprintf("Cannot delete index file (%s).", $this->filename()));
        }
        return $this;
    }

    /**
     * Writes its arguments, joined by a space, to STDERR when debugging is on.
     */
    public function info()
    {
        if ($this->is_debug) {
            // fwrite, not fprintf: the message may contain "%" (e.g. in a
            // file path) and must not be interpreted as a format string.
            fwrite(STDERR, join(" ", func_get_args()));
        }
    }

    /**
     * Recursively collects the files below a directory whose extension is
     * one of the "ext" option values.
     *
     * @param string $dirname Directory to scan.
     * @return array List of matching file paths.
     */
    public function crawl_recursive ($dirname)
    {
        // Build an alternation such as "php|inc"; each extension is quoted
        // so that regex metacharacters in it are matched literally.
        $extensions = array_map(
            function ($ext) {
                return preg_quote($ext, '/');
            },
            (array) $this->get_options("ext", array("php",))
        );
        $pattern = '/.+\.(' . join('|', $extensions) . ')$/';

        $entries = glob($dirname . "/*");
        if ($entries === false) {
            // glob() reports errors by returning false.
            return array();
        }

        $files = array();
        foreach ($entries as $file) {
            if (is_dir($file)) {
                $files = array_merge($files, $this->crawl_recursive($file));
            }
            elseif (preg_match($pattern, $file)) {
                $files[] = $file;
            }
        }
        return $files;
    }
}
// ---------------------------------------------------------------------
// Command-line driver: php <script> base_dir
//   1. load an existing index for base_dir or build a fresh one,
//   2. answer keyword lookups read from STDIN until an empty line,
//   3. optionally save the index for the next run.
// ---------------------------------------------------------------------
if ($argc !== 2) {
    fprintf(STDERR, "
Invalid arguments:
usege: php %s base_dir
base_dir - searching source file basedir. script searcing under this directory.
", $argv[0]);
    exit(1);
}

// A trailing slash is dropped so the same directory always maps to the
// same index file name.
$builder = new index_builder(rtrim($argv[1], "/"));

if ($builder->index_file_exists()) {
    echo "Indexed file found (created at ", date("Y-m-d H:i:s", filemtime($builder->filename())), ") / load ? [Y/n]:";
    // fgets() returns false at end of input; treat that like an empty answer.
    $ans = trim((string) fgets(STDIN));
    // Empty answer means the default ("Y"). Compared explicitly because
    // empty() would also accept the answer "0".
    if ($ans === '' || strtolower($ans) === 'y') {
        $builder->load();
    }
    else {
        $builder->delete_index_file();
        $builder->build_index();
    }
}
else {
    $builder->build_index();
}

// sprintf, not printf: printf would print the number itself and hand its
// byte count to echo, which then shows up as a stray digit in the output.
echo " mem: ", sprintf("%.5f", memory_get_usage()/1024/1024), " MB used.", PHP_EOL;
echo <<<EEE
Input Search Keyword:
(if you want to end this script, input empty string)
EEE;
echo "> ";
$key = trim((string) fgets(STDIN));
// Only a really empty line ends the loop; empty() would also stop on "0".
while ($key !== '') {
    echo " searching $key", PHP_EOL;
    $sec = microtime(true);
    $res = $builder->search($key);
    $esec = microtime(true);
    if ($res) {
        foreach ($res as $k => $r) {
            // Show paths relative to the base directory given on the command line.
            echo " ", sprintf("%-50s", str_replace($argv[1], "", $k)), " on line ", join(", ", $r['pos']), PHP_EOL;
        }
    }
    else {
        echo "Not Found.", PHP_EOL;
    }
    echo " sec: ", sprintf("%.5f", $esec - $sec), PHP_EOL;
    echo "> ";
    $key = trim((string) fgets(STDIN));
}

echo "Do you want to save? [y/N]: ";
$ans = trim((string) fgets(STDIN));
if (strtolower($ans) === 'y') {
    $builder->save();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment