Created
September 24, 2009 17:14
-
-
Save sotarok/192879 to your computer and use it in GitHub Desktop.
rapid and stupid search engine script for PHP source code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
._index_* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Hyper Pudding | |
* - rapid and stupid search engine script for PHP source code. - | |
* | |
* @author sotarok | |
* @version 0.1.2 | |
* @license The MIT License | |
* @requires PHP >= 5.3 | |
*/ | |
// Everything declared in this file (the VERSION constant and the
// index_builder class) lives in the pudding\search namespace.
namespace pudding\search; | |
// Remove PHP's memory limit (-1 means "no limit") for this run.
ini_set("memory_limit", -1); | |
// Script version, printed in the banner below.
const VERSION = "0.1.2"; | |
// Heredoc strings cannot interpolate constants, so copy VERSION into a
// variable that the banner can reference as {$version}.
$version = VERSION; | |
// Print the start-up banner to standard output.
echo <<<EEE | |
** --------------------------------------------------------------- ** | |
Hyper Pudding | |
- rapid and stupid search engine script for PHP source code. - | |
pudding\index_builder | |
@license The MIT License | |
@author sotarok <sotaro.k /at/ gmail.com> | |
@version {$version} | |
** --------------------------------------------------------------- ** | |
EEE; | |
/**
 * Builds, persists and queries an inverted index of PHP source tokens.
 *
 * The index maps a token's text (e.g. "$foo" or "my_function") to a list of
 * array(source file, line number) postings. It is filled by tokenizer(),
 * queried by search(), and can be written to / read from disk with
 * save() / load().
 *
 * Recognised options (constructor or set_options()):
 *  - "ext":             list of file extensions to index (default: array("php"))
 *  - "tokenize_tokens": list of T_* token ids to index
 *  - "is_debug":        when true, info() writes progress to STDERR
 */
class index_builder
{
    /** @var bool When true, info() writes its arguments to STDERR. */
    public $is_debug;

    /** @var string Directory whose files are crawled by build_index(). */
    protected $_basedir = "";

    /** @var string Path of the serialized index file used by save()/load(). */
    protected $_index_filename = "";

    /** @var array List of source file paths found by crawl_recursive(). */
    protected $_source_list = array();

    /** @var int Not referenced by any method of this class. */
    protected $_token_index = 1;

    /** @var array Flat list of array(source, text, line); appended to by tokenizer(). */
    protected $_token_list = array();

    /** @var array Token ids (T_* constants) that are added to the index. */
    protected $_tokenize_tokens = array(
        T_STRING_CAST,
        T_STRING_VARNAME,
        T_STRING,
        T_VARIABLE,
    );

    /** @var array Token text => list of array(source, line) postings. */
    protected $_inverted_index = array();

    /** @var \ArrayObject Option storage read by get_options(). */
    protected $_options;

    /**
     * @param string $basedir Directory to index; also seeds the default index file name.
     * @param array  $options Option map, see the class description.
     */
    public function __construct($basedir, array $options = array())
    {
        $this->_basedir = $basedir;
        $this->_index_filename = "._index_" . md5($this->_basedir);
        $this->_options = new \ArrayObject($options);
        $this->_tokenize_tokens = $this->get_options("tokenize_tokens", $this->_tokenize_tokens);
        $this->is_debug = $this->get_options("is_debug", false);
    }

    /**
     * Replaces the whole option set.
     *
     * The type hint is fully qualified: inside a namespace a bare
     * "ArrayObject" would resolve to a class of this namespace.
     *
     * @param \ArrayObject $option New option storage.
     */
    public function set_options(\ArrayObject $option)
    {
        $this->_options = $option;
    }

    /**
     * Returns one option value, or $default when it is not set (or is null).
     *
     * Entries are read with array access: an ArrayObject created with default
     * flags does not expose its entries as properties.
     *
     * @param string $key     Option name.
     * @param mixed  $default Value returned when the option is missing.
     * @return mixed
     */
    public function get_options($key, $default = "")
    {
        if (isset($this->_options[$key])) {
            return $this->_options[$key];
        }
        return $default;
    }

    /**
     * Crawls the base directory, tokenizes every matching file and prints the
     * time spent tokenizing to standard output.
     */
    public function build_index()
    {
        $this->_source_list = $this->crawl_recursive($this->_basedir);
        $sec = microtime(true);
        $this->tokenizer();
        $esec = microtime(true);
        echo "Index Built: ", $esec - $sec, PHP_EOL;
    }

    /**
     * Looks up an exact token text in the index.
     *
     * @param string $keyword Token text, e.g. "$foo" or "strlen".
     * @return array|false Result of scoring() for the keyword, or false when
     *                     the keyword is not in the index.
     */
    public function search($keyword)
    {
        if (isset($this->_inverted_index[$keyword])) {
            return $this->scoring($this->_inverted_index[$keyword]);
        }
        return false;
    }

    /**
     * Groups postings by source file and orders files by number of hits.
     *
     * @param array $ii List of array(source, line) postings (taken by
     *                  reference, not modified).
     * @return array Source => array('count' => hits, 'pos' => list of lines),
     *               sorted by 'count' in descending order.
     */
    public function scoring(&$ii)
    {
        $scored = array();
        foreach ($ii as $posting) {
            $source = $posting[0];
            $line = $posting[1];
            if (!isset($scored[$source])) {
                $scored[$source] = array('count' => 0, 'pos' => array());
            }
            // Every posting, including the first one for a file, is one hit.
            $scored[$source]['count']++;
            $scored[$source]['pos'][] = $line;
        }
        // The comparator must return an integer (<0, 0, >0), not a boolean.
        uasort($scored, function ($v1, $v2) {
            return $v2['count'] - $v1['count'];
        });
        return $scored;
    }

    /**
     * Tokenizes every file in the source list and adds the tokens whose id is
     * in $_tokenize_tokens to the token list and the inverted index.
     */
    public function tokenizer()
    {
        foreach ($this->_source_list as $source) {
            $this->info($source, PHP_EOL);
            $code = file_get_contents($source);
            if ($code === false) {
                // Unreadable file: nothing to index, carry on with the next one.
                continue;
            }
            foreach (token_get_all($code) as $token) {
                // Single-character tokens come back as plain strings and carry
                // neither an id nor a line number.
                if (!is_array($token)) {
                    continue;
                }
                if (!in_array($token[0], $this->_tokenize_tokens)) {
                    continue;
                }
                $text = $token[1];
                $line = $token[2];
                $this->_token_list[] = array($source, $text, $line);
                if (!isset($this->_inverted_index[$text])) {
                    $this->_inverted_index[$text] = array();
                }
                $this->_inverted_index[$text][] = array($source, $line);
            }
            $this->info("\t", memory_get_usage()/1024/1024, " MB ", PHP_EOL);
        }
    }

    /**
     * Writes the serialized inverted index to the index file.
     *
     * @return index_builder $this
     * @throws \Exception When the file cannot be written.
     */
    public function save()
    {
        if (file_put_contents($this->filename(), serialize($this->_inverted_index)) === false) {
            throw new \Exception(sprintf("Cannot write index file (%s).", $this->filename()));
        }
        return $this;
    }

    /**
     * Replaces the inverted index with the content of the index file.
     *
     * @return index_builder $this
     * @throws \Exception When the file cannot be read or does not hold an array.
     */
    public function load()
    {
        $raw = file_get_contents($this->filename());
        if ($raw === false) {
            throw new \Exception(sprintf("Cannot read index file (%s).", $this->filename()));
        }
        // NOTE(review): unserialize() must only be fed trusted data; the index
        // file is expected to be one written by save(). Consider a safer format
        // if the file can come from elsewhere.
        $index = unserialize($raw);
        if (!is_array($index)) {
            throw new \Exception(sprintf("Index file (%s) is corrupted.", $this->filename()));
        }
        $this->_inverted_index = $index;
        return $this;
    }

    /**
     * Getter/setter for the index file path.
     *
     * @param string|null $filename New path, or null to read the current one.
     * @return string|index_builder Current path when called without argument,
     *                              $this when a new path was set.
     */
    public function filename($filename = null)
    {
        if ($filename === null) {
            return $this->_index_filename;
        }
        $this->_index_filename = $filename;
        return $this;
    }

    /**
     * @return bool Whether the index file exists.
     */
    public function index_file_exists()
    {
        return file_exists($this->filename());
    }

    /**
     * Deletes the index file.
     *
     * @return index_builder $this
     * @throws \Exception When the file cannot be deleted.
     */
    public function delete_index_file()
    {
        if (!unlink($this->filename())) {
            throw new \Exception(sprintf("Cannot delete index file (%s).", $this->filename()));
        }
        return $this;
    }

    /**
     * Writes all arguments, joined by a single space, to STDERR when
     * $is_debug is true.
     */
    public function info()
    {
        if ($this->is_debug) {
            // fwrite() rather than fprintf(): the message is data, not a format
            // string, and may legitimately contain "%" (e.g. in a file path).
            fwrite(STDERR, join(" ", func_get_args()));
        }
    }

    /**
     * Collects, recursively, the files under $dirname whose extension is one
     * of the "ext" option values (default: "php").
     *
     * @param string $dirname Directory to scan, without trailing slash.
     * @return array List of file paths.
     */
    public function crawl_recursive($dirname)
    {
        // Build the extension pattern once per call; extensions are
        // alternatives, so they are joined with "|" and quoted literally.
        $extensions = array();
        foreach ((array) $this->get_options("ext", array("php")) as $ext) {
            $extensions[] = preg_quote($ext, '/');
        }
        $pattern = '/.+\.(' . join('|', $extensions) . ')$/';

        $entries = glob($dirname . "/*");
        if ($entries === false) {
            // glob() reports errors with false instead of an empty array.
            return array();
        }

        $files = array();
        foreach ($entries as $file) {
            if (is_dir($file)) {
                $files = array_merge($files, $this->crawl_recursive($file));
            } elseif (preg_match($pattern, $file)) {
                $files[] = $file;
            }
        }
        return $files;
    }
}
// ---------------------------------------------------------------------------
// Command-line driver: build (or load) the index for the directory given as
// the only argument, then answer keyword searches read from STDIN until an
// empty line is entered, and finally offer to save the index.
// ---------------------------------------------------------------------------
if ($argc != 2) {
    fprintf(STDERR, "
Invalid arguments:
usege: php %s base_dir
base_dir - searching source file basedir. script searcing under this directory.
", $argv[0]);
    exit(1);
}

$builder = new index_builder(rtrim($argv[1], "/"));
if ($builder->index_file_exists()) {
    echo "Indexed file found (created at ", date("Y-m-d H:i:s", filemtime($builder->filename())), ") / load ? [Y/n]:";
    $ans = trim((string) fgets(STDIN));
    // An empty answer means the default (yes). Compare against '' rather than
    // using empty(), which would also treat the answer "0" as empty.
    if ($ans === '' || strtolower($ans) == 'y') {
        $builder->load();
    }
    else {
        $builder->delete_index_file();
        $builder->build_index();
    }
}
else {
    $builder->build_index();
}

// sprintf(), not printf(): printf() prints by itself and returns the number
// of characters written, which echo would then print as an extra number.
echo " mem: ", sprintf("%.5f", memory_get_usage()/1024/1024), " MB used.", PHP_EOL;
echo <<<EEE
Input Search Keyword:
(if you want to end this script, input empty string)
EEE;

// Only a leading occurrence of the base dir is removed from displayed paths;
// removing every occurrence would mangle paths for a base dir such as ".".
$prefix = $argv[1];
$prefix_len = strlen($prefix);

echo "> ";
$key = trim((string) fgets(STDIN));
// Stop on an empty line (or end of input); "0" is a valid keyword.
while ($key !== '') {
    echo " searching $key", PHP_EOL;
    $sec = microtime(true);
    $res = $builder->search($key);
    $esec = microtime(true);
    if ($res) {
        foreach ($res as $k => $r) {
            $shown = $k;
            if ($prefix_len > 0 && strncmp($k, $prefix, $prefix_len) === 0) {
                $shown = substr($k, $prefix_len);
            }
            echo " ", sprintf("%-50s", $shown), " on line ", join(", ", $r['pos']), PHP_EOL;
        }
    }
    else {
        echo "Not Found.", PHP_EOL;
    }
    echo " sec: ", sprintf("%.5f", $esec - $sec), PHP_EOL;
    echo "> ";
    $key = trim((string) fgets(STDIN));
}

echo "Do you want to save? [y/N]: ";
$ans = trim((string) fgets(STDIN));
if (strtolower($ans) == 'y') {
    $builder->save();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment