Last active
August 10, 2018 06:11
-
-
Save wudi/61d695f28fd29ccafca7b99908c94fb0 to your computer and use it in GitHub Desktop.
Demo search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Demo search | |
* User: eagle<[email protected]> | |
* Date: 2018/8/10 | |
* Time: 10:41 | |
*/ | |
// Dictionary of known words. splitByWords() only emits a non-numeric token
// when it is a key of $wordsMap, which is built from this list.
$words = [
    // place names
    "杭州", "杭州市",
    "上海", "上海市",
    "闵行", "闵行区",
    // roads and landmarks
    "莲花路", "杭州路", "淮海路", "淮海中路",
    "人民广场",
    // single-character address units
    "号", "幢", "座", "楼", "弄", "路", "市", "省",
    // road suffixes and junctions
    "东路", "南路", "西路", "北路", "中路", "交叉口", "路口",
    // residential compounds
    "小区", "花园", "苑",
];

// word => position lookup; populated with array_flip($words) further down.
$wordsMap = [];

// Largest n-gram size to try; replaced by maxStep($words) further down.
$maxStep = 1;

// Placeholder: nothing in the visible code reads $synonyms.
$synonyms = [
    [""],
];

// Words that must never become tokens, even when they are in the dictionary.
$stopwords = [
    "此", "此间", "此外",
    "从", "从而",
    "打", "待",
    "但", "但是",
    "当", "当着",
    "到", "得",
    "的", "的话",
    "等", "等等",
    "地",
];

// stop word => position lookup; this exact name is what splitByWords() reads.
$stopWordsMap = [];

// Indexed sentences, addressed by their numeric position.
// Tokenisation example: 杭州路18号 -> 杭州、杭州路、18、号
$sentences = [];

// Inverted index: token => list of sentence positions containing that token.
// Example shape:
//   '杭州'   => [1],
//   '杭州路' => [1],
//   '号'     => [1, 2],
//   '18'     => [1],
//   '19'     => [2],
$indices = [];
/**
 * Add one sentence to the global inverted index.
 *
 * The sentence is tokenised with every window size from 1 to $maxStep and
 * $index is appended to the entry of each distinct token in $indices.
 *
 * @param string $sentence Sentence to index.
 * @param int    $index    Position of the sentence in the sentence list.
 * @param int    $maxStep  Largest window size (in characters) to tokenise with.
 * @return void
 */
function buildIndex($sentence, $index, $maxStep)
{
    global $indices;

    // Gather the tokens of every window size into one flat list.
    $tokens = [];
    $size = 1;
    while ($size <= $maxStep) {
        $tokens = array_merge($tokens, splitByWords($sentence, $size));
        $size++;
    }

    // De-duplicate so a sentence is recorded at most once per token.
    foreach (array_unique($tokens) as $token) {
        $indices[$token][] = $index;
    }
}
/**
 * Compute the CRC32 checksum of a string.
 *
 * @param string $str Text to hash.
 * @return int Whatever crc32() yields for $str.
 */
function hashIndex($str)
{
    $checksum = crc32($str);

    return $checksum;
}
/**
 * Extract the tokens of exactly $step characters from a string.
 *
 * A window of $step characters slides over $str one character at a time.
 * A window becomes a token when it is numeric (per is_numeric(), which skips
 * the dictionary and stop-word checks), or when it is a key of the global
 * $wordsMap and not a key of the global $stopWordsMap.
 *
 * Only full-width windows are examined. Fragments shorter than $step at the
 * end of the string are not emitted; they belong to the smaller step sizes.
 *
 * @param string $str  Text to tokenise.
 * @param int    $step Window size in characters; values below 1 yield no tokens.
 * @return array List of tokens in order of appearance (duplicates possible).
 */
function splitByWords($str, $step)
{
    global $stopWordsMap, $wordsMap;

    $words = [];
    if ($step < 1) {
        return $words;
    }

    // Last start offset at which a complete $step-character window still fits.
    $lastStart = mb_strlen($str) - $step;
    for ($i = 0; $i <= $lastStart; $i++) {
        $word = mb_substr($str, $i, $step);
        if (is_numeric($word)) {
            $words[] = $word;
            continue;
        }
        if (!isset($stopWordsMap[$word]) && isset($wordsMap[$word])) {
            $words[] = $word;
        }
    }

    return $words;
}
/**
 * Sort a list of words in place, longest first.
 *
 * Length is measured in characters (mb_strlen), not bytes, so multibyte
 * words are ranked the same way maxStep() measures them.
 *
 * @param array $words List of strings; reordered in place.
 * @return void
 */
function sortWords(array &$words)
{
    usort($words, function ($a, $b) {
        // Negative when $a is longer, so longer words sort to the front.
        return mb_strlen($b) - mb_strlen($a);
    });
}
/**
 * Find the character length of the longest word in a list.
 *
 * @param array $words List of strings (taken by reference, not modified).
 * @return int Longest mb_strlen() among the words, never less than 1.
 */
function maxStep(array &$words)
{
    $longest = 1;
    foreach ($words as $candidate) {
        $longest = max($longest, mb_strlen($candidate));
    }

    return $longest;
}
/**
 * Score indexed sentences against a query sentence.
 *
 * The query is tokenised with every window size from 1 to the global
 * $maxStep. Each distinct token that has an entry in the global $indices
 * adds one point per listed sentence id.
 *
 * @param string $sentence Query text.
 * @return array Map of sentence id => hit count, in first-hit order (unsorted);
 *               empty when nothing matches.
 */
function search($sentence)
{
    global $maxStep, $indices;

    // Flatten the tokens of all window sizes into a single list.
    $tokens = [];
    for ($size = 1; $size <= $maxStep; $size++) {
        $tokens = array_merge($tokens, splitByWords($sentence, $size));
    }

    $hits = [];
    foreach (array_unique($tokens) as $token) {
        if (!isset($indices[$token])) {
            continue;
        }
        foreach ($indices[$token] as $sid) {
            $hits[$sid] = isset($hits[$sid]) ? $hits[$sid] + 1 : 1;
        }
    }

    return $hits;
}
// Longest dictionary word, in characters: the upper bound for n-gram sizes.
$maxStep = maxStep($words);

// Flip the lists into key lookups so membership tests are isset() calls.
$wordsMap = array_flip($words);
// The name must be exactly $stopWordsMap: PHP variable names are
// case-sensitive and that is the global splitByWords() reads.
$stopWordsMap = array_flip($stopwords);

// Index any sentences that were pre-seeded in $sentences.
foreach ($sentences as $index => $sentence) {
    buildIndex($sentence, $index, $maxStep);
}

// Usage banner (printed in Chinese):
//   add <sentence>    - index a sentence
//   search <sentence> - list similar indexed sentences, highest weight first
//   fc <sentence>     - show the tokenisation of a sentence
echo <<<DOC
指令列表:
add 句子: 增加待索引语句
search 句子: 查询与该语句相似度较高的语句列表(权重高优先)
fc 句子: 测试分词结果
======================================================\n
DOC;
// Interactive command loop: reads one command per line from STDIN until EOF.
while (1) {
    echo "$ ";
    try {
        // No length limit: a byte cap could cut a multibyte character in half.
        $line = fgets(STDIN);
        if ($line === false) {
            // EOF or read error. Without this check the loop would spin
            // forever, because trim(false) is just an empty command.
            break;
        }
        $buffer = trim($line);

        // add <sentence>: store the sentence and index it under its position.
        if (preg_match('/^add \s*([^\n]+)/i', $buffer, $matches)) {
            $sentence = $matches[1];
            $sentences[] = $sentence;
            buildIndex($sentence, count($sentences) - 1, $maxStep);
            echo "[OK]: '$sentence'\n";
            continue;
        }

        // search <sentence>: print matching sentences, highest weight first.
        if (preg_match('/^search \s*([^\n]+)/i', $buffer, $matches)) {
            if ($sids = search($matches[1])) {
                arsort($sids);
                foreach ($sids as $id => $weight) {
                    printf("[Weight:%d] %s\n", $weight, $sentences[$id]);
                }
            } else {
                echo "Not Found\n";
            }
            continue;
        }

        // fc <sentence>: print the distinct tokens the sentence splits into.
        if (preg_match('/^fc \s*([^\n]+)/i', $buffer, $matches)) {
            // Use a separate variable so the global $words dictionary is not overwritten.
            $tokens = [];
            for ($step = 1; $step <= $maxStep; $step++) {
                foreach (splitByWords($matches[1], $step) as $word) {
                    $tokens[] = $word;
                }
            }
            echo implode("、", array_unique($tokens)) . "\n";
            continue;
        }

        // Compare with '' rather than empty(), so the input "0" is also
        // reported as an invalid command.
        if ($buffer !== '') {
            echo "Invalid command\n";
        }
    } catch (Exception $e) {
        echo "ERROR: {$e->getMessage()}\n";
    }
}
Author
wudi
commented
Aug 10, 2018
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment