Last active
August 10, 2018 06:11
-
-
Save wudi/61d695f28fd29ccafca7b99908c94fb0 to your computer and use it in GitHub Desktop.
Demo search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| /** | |
| * Demo search | |
| * User: eagle<[email protected]> | |
| * Date: 2018/8/10 | |
| * Time: 10:41 | |
| */ | |
// Dictionary of known address terms. splitByWords() only emits a non-numeric
// token when it is a key of $wordsMap, which is built from this list.
$words = [
    // Cities and districts.
    '杭州', '杭州市', '上海', '上海市', '闵行', '闵行区',
    // Named roads and places.
    '莲花路', '杭州路', '淮海路', '淮海中路', '人民广场',
    // Single-character address units.
    '号', '幢', '座', '楼', '弄', '路', '市', '省',
    // Road suffixes and junctions.
    '东路', '南路', '西路', '北路', '中路', '交叉口', '路口',
    // Residential compounds.
    '小区', '花园', '苑',
];

// Lookup map (word => position); recomputed from $words during start-up.
$wordsMap = [];

// Longest dictionary word in characters; recomputed from $words during start-up.
$maxStep = 1;

// Placeholder synonym groups; not read anywhere in the visible source.
$synonyms = [
    [''],
];

// Words that should never become tokens.
$stopwords = [
    '此', '此间', '此外', '从', '从而', '打', '待', '但',
    '但是', '当', '当着', '到', '得', '的', '的话', '等', '等等', '地',
];

// Stop-word lookup consulted by splitByWords() (keys are the stop words).
$stopWordsMap = [];

// Indexed sentences, addressed by their numeric position.
// Tokenisation example: '杭州路18号' -> '杭州', '杭州路', '18', '号'.
$sentences = [];

// Inverted index: token => list of sentence positions that contain it, e.g.
//   '杭州'   => [1],
//   '杭州路' => [1],
//   '号'     => [1, 2],
//   '18'     => [1],
//   '19'     => [2],
$indices = [];
/**
 * Adds one sentence to the global inverted index.
 *
 * The sentence is tokenised with splitByWords() for every window width from
 * 1 up to $maxStep; each distinct token then gets $index appended to its
 * posting list in the global $indices array.
 *
 * @param string $sentence Sentence to index.
 * @param int    $index    Identifier stored in the posting lists.
 * @param int    $maxStep  Largest window width (in characters) to tokenise with.
 */
function buildIndex($sentence, $index, $maxStep)
{
    global $indices;

    // Gather the tokens of every window width into one flat list.
    $tokens = [];
    for ($width = 1; $width <= $maxStep; ++$width) {
        $tokens = array_merge($tokens, splitByWords($sentence, $width));
    }

    // A token is recorded once per sentence, however often it occurs.
    $distinct = array_unique($tokens);
    foreach ($distinct as $token) {
        $indices[$token][] = $index;
    }
}
/**
 * Returns the CRC32 checksum of a string.
 *
 * Not called anywhere in the visible source.
 *
 * @param string $str Text to hash.
 * @return int Checksum as produced by crc32().
 */
function hashIndex($str)
{
    $checksum = crc32($str);

    return $checksum;
}
/**
 * Extracts the tokens of exactly $step characters from a string.
 *
 * A window of $step characters slides over $str one character at a time.
 * A window becomes a token when it is numeric (per is_numeric()) or when it
 * is a key of the global $wordsMap and not a key of the global $stopWordsMap.
 *
 * The window start is bounded so that every window is a full $step characters
 * long. Windows that would run past the end of the string are skipped: they
 * could only repeat tokens that a smaller $step already yields.
 *
 * @param string $str  Text to tokenise (multibyte-safe).
 * @param int    $step Window width in characters; values below 1 yield no tokens.
 * @return array List of tokens in order of appearance (may contain repeats).
 */
function splitByWords($str, $step)
{
    global $stopWordsMap, $wordsMap;

    $words = [];
    if ($step < 1) {
        return $words;
    }

    // Last start offset that still leaves room for a full window.
    $lastStart = mb_strlen($str) - $step;
    for ($i = 0; $i <= $lastStart; $i++) {
        $word = mb_substr($str, $i, $step);

        // Numeric windows (e.g. house numbers) are always kept.
        if (is_numeric($word)) {
            $words[] = $word;
            continue;
        }

        if (!isset($stopWordsMap[$word]) && isset($wordsMap[$word])) {
            $words[] = $word;
        }
    }

    return $words;
}
/**
 * Sorts words in place, longest first.
 *
 * Length is measured in characters with mb_strlen(), matching how maxStep()
 * and splitByWords() measure words. A byte-based comparison would rank a
 * short multibyte word above a longer ASCII one.
 *
 * @param array $words List of strings; reordered and re-indexed in place.
 */
function sortWords(array &$words)
{
    usort($words, function ($a, $b) {
        // Positive when $b is longer, so longer words sort first.
        return mb_strlen($b) - mb_strlen($a);
    });
}
/**
 * Finds the character length of the longest word in a list.
 *
 * @param array $words List of strings; taken by reference but not modified.
 * @return int Longest length in characters, never less than 1.
 */
function maxStep(array &$words)
{
    $longest = 1;

    foreach ($words as $candidate) {
        $length = mb_strlen($candidate);
        if ($length > $longest) {
            $longest = $length;
        }
    }

    return $longest;
}
/**
 * Scores indexed sentences against a query sentence.
 *
 * The query is tokenised with splitByWords() for every window width from 1 up
 * to the global $maxStep. Each distinct token found in the global $indices
 * adds one point to every sentence id in that token's posting list.
 *
 * @param string $sentence Query text.
 * @return array Map of sentence id => number of matching postings (unsorted);
 *               empty when nothing matches.
 */
function search($sentence)
{
    global $maxStep, $indices;

    // Gather the query tokens of every window width.
    $tokens = [];
    for ($width = 1; $width <= $maxStep; ++$width) {
        $tokens = array_merge($tokens, splitByWords($sentence, $width));
    }

    $scores = [];
    foreach (array_unique($tokens) as $token) {
        if (!isset($indices[$token])) {
            continue;
        }
        foreach ($indices[$token] as $sid) {
            $scores[$sid] = isset($scores[$sid]) ? $scores[$sid] + 1 : 1;
        }
    }

    return $scores;
}
// Start-up: derive the lookup structures from the dictionaries.
$maxStep = maxStep($words);       // widest tokenisation window = longest dictionary word
$wordsMap = array_flip($words);   // word => position, for isset() membership checks

// The name must be spelled exactly $stopWordsMap: PHP variable names are
// case-sensitive and splitByWords() reads the global under this spelling.
// Assigning to a differently-cased name leaves the stop-word filter empty.
$stopWordsMap = array_flip($stopwords);

// Index any sentences that were declared up front.
foreach ($sentences as $index => $sentence) {
    buildIndex($sentence, $index, $maxStep);
}
// Print the command reference once, before the prompt loop starts.
echo <<<DOC
指令列表:
add 句子: 增加待索引语句
search 句子: 查询与该语句相似度较高的语句列表(权重高优先)
fc 句子: 测试分词结果
======================================================\n
DOC;

// Interactive prompt: one command per line, read from STDIN until end of input.
//   add <sentence>     index a new sentence
//   search <sentence>  list indexed sentences, highest weight first
//   fc <sentence>      show the tokens the sentence is split into
while (true) {
    echo "$ ";

    // Read the whole line (no byte cap, so a long line is never cut in the
    // middle of a multibyte character).
    $line = fgets(STDIN);
    if ($line === false) {
        // End of input (Ctrl-D or exhausted pipe): stop instead of spinning
        // forever on an empty buffer.
        echo "\n";
        break;
    }

    $buffer = trim($line);
    if ($buffer === '') {
        continue;
    }

    try {
        // Command name, one space, optional extra whitespace, then the argument.
        if (!preg_match('/^(add|search|fc) \s*([^\n]+)/i', $buffer, $matches)) {
            echo "Invalid command\n";
            continue;
        }

        $argument = $matches[2];
        switch (strtolower($matches[1])) {
            case 'add':
                $sentences[] = $argument;
                // The new sentence's id is its position in $sentences.
                buildIndex($argument, count($sentences) - 1, $maxStep);
                echo "[OK]: '$argument'\n";
                break;

            case 'search':
                $sids = search($argument);
                if ($sids) {
                    // Highest weight first; keys (sentence ids) are preserved.
                    arsort($sids);
                    foreach ($sids as $id => $weight) {
                        printf("[Weight:%d] %s\n", $weight, $sentences[$id]);
                    }
                } else {
                    echo "Not Found\n";
                }
                break;

            case 'fc':
                $tokens = [];
                for ($step = 1; $step <= $maxStep; $step++) {
                    foreach (splitByWords($argument, $step) as $token) {
                        $tokens[] = $token;
                    }
                }
                echo implode("、", array_unique($tokens)) . "\n";
                break;
        }
    } catch (Exception $e) {
        // Best effort: report the problem and keep the prompt alive.
        echo "ERROR: {$e->getMessage()}\n";
    }
}
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
→ php search.php [abbf743f] 指令列表: add 句子: 增加待索引语句 search 句子: 查询与该语句相似度较高的语句列表(权重高优先) fc 句子: 测试分词结果 ====================================================== $ $ $ $ add 上海市淮海中路131号 [OK]: '上海市淮海中路131号' $ add 上海市四川中路21号 [OK]: '上海市四川中路21号' $ add 杭州路18号 [OK]: '杭州路18号' $ add 杭州路19号 [OK]: '杭州路19号' $ $ $ search 上海市淮海中路145号 [Weight:8] 上海市淮海中路131号 [Weight:7] 上海市四川中路21号 [Weight:3] 杭州路18号 [Weight:3] 杭州路19号 $ $ $ search 杭州路18楼 [Weight:6] 杭州路18号 [Weight:4] 杭州路19号 [Weight:2] 上海市淮海中路131号 [Weight:2] 上海市四川中路21号 $ $ fc 从杭州路18号与淮海中路路口向南罗马小区 路、1、8、号、杭州、18、中路、路口、小区、杭州路、淮海中路 $