wudi · August 10, 2018 06:11 · wudi · Aug 10, 2018
diff --git a/search.php b/search.php
 <?php
 /**
 * Demo search
 * User: eagle<[email protected]>
 * Date: 2018/8/10
 * Time: 10:41
 */

 // Dictionary of known words; only substrings listed here (or numeric
 // substrings) are emitted as tokens by splitByWords().
 $words = [
    "杭州", "杭州市",
    "上海", "上海市",
    "闵行", "闵行区",
    "莲花路", "杭州路", "淮海路", "淮海中路",
    "人民广场",
    "号", "幢", "座", "楼",
    "弄", "路", "市", "省",
    "东路", "南路", "西路", "北路", "中路",
    "交叉口", "路口",
    "小区", "花园", "苑",
 ];
 // word => position lookup; filled from $words during start-up.
 $wordsMap = [];
 // Largest window size (in characters) to try; recomputed during start-up.
 $maxStep = 1;

 // Placeholder for synonym groups (not consumed by the code in this file).
 $synonyms = [[""]];

 // Words that must never be emitted as tokens.
 $stopwords = [
    "此", "此间", "此外",
    "从", "从而",
    "打", "待",
    "但", "但是",
    "当", "当着",
    "到", "得",
    "的", "的话",
    "等", "等等",
    "地",
 ];
 // stop word => position lookup.
 $stopWordsMap = [];

 // Indexed sentences. Example: "杭州路18号" is tokenized into
 // 杭州, 杭州路, 18 and 号.
 $sentences = [];

 // Inverted index: token => list of sentence indices containing it, e.g.
 //   '杭州' => [1], '杭州路' => [1], '号' => [1, 2], '18' => [1], '19' => [2]
 $indices = [];

 /**
  * Tokenize a sentence and record it in the global inverted index.
  *
  * The sentence is split with every window size from 1 to $maxStep; each
  * distinct token then gets $index appended to its list in $indices.
  *
  * @param string $sentence Sentence to index.
  * @param int    $index    Identifier stored in the index for this sentence.
  * @param int    $maxStep  Largest window size (in characters) to try.
  */
 function buildIndex($sentence, $index, $maxStep)
 {
    global $indices;

    $tokens = [];
    $step = 1;
    while ($step <= $maxStep) {
        $tokens = array_merge($tokens, splitByWords($sentence, $step));
        $step++;
    }

    foreach (array_unique($tokens) as $token) {
        $indices[$token][] = $index;
    }
 }

 /**
  * Return the CRC32 checksum of a string.
  *
  * @param string $str Input string.
  * @return int Value produced by crc32() for $str.
  */
 function hashIndex($str)
 {
    $checksum = crc32($str);

    return $checksum;
 }

 /**
  * Extract tokens of exactly $step characters from a string.
  *
  * A window of $step characters is slid over $str one character at a time.
  * A window is emitted when it is numeric, or when it is present in the
  * global $wordsMap and absent from the global $stopWordsMap.
  *
  * Only full-size windows are examined: windows at the tail that would be
  * shorter than $step are skipped, because they are already produced by the
  * calls with a smaller $step.
  *
  * @param string $str  Text to tokenize.
  * @param int    $step Window size in characters.
  * @return array List of tokens in order of appearance (may contain repeats).
  */
 function splitByWords($str, $step)
 {
    global $stopWordsMap, $wordsMap;

    $words = [];
    // Last start offset at which a full $step-character window still fits.
    $last = mb_strlen($str) - $step;
    for ($i = 0; $i <= $last; $i++) {
        $word = mb_substr($str, $i, $step);
        if (is_numeric($word)) {
            $words[] = $word;
            continue;
        }
        if (!isset($stopWordsMap[$word]) && isset($wordsMap[$word])) {
            $words[] = $word;
        }
    }
    return $words;
 }

 /**
  * Sort words in place from longest to shortest.
  *
  * Length is measured in characters (mb_strlen), matching how the rest of
  * the tokenizer measures words; byte length would rank a two-character
  * multi-byte word above a four-letter ASCII one.
  *
  * @param array $words List of words; re-indexed and reordered in place.
  */
 function sortWords(array &$words)
 {
    usort($words, function ($a, $b) {
        // Positive when $b is longer, so longer words come first.
        return mb_strlen($b) - mb_strlen($a);
    });
 }

 /**
  * Return the character length of the longest word, with a minimum of 1.
  *
  * @param array $words List of words (not modified).
  * @return int Longest mb_strlen() among $words, or 1 when none is longer.
  */
 function maxStep(array &$words)
 {
    $longest = 1;
    foreach ($words as $word) {
        $length = mb_strlen($word);
        if ($length > $longest) {
            $longest = $length;
        }
    }

    return $longest;
 }

 /**
  * Score indexed sentences against a query sentence.
  *
  * The query is tokenized with every window size from 1 to the global
  * $maxStep; each distinct token adds one point to every sentence listed
  * for it in the global $indices.
  *
  * @param string $sentence Query text.
  * @return array Map of sentence index => number of distinct matching tokens.
  */
 function search($sentence)
 {
    global $maxStep, $indices;

    $tokens = [];
    $step = 1;
    while ($step <= $maxStep) {
        $tokens = array_merge($tokens, splitByWords($sentence, $step));
        $step++;
    }

    $hits = [];
    foreach (array_unique($tokens) as $token) {
        if (!isset($indices[$token])) {
            continue;
        }
        foreach ($indices[$token] as $sentenceId) {
            $hits[$sentenceId] = isset($hits[$sentenceId]) ? $hits[$sentenceId] + 1 : 1;
        }
    }

    return $hits;
 }

 // Derive the lookup tables read by splitByWords() from the word lists.
 $maxStep = maxStep($words);
 $wordsMap = array_flip($words);
 // The name must be $stopWordsMap (capital W): PHP variable names are
 // case-sensitive and splitByWords() reads the global under exactly this name.
 $stopWordsMap = array_flip($stopwords);

 // Index every sentence already present in $sentences.
 foreach ($sentences as $index => $sentence) {
    buildIndex($sentence, $index, $maxStep);
 }

 // Print the usage banner listing the supported commands:
 //   add <sentence>    - add a sentence to the index
 //   search <sentence> - list similar indexed sentences, highest weight first
 //   fc <sentence>     - show the tokenization of a sentence
 echo <<<DOC
 指令列表：
    add 句子:     增加待索引语句
    search 句子:  查询与该语句相似度较高的语句列表（权重高优先）
    fc 句子:      测试分词结果
 ======================================================\n
 DOC;


 // Interactive prompt: read one command per line until STDIN reaches EOF.
 while (1) {
    echo "$ ";
    try {
        $line = fgets(STDIN, 1024);
        if ($line === false) {
            // fgets() returns false on EOF or read error. Without this check
            // the value is trimmed to "" and the loop spins forever.
            break;
        }
        $buffer = trim($line);

        // add <sentence>: store the sentence and index it under its position.
        if (preg_match('/^add \s*([^\n]+)/i', $buffer, $matches)) {
            $sentence = $matches[1];
            $sentences[] = $sentence;
            buildIndex($sentence, count($sentences) - 1, $maxStep);
            echo "[OK]: '$sentence'\n";
            continue;
        }

        // search <sentence>: print matching sentences, highest weight first.
        if (preg_match('/^search \s*([^\n]+)/i', $buffer, $matches)) {
            if ($sids = search($matches[1])) {
                arsort($sids);
                foreach ($sids as $id => $weight)
                    echo sprintf("[Weight:%d] %s\n", $weight, $sentences[$id]);
            } else {
                echo "Not Found\n";
            }
            continue;
        }

        // fc <sentence>: print the distinct tokens found in the sentence.
        if (preg_match('/^fc \s*([^\n]+)/i', $buffer, $matches)) {
            // Dedicated variable so the global $words dictionary is not overwritten.
            $tokens = [];
            for ($step = 1; $step <= $maxStep; $step++) {
                foreach (splitByWords($matches[1], $step) as $token)
                    $tokens[] = $token;
            }
            echo implode("、", array_unique($tokens)) . "\n";
            continue;
        }

        // Strict comparison so that the input "0" is reported as invalid too.
        if ($buffer !== '')
            echo "Invalid command\n";
    } catch (Exception $e) {
        echo "ERROR: {$e->getMessage()}\n";
    }
 }
	<?php
	/**
	* Demo search
	* User: eagle<[email protected]>
	* Date: 2018/8/10
	* Time: 10:41
	*/

	// Dictionary of known words; only substrings listed here (or numeric
	// substrings) are emitted as tokens by splitByWords().
	$words = [
	    "杭州", "杭州市",
	    "上海", "上海市",
	    "闵行", "闵行区",
	    "莲花路", "杭州路", "淮海路", "淮海中路",
	    "人民广场",
	    "号", "幢", "座", "楼",
	    "弄", "路", "市", "省",
	    "东路", "南路", "西路", "北路", "中路",
	    "交叉口", "路口",
	    "小区", "花园", "苑",
	];
	// word => position lookup; filled from $words during start-up.
	$wordsMap = [];
	// Largest window size (in characters) to try; recomputed during start-up.
	$maxStep = 1;

	// Placeholder for synonym groups (not consumed by the code in this file).
	$synonyms = [[""]];

	// Words that must never be emitted as tokens.
	$stopwords = [
	    "此", "此间", "此外",
	    "从", "从而",
	    "打", "待",
	    "但", "但是",
	    "当", "当着",
	    "到", "得",
	    "的", "的话",
	    "等", "等等",
	    "地",
	];
	// stop word => position lookup.
	$stopWordsMap = [];

	// Indexed sentences. Example: "杭州路18号" is tokenized into
	// 杭州, 杭州路, 18 and 号.
	$sentences = [];

	// Inverted index: token => list of sentence indices containing it, e.g.
	//   '杭州' => [1], '杭州路' => [1], '号' => [1, 2], '18' => [1], '19' => [2]
	$indices = [];

	/**
	 * Tokenize a sentence and record it in the global inverted index.
	 *
	 * The sentence is split with every window size from 1 to $maxStep; each
	 * distinct token then gets $index appended to its list in $indices.
	 *
	 * @param string $sentence Sentence to index.
	 * @param int    $index    Identifier stored in the index for this sentence.
	 * @param int    $maxStep  Largest window size (in characters) to try.
	 */
	function buildIndex($sentence, $index, $maxStep)
	{
	    global $indices;

	    $tokens = [];
	    $step = 1;
	    while ($step <= $maxStep) {
	        $tokens = array_merge($tokens, splitByWords($sentence, $step));
	        $step++;
	    }

	    foreach (array_unique($tokens) as $token) {
	        $indices[$token][] = $index;
	    }
	}

	/**
	 * Return the CRC32 checksum of a string.
	 *
	 * @param string $str Input string.
	 * @return int Value produced by crc32() for $str.
	 */
	function hashIndex($str)
	{
	    $checksum = crc32($str);

	    return $checksum;
	}

	/**
	 * Extract tokens of exactly $step characters from a string.
	 *
	 * A window of $step characters is slid over $str one character at a time.
	 * A window is emitted when it is numeric, or when it is present in the
	 * global $wordsMap and absent from the global $stopWordsMap.
	 *
	 * Only full-size windows are examined: windows at the tail that would be
	 * shorter than $step are skipped, because they are already produced by the
	 * calls with a smaller $step.
	 *
	 * @param string $str  Text to tokenize.
	 * @param int    $step Window size in characters.
	 * @return array List of tokens in order of appearance (may contain repeats).
	 */
	function splitByWords($str, $step)
	{
	    global $stopWordsMap, $wordsMap;

	    $words = [];
	    // Last start offset at which a full $step-character window still fits.
	    $last = mb_strlen($str) - $step;
	    for ($i = 0; $i <= $last; $i++) {
	        $word = mb_substr($str, $i, $step);
	        if (is_numeric($word)) {
	            $words[] = $word;
	            continue;
	        }
	        if (!isset($stopWordsMap[$word]) && isset($wordsMap[$word])) {
	            $words[] = $word;
	        }
	    }
	    return $words;
	}

	/**
	 * Sort words in place from longest to shortest.
	 *
	 * Length is measured in characters (mb_strlen), matching how the rest of
	 * the tokenizer measures words; byte length would rank a two-character
	 * multi-byte word above a four-letter ASCII one.
	 *
	 * @param array $words List of words; re-indexed and reordered in place.
	 */
	function sortWords(array &$words)
	{
	    usort($words, function ($a, $b) {
	        // Positive when $b is longer, so longer words come first.
	        return mb_strlen($b) - mb_strlen($a);
	    });
	}

	/**
	 * Return the character length of the longest word, with a minimum of 1.
	 *
	 * @param array $words List of words (not modified).
	 * @return int Longest mb_strlen() among $words, or 1 when none is longer.
	 */
	function maxStep(array &$words)
	{
	    $longest = 1;
	    foreach ($words as $word) {
	        $length = mb_strlen($word);
	        if ($length > $longest) {
	            $longest = $length;
	        }
	    }

	    return $longest;
	}

	/**
	 * Score indexed sentences against a query sentence.
	 *
	 * The query is tokenized with every window size from 1 to the global
	 * $maxStep; each distinct token adds one point to every sentence listed
	 * for it in the global $indices.
	 *
	 * @param string $sentence Query text.
	 * @return array Map of sentence index => number of distinct matching tokens.
	 */
	function search($sentence)
	{
	    global $maxStep, $indices;

	    $tokens = [];
	    $step = 1;
	    while ($step <= $maxStep) {
	        $tokens = array_merge($tokens, splitByWords($sentence, $step));
	        $step++;
	    }

	    $hits = [];
	    foreach (array_unique($tokens) as $token) {
	        if (!isset($indices[$token])) {
	            continue;
	        }
	        foreach ($indices[$token] as $sentenceId) {
	            $hits[$sentenceId] = isset($hits[$sentenceId]) ? $hits[$sentenceId] + 1 : 1;
	        }
	    }

	    return $hits;
	}

	// Derive the lookup tables read by splitByWords() from the word lists.
	$maxStep = maxStep($words);
	$wordsMap = array_flip($words);
	// The name must be $stopWordsMap (capital W): PHP variable names are
	// case-sensitive and splitByWords() reads the global under exactly this name.
	$stopWordsMap = array_flip($stopwords);

	// Index every sentence already present in $sentences.
	foreach ($sentences as $index => $sentence) {
	    buildIndex($sentence, $index, $maxStep);
	}

	// Print the usage banner listing the supported commands:
	//   add <sentence>    - add a sentence to the index
	//   search <sentence> - list similar indexed sentences, highest weight first
	//   fc <sentence>     - show the tokenization of a sentence
	echo <<<DOC
	指令列表：
	add 句子: 增加待索引语句
	search 句子: 查询与该语句相似度较高的语句列表（权重高优先）
	fc 句子: 测试分词结果
	======================================================\n
	DOC;


	// Interactive prompt: read one command per line until STDIN reaches EOF.
	while (1) {
	    echo "$ ";
	    try {
	        $line = fgets(STDIN, 1024);
	        if ($line === false) {
	            // fgets() returns false on EOF or read error. Without this check
	            // the value is trimmed to "" and the loop spins forever.
	            break;
	        }
	        $buffer = trim($line);

	        // add <sentence>: store the sentence and index it under its position.
	        if (preg_match('/^add \s*([^\n]+)/i', $buffer, $matches)) {
	            $sentence = $matches[1];
	            $sentences[] = $sentence;
	            buildIndex($sentence, count($sentences) - 1, $maxStep);
	            echo "[OK]: '$sentence'\n";
	            continue;
	        }

	        // search <sentence>: print matching sentences, highest weight first.
	        if (preg_match('/^search \s*([^\n]+)/i', $buffer, $matches)) {
	            if ($sids = search($matches[1])) {
	                arsort($sids);
	                foreach ($sids as $id => $weight)
	                    echo sprintf("[Weight:%d] %s\n", $weight, $sentences[$id]);
	            } else {
	                echo "Not Found\n";
	            }
	            continue;
	        }

	        // fc <sentence>: print the distinct tokens found in the sentence.
	        if (preg_match('/^fc \s*([^\n]+)/i', $buffer, $matches)) {
	            // Dedicated variable so the global $words dictionary is not overwritten.
	            $tokens = [];
	            for ($step = 1; $step <= $maxStep; $step++) {
	                foreach (splitByWords($matches[1], $step) as $token)
	                    $tokens[] = $token;
	            }
	            echo implode("、", array_unique($tokens)) . "\n";
	            continue;
	        }

	        // Strict comparison so that the input "0" is reported as invalid too.
	        if ($buffer !== '')
	            echo "Invalid command\n";
	    } catch (Exception $e) {
	        echo "ERROR: {$e->getMessage()}\n";
	    }
	}
No results found