-
-
Save onlytiancai/1180139 to your computer and use it in GitHub Desktop.
javascript实现的正向最大匹配机械分词
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>test</title>
</head>
<body>
<script type="text/javascript">
/*
 * @description Raw word dictionary: the flat, unordered list of words the
 *              segmenter is allowed to recognise. It is regrouped by word
 *              length before matching, so the order here does not matter.
 */
var word_dict = ['中', '华', '人', '民', '人民', '共和', '国', '共和国','伟大','国家'];
/*
 * @description Formats a flat dictionary for longest-match lookup: words are
 *              bucketed by length and the buckets are returned longest first.
 *              Inside a bucket, words keep their original dictionary order.
 * @param {Array of String} dict Flat word list
 * @return {Array of Array of String} One array per distinct word length,
 *         ordered by descending length
 */
var format_dict = function (dict) {
    // 1. Bucket the words by length. Each distinct length is recorded (as a
    //    number) the first time it is seen, so no for...in scan is needed
    //    later and enumerable additions to Object.prototype cannot leak in.
    var groups = {};
    var lengths = [];
    for (var i = 0; i < dict.length; i++) {
        var word = dict[i];
        var size = word.length;
        if (!Object.prototype.hasOwnProperty.call(groups, size)) {
            groups[size] = [];
            lengths.push(size);
        }
        groups[size].push(word);
    }
    // 2. Longest first. The lengths are numbers, so the comparison is numeric
    //    (10 sorts before 9) without any string parsing.
    lengths.sort(function (a, b) { return b - a; });
    // 3. Emit the buckets in that order.
    var result = [];
    for (var k = 0; k < lengths.length; k++) {
        result.push(groups[lengths[k]]);
    }
    return result;
};
/*
 * @description Finds the first dictionary word that occurs in the input
 *              exactly at the given position. Groups are tried in the order
 *              given, so with a dictionary ordered longest-first the result
 *              is the longest possible match.
 * @param {String} input Text being segmented
 * @param {int} pos Non-negative index in input at which the word must start
 * @param {Array of Array of String} dict Formatted dictionary (groups of words)
 * @return {String} The matched word, or an empty string when nothing matches
 */
var match_words = function (input, pos, dict) {
    for (var i = 0; i < dict.length; i++) {
        var group = dict[i];
        for (var j = 0; j < group.length; j++) {
            var candidate = group[j];
            // An empty entry can never be a real match; "" is reserved for
            // the "nothing matched" return value.
            if (candidate.length === 0) {
                continue;
            }
            // substring() replaces the deprecated substr(); near the end of
            // the input the slice is simply shorter and fails the comparison.
            if (input.substring(pos, pos + candidate.length) === candidate) {
                return candidate;
            }
        }
    }
    return "";
};
/*
 * @description Segments this string with forward maximum matching: at each
 *              position the longest dictionary word starting there is taken;
 *              if no word matches, a single character is emitted instead.
 *              The dictionary is read from the global word_dict on every call.
 * @return {Array of String} Tokens in order; concatenated they reproduce the
 *         original string
 */
String.prototype.SplitWords = function () {
    // Take the primitive value once instead of calling valueOf() per step.
    var text = String(this);
    var tokens = [];
    var pos = 0;
    var dict = format_dict(word_dict);
    while (pos < text.length) {
        var matched = match_words(text, pos, dict);
        if (matched.length > 0) {
            tokens.push(matched);
            pos += matched.length;
        } else {
            // No dictionary word starts here: emit one character. A character
            // outside the BMP occupies two UTF-16 code units (a surrogate
            // pair); keep the pair together so the token is a valid character
            // rather than two lone surrogates.
            var step = 1;
            var lead = text.charCodeAt(pos);
            if (lead >= 0xD800 && lead <= 0xDBFF && pos + 1 < text.length) {
                var trail = text.charCodeAt(pos + 1);
                if (trail >= 0xDC00 && trail <= 0xDFFF) {
                    step = 2;
                }
            }
            tokens.push(text.substring(pos, pos + step));
            pos += step;
        }
    }
    return tokens;
};
// Demo: segment a sample sentence and write the comma-joined tokens into the page.
document.writeln("中华人民共和国是最伟大的国家".SplitWords().join());
</script>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment