Skip to content

Instantly share code, notes, and snippets.

@onlytiancai
Forked from onlytiancai/splid_words.html
Created August 30, 2011 03:54
Show Gist options
  • Save onlytiancai/1180139 to your computer and use it in GitHub Desktop.
Save onlytiancai/1180139 to your computer and use it in GitHub Desktop.
JavaScript 实现的正向最大匹配机械分词
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>test</title>
</head>
<body>
<script type="text/javascript">
/*
* @description Raw word dictionary: a flat list of the words the segmenter knows.
*/
var word_dict = [
    '中',
    '华',
    '人',
    '民',
    '人民',
    '共和',
    '国',
    '共和国',
    '伟大',
    '国家'
];
/*
* @description Formats the dictionary: groups the words by length and orders
*              the groups from the longest words to the shortest.
* @param {Array of String} dict The flat word dictionary
* @return {Array} Array of word groups. Each group is an array holding all
*                 words of one length (in dictionary order); the groups are
*                 sorted by descending word length.
*/
var format_dict = function (dict) {
    var buckets = [];
    var result = [];
    var i, len, word;

    // 1. Bucket the words by length. An array indexed by the length is used
    //    instead of an object map, so no key enumeration (for...in) or
    //    string-to-number parsing is needed, and enumerable properties added
    //    to Object.prototype cannot leak into the result.
    for (i = 0; i < dict.length; i++) {
        word = dict[i];
        if (!buckets[word.length]) {
            buckets[word.length] = [];
        }
        buckets[word.length].push(word);
    }

    // 2. Collect the non-empty buckets from the greatest length down to the
    //    smallest, which yields the groups in descending length order.
    for (len = buckets.length - 1; len >= 0; len--) {
        if (buckets[len]) {
            result.push(buckets[len]);
        }
    }
    return result;
};
/*
* @description Finds the first dictionary word that occurs in the input at the
*              given position, trying the groups in the order supplied (the
*              formatted dictionary lists the longest words first).
* @param {String} input The string being matched
* @param {int} pos Position in the input at which the word must start
* @param {Array} dict The formatted dictionary (array of word groups)
* @return {String} The matched word, or an empty string when nothing matches
*/
var match_words = function (input, pos, dict) {
    var groupIndex, wordIndex, candidates, candidate;
    for (groupIndex = 0; groupIndex < dict.length; groupIndex += 1) {
        candidates = dict[groupIndex];
        for (wordIndex = 0; wordIndex < candidates.length; wordIndex += 1) {
            candidate = candidates[wordIndex];
            // Skip candidates that do not appear exactly at `pos`.
            if (input.substr(pos, candidate.length) !== candidate) {
                continue;
            }
            return candidate;
        }
    }
    // No group contained a word starting at `pos`.
    return "";
};
/*
* @description Segments the string by forward maximum matching: at each
*              position the word returned by match_words is taken; when no
*              dictionary word matches, a single character is emitted.
* @return {Array of String} The segments, in order of appearance
*/
String.prototype.SplitWords = function () {
    var text = this.valueOf();
    var len = text.length;
    var result = [];
    var pos = 0;
    var match_word, width, code, next;
    // The dictionary is formatted on every call, so the current contents of
    // word_dict are always used.
    var formated_dict = format_dict(word_dict);
    while (pos < len) {
        match_word = match_words(text, pos, formated_dict);
        if (match_word.length > 0) {
            result.push(match_word);
            pos += match_word.length;
            continue;
        }
        // No dictionary word starts here: emit one character. A character
        // outside the Basic Multilingual Plane occupies two UTF-16 code units
        // (high surrogate + low surrogate); keep the pair together instead of
        // splitting it into two unpaired surrogates.
        width = 1;
        code = text.charCodeAt(pos);
        if (code >= 0xD800 && code <= 0xDBFF && pos + 1 < len) {
            next = text.charCodeAt(pos + 1);
            if (next >= 0xDC00 && next <= 0xDFFF) {
                width = 2;
            }
        }
        result.push(text.slice(pos, pos + width));
        pos += width;
    }
    return result;
};
// Demo: segment a sample sentence and write the comma-separated segments
// into the document.
document.writeln("中华人民共和国是最伟大的国家".SplitWords().join(","));
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment