- 拼写错误
- 错别字
Levenshtein, n-gram
pg_trgm
select similarity('yue mian ke ji', 'yu mian ke ji')
fuzzystrmatch
SELECT levenshtein('yue mian ke ji', 'yu mian ke ji');
soundex, Metaphone
pg_similarity
fuzzymatch
select soundex('too'), soundex('two'),difference('too', 'two');
达观数据搜索引擎的Query自动纠错技术和架构详解
- 中文词语比较短,编辑距离的候选词太多
- 一般多是同音字 全城热恋 全城热炼 会租车 惠租车 薪人薪事
- 别名 阅面科技 阅面readface, 魔镜在线 魔镜online 魔镜科技,玻璃博士 玻璃doctor
- 查看是否有错
name like %query% exists
- 拼音 name_pinyin like %query_pinyin%
客机(ke ji) --> 奇艺科技(qi yi ke ji) --> extract 科技(ke ji) by pinyin
月面科技(yue mian ke ji) --> 阅面科技(yue mian ke ji) --> extract 阅面科技(yue mian ke ji) by pinyin
- 候选词太多,order by updated_at.
- 字典直接获取,smlar 的字典 table 不支持多余 column,重建 table 损耗比较大
- 拼音 similarity(name_pinyin, query_pinyin)
- 候选词太多,order by similarity
玉面科技(yu mian ke ji) -> 阅面科技(yue mian ke ji)
特赞(te zan) -> 特脏(te zang)
- 参数比较难控制, 默认0.3
- 分词权重匹配
- 候选词太多, 权重相似度排序
- 魔镜在线(魔镜 在线) 魔镜科技(魔镜 科技)
- zhparser, smlar
string -> parser -> tokens -> dictionary -> lexemes
- dictionary: simple, synonym, thesaurus, ispell
- parsers
select * from pg_ts_parser
- parser configuration
\dF
- token types
select * from ts_token_type('zhparser');
- map token to dictionary
ALTER TEXT SEARCH CONFIGURATION name ADD MAPPING FOR token_type [, ... ] WITH dictionary_name [, ... ]
- 常用的 function 和 operator
-
to_tsvector()
,ts_debug()
,to_tsquery()
,ts_parse()
,ts_lexize()
-
to_tsvector('fat cats ate rats') @@ to_tsquery('cat & rat')
-
一个完整的例子
SELECT ts_rank(array[0.1, 0.2, 0.4, 1.0], setweight(to_tsvector('zhparser','六国灭亡,秦始皇统一了天下。蜀山的树木被伐光了,阿房宫才盖起来。阿房宫占地三百多里,楼阁高耸,遮天蔽日。'),'A'), to_tsquery('zhparser','秦始皇 & 蜀山 & 阿房宫'));
常见分词方法
- 字典 字符串匹配
- 统计 HMM
zhparser 底层调用 scws 提供的 lib, scws 基于词典的机械式分词,正向匹配,类似的还有 pg_jieba 调用 cppjieba 提供的 lib
-
安装 scws
参考官方文档 -
下载源码
wget http://www.xunsearch.com/scws/down/scws-1.2.3.tar.bz2
-
解压缩
tar xvjf scws-1.2.3.tar.bz2
-
编译
cd scws-1.2.3
./configure --prefix=/usr/local/scws #Mac需要sudo,或者可以换其他路径
make
make install
-
安装 zhparser
-
下载源码
git clone https://github.com/amutu/zhparser.git
-
安装zhparser
SCWS_HOME=/usr/local/scws make && make install
-
配置
-
postgresql.conf
include = 'zhparser.conf'
include = 'smlar.conf'
-
zhparser.conf
zhparser.punctuation_ignore = t #忽略标点
zhparser.seg_with_duality = t #二元
zhparser.dict_in_memory = t
zhparser.multi_short = f #短词
zhparser.multi_duality = t
zhparser.multi_zmain = f #重要单字
zhparser.multi_zall = f #全部单字
-
chinese.stop
pg_config --sharedir
-
migration
if PostgresService.extension_exist('smlar', 'zhparser')
  enable_extension :zhparser
  execute <<-SQL
    CREATE TEXT SEARCH DICTIONARY simple_dict ( TEMPLATE = simple, STOPWORDS = chinese );
    CREATE TEXT SEARCH CONFIGURATION zhparser (PARSER = zhparser);
    ALTER TEXT SEARCH CONFIGURATION zhparser ADD MAPPING FOR a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z WITH simple_dict;
  SQL
end
-
测试
SET zhparser.seg_with_duality TO f;
SET zhparser.multi_duality TO f;
SELECT * FROM ts_debug('zhparser','分词技术哪家强, 中国山东找蓝翔');
SELECT * FROM ts_debug('zhparser','阅面科技');
SET zhparser.seg_with_duality TO t;
SET zhparser.multi_duality TO t;
SELECT * FROM ts_debug('zhparser','扁担长板凳宽');
SELECT * FROM ts_debug('zhparser','阅面科技');
TF-IDF(term frequency, inverse document frequency), 包含 term t 的 document 数量越少, t 越有代表性
-
安装 smlar
git clone git://sigaev.ru/smlar.git
cd smlar
export USE_PGXS=1
make
make install
-
配置 smlar
-
smlar.conf
smlar.persistent_cache true
smlar.stattable 'company_name_corpuses'
smlar.type 'tfidf'
smlar.idf_plus_one true
smlar.tf_method 'const'
smlar.threshold 0.5
- 创建语料库
- 每一个 company.name 和 company.desc 作为一个 document
- 使用 zhparser 分词,记录到数据库中统计
- 测试
select smlar(tsvector2textarray(to_tsvector('zhparser','阅面科技')), tsvector2textarray(to_tsvector('zhparser','阅面 readface')));
select smlar(tsvector2textarray(to_tsvector('zhparser','阅面科技')), tsvector2textarray(to_tsvector('zhparser','烽火科技')));
select smlar(tsvector2textarray(to_tsvector('zhparser','阅面科技')), tsvector2textarray(to_tsvector('zhparser','芝麻科技')));
如果每次分词并且按照权重比较,查询会很耗费时间,所以还要加上对应的 index
-
like && ilike pg_trgm
drop index index_companies_on_name_pinyin;
explain analyze select * from companies where name_pinyin like '%ke-ji%';
create index index_companies_on_name_pinyin on companies using gin(name_pinyin gin_trgm_ops);
explain analyze select * from companies where name_pinyin like '%ke-ji%';
-
expression index
drop index index_sml_companies_on_name;
explain analyze SELECT "companies"."name" FROM "companies" WHERE (tsvector2textarray(to_tsvector('zhparser',companies.name)) % array['readface','阅面']) ORDER BY smlar(tsvector2textarray(to_tsvector('zhparser',companies.name)),array['readface','阅面']);
create index index_sml_companies_on_name on companies using gin(tsvector2textarray(to_tsvector('zhparser'::regconfig, name::text)) _text_sml_ops);
explain analyze SELECT "companies"."name" FROM "companies" WHERE (tsvector2textarray(to_tsvector('zhparser',companies.name)) % array['readface','阅面']) ORDER BY smlar(tsvector2textarray(to_tsvector('zhparser',companies.name)),array['readface','阅面']);