Last active
August 8, 2024 05:56
-
-
Save ciiiii/0e9f3ffcd1b33b087fc5d5b02bf72bce to your computer and use it in GitHub Desktop.
Postgresql for Chinese Full-Text Search.中文全文搜索
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# If you don‘t want to build it youself, you can try `docker pull killercai/postgres`. | |
FROM healthcheck/postgres:latest | |
# China debian mirror | |
RUN sed -i s@/deb.debian.org/@/mirrors.aliyun.com/@g /etc/apt/sources.list | |
RUN apt-get clean && apt-get update | |
RUN apt-get install -y wget git build-essential libpq-dev python-dev postgresql-server-dev-all | |
# SCWS (Simple Chinese Word Segmentation library) | |
RUN cd /tmp && wget -q -O - http://www.xunsearch.com/scws/down/scws-1.2.1.tar.bz2 | tar xjf - && cd scws-1.2.1 && ./configure && make install | |
# zhpaser (postgres plugin) | |
RUN cd /tmp && git clone https://github.com/amutu/zhparser.git && cd zhparser && make && make install |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- install extension | |
CREATE EXTENSION zhparser; | |
-- chinese word Segmentation config | |
CREATE TEXT SEARCH CONFIGURATION chinese_parser (PARSER = zhparser); | |
ALTER TEXT SEARCH CONFIGURATION chinese_parser ADD MAPPING FOR n,v,a,i,e,l,j WITH simple; | |
-- create test table | |
CREATE TABLE text_search( | |
text_id SERIAL PRIMARY KEY, | |
text_content TEXT, | |
tsv_column tsvector | |
); | |
-- create gin_index for speeding matching | |
CREATE INDEX idx_gin_tsv ON test_search USING GIN(tsv_column); | |
-- create trigger for synchronizing tsvector column from target | |
CREATE TRIGGER sync_trigger | |
BEFORE INSERT OR UPDATE ON test_search FOR EACH ROW | |
EXECUTE PROCEDURE | |
tsvector_update_trigger(tsv_column, 'public.chinese_parser', text_content); | |
-- insert some test data | |
INSERT INTO text_search(text_content) VALUES ('ThinkPad 小红点多功能蓝牙键盘多平台WIN安卓IOS支持 4X30K12182'); | |
INSERT INTO text_search(text_content) VALUES ('多平台 富文本 写作软件'); | |
-- search by some keyword | |
SELECT * FROM text_search WHERE tsv_column @@ to_tsquery('chinese_parser', '多平台'); | |
SELECT * FROM text_search WHERE tsv_column @@ to_tsquery('chinese_parser', '小红点&多平台'); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment