Skip to content

Instantly share code, notes, and snippets.

@ciiiii
Last active August 8, 2024 05:56
Show Gist options
  • Save ciiiii/0e9f3ffcd1b33b087fc5d5b02bf72bce to your computer and use it in GitHub Desktop.
Save ciiiii/0e9f3ffcd1b33b087fc5d5b02bf72bce to your computer and use it in GitHub Desktop.
Postgresql for Chinese Full-Text Search.中文全文搜索
# If you don‘t want to build it youself, you can try `docker pull killercai/postgres`.
FROM healthcheck/postgres:latest
# China debian mirror
RUN sed -i s@/deb.debian.org/@/mirrors.aliyun.com/@g /etc/apt/sources.list
RUN apt-get clean && apt-get update
RUN apt-get install -y wget git build-essential libpq-dev python-dev postgresql-server-dev-all
# SCWS (Simple Chinese Word Segmentation library)
RUN cd /tmp && wget -q -O - http://www.xunsearch.com/scws/down/scws-1.2.1.tar.bz2 | tar xjf - && cd scws-1.2.1 && ./configure && make install
# zhpaser (postgres plugin)
RUN cd /tmp && git clone https://github.com/amutu/zhparser.git && cd zhparser && make && make install
-- install extension
CREATE EXTENSION zhparser;
-- chinese word Segmentation config
CREATE TEXT SEARCH CONFIGURATION chinese_parser (PARSER = zhparser);
ALTER TEXT SEARCH CONFIGURATION chinese_parser ADD MAPPING FOR n,v,a,i,e,l,j WITH simple;
-- create test table
CREATE TABLE text_search(
text_id SERIAL PRIMARY KEY,
text_content TEXT,
tsv_column tsvector
);
-- create gin_index for speeding matching
CREATE INDEX idx_gin_tsv ON test_search USING GIN(tsv_column);
-- create trigger for synchronizing tsvector column from target
CREATE TRIGGER sync_trigger
BEFORE INSERT OR UPDATE ON test_search FOR EACH ROW
EXECUTE PROCEDURE
tsvector_update_trigger(tsv_column, 'public.chinese_parser', text_content);
-- insert some test data
INSERT INTO text_search(text_content) VALUES ('ThinkPad 小红点多功能蓝牙键盘多平台WIN安卓IOS支持 4X30K12182');
INSERT INTO text_search(text_content) VALUES ('多平台 富文本 写作软件');
-- search by some keyword
SELECT * FROM text_search WHERE tsv_column @@ to_tsquery('chinese_parser', '多平台');
SELECT * FROM text_search WHERE tsv_column @@ to_tsquery('chinese_parser', '小红点&多平台');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment