Created
November 21, 2021 13:50
-
-
Save zw963/212654fc1180d39be1d818392c96bb68 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
begin | |
require "bundler/inline" | |
rescue LoadError => e | |
$stderr.puts "Bundler version 1.10 or later is required. Please update your Bundler" | |
raise e | |
end | |
gemfile(true) do | |
source "https://rubygems.org" | |
gem 'sequel_pg', require: 'sequel' | |
gem 'pg' | |
gem 'awesome_print' | |
end | |
# start code here | |
# NOTICE: | |
# For accurately reproduce the problem, zhparser/scws had to be configured. | |
# wget -q -O - http://www.xunsearch.com/scws/down/scws-1.2.1.tar.bz2 | tar xjf - | |
# cd scws-1.2.3 ; ./configure && make && sudo env "PATH=$PATH" make install | |
# wget -q -O - https://github.com/amutu/zhparser/archive/refs/tags/V2.2.tar.gz |tar xzf - | |
# cd zhparser-2.2 ; make && sudo env "PATH=$PATH" make install | |
system("sudo -u postgres dropdb --if-exists check_sequel_db") | |
system("sudo -u postgres createdb check_sequel_db") | |
DB_URL="postgres://postgres:postgres@localhost:5432/check_sequel_db" | |
DB = Sequel.connect(DB_URL) | |
DB.create_table(:investing_latest_news, :ignore_index_errors=>true) do | |
primary_key :id | |
String :title, :text=>true, :null=>false | |
end | |
DB.run 'CREATE EXTENSION zhparser;' | |
DB.run "DROP TEXT SEARCH CONFIGURATION IF EXISTS zhparser;" | |
DB.run "CREATE TEXT SEARCH CONFIGURATION zhparser (PARSER = zhparser);" | |
DB.run "ALTER TEXT SEARCH CONFIGURATION zhparser ADD MAPPING FOR n,v,a,i,e,l,j,x WITH simple;" | |
DB.run "ALTER role all SET zhparser.multi_short=on;" # 1 短词复合 | |
DB.run "ALTER role all SET zhparser.multi_duality=on;" # 2 二元复合 | |
DB.run <<'HEREDOC' | |
ALTER TABLE investing_latest_news | |
ADD COLUMN textsearchable_index_col tsvector GENERATED ALWAYS | |
AS | |
( | |
to_tsvector( | |
'zhparser', | |
coalesce(title, '') | |
) | |
) | |
STORED; | |
HEREDOC | |
DB.run 'CREATE INDEX investing_latest_news_textsearch_idx_index ON investing_latest_news USING GIN (textsearchable_index_col);' | |
class InvestingLatestNews < Sequel::Model | |
end | |
InvestingLatestNews.create( | |
title: 'NYMEX原油仍有可能跌破78.85美元', | |
) | |
ap InvestingLatestNews.all | |
# [ | |
# [0] #<InvestingLatestNews:0x000055e892124528> { | |
# :id => 1, | |
# :textsearchable_index_col => "'78':12 '78.85':11 '85':13 'nymex':1 '原油':2 '可':6 '可能':5 '有':4 '油':3 '破':10 '能':7 '跌':9 '跌破':8", | |
# :title => "NYMEX原油仍有可能跌破78.85美元" | |
# } | |
# ] | |
# NOTICE: | |
# As we can see, after create above news, "美元",the english equivalent is US dollar, | |
# is not add into textsearchable_index_col, so we can't find out it when do full text search. | |
p InvestingLatestNews.where(Sequel.lit('textsearchable_index_col @@ to_tsquery(?)', '美元')).all | |
# => [] | |
# Add "美元" into user defined user-defined dictionary, and async it. | |
DB.run(Sequel.lit("INSERT INTO zhparser.zhprs_custom_word values(?) ON CONFLICT DO NOTHING;", '美元')) | |
DB.run("SELECT sync_zhprs_custom_word();") | |
# Update title=title, let stored generated column can pick it. | |
DB.run(Sequel.lit("update investing_latest_news set title=title")) | |
# Above code Not work, because textsearchable_index_col not updated. | |
p InvestingLatestNews.where(Sequel.lit('textsearchable_index_col @@ to_tsquery(?)', '美元')).all | |
# => [] | |
# [1] pry(main)> InvestingLatestNews.all | |
# [ | |
# #<InvestingLatestNews:0x000055aaaac39678> { | |
# :id => 1, | |
# :textsearchable_index_col => "'78':12 '78.85':11 '85':13 'nymex':1 '原油':2 '可':6 '可能':5 '有':4 '油':3 '破':10 '能':7 '跌':9 '跌破':8", | |
# :title => "NYMEX原油仍有可能跌破78.85美元" | |
# } | |
# ] | |
# Try again use pg directly. | |
db = PG.connect(URI(DB_URL)) | |
db.exec("UPDATE investing_latest_news SET title = title;") | |
# it works now. | |
p InvestingLatestNews.where(Sequel.lit('textsearchable_index_col @@ to_tsquery(?)', '美元')).all | |
# => [#<InvestingLatestNews @values={:id=>1,...}] | |
ap InvestingLatestNews.all | |
# [ | |
# [0] #<InvestingLatestNews:0x000055e891c9d388> { | |
# :id => 1, | |
# :textsearchable_index_col => "'78':12 '78.85':11 '85':13 'nymex':1 '原油':2 '可':6 '可能':5 '有':4 '油':3 '破':10 '美元':14 '能':7 '跌':9 '跌破':8", | |
# :title => "NYMEX原油仍有可能跌破78.85美元" | |
# } | |
# ] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment