Created
June 2, 2023 11:15
-
-
Save Pigeo/058aafeb3c53d68dcd6ba4cab68e7421 to your computer and use it in GitHub Desktop.
Extension of PostgreSQL's websearch_to_tsquery() to add prefix_matching :* and more operators
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This function behaves similarly to PostgreSQL's websearch_to_tsquery() | |
-- but it also accepts prefix matching operator :* | |
-- and it uses the standard tsquery operators <-> & | ! instead of websearch_to_tsquery()'s operators 'OR' and '-' | |
-- and it accepts parenthesis for grouping the operands | |
-- see: https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES | |
CREATE OR REPLACE FUNCTION websearch_to_tsquery_with_prefix_matching(config regconfig, query_text text) | |
RETURNS tsquery AS $$ | |
DECLARE | |
tsq text COLLATE "C"; -- string functions such as regexp_replace() and substring() requires a deterministic collation. | |
tokens text[] COLLATE "C"; | |
i integer; | |
inside_quotes boolean := false; | |
parenthesis_level integer := 0; | |
BEGIN | |
tsq := query_text; -- IMPORTANT! This forces conversion to the deterministic collation specified above | |
tsq := regexp_replace(tsq, '(<->|&|!|[|]|[()]|")', ' \1 ', 'g'); -- isolate tsquery operators | |
tokens := regexp_split_to_array(tsq, '\s+'); -- tokenize words and tsquery operators | |
FOR i IN 1..array_length(tokens, 1) LOOP | |
CASE | |
WHEN tokens[i] = '"' THEN | |
inside_quotes := NOT inside_quotes; | |
IF inside_quotes THEN | |
tokens[i] := '<"'; | |
ELSE | |
tokens[i] := '">'; | |
END IF; | |
WHEN tokens[i] IN ('<->', '&', '!', '|') THEN | |
IF inside_quotes THEN | |
tokens[i] := '<->'; -- ignore any tsquery operators when inside double quotes, and replace them with <-> (as websearch_to_tsquery() does) | |
END IF; | |
-- ELSE standard tsquery operator => keep it unchanged. | |
WHEN tokens[i] = '(' THEN | |
IF inside_quotes THEN | |
tokens[i] := '<->'; -- ignore any tsquery operators when inside double quotes, and replace them with <-> (as websearch_to_tsquery() does) | |
ELSE | |
parenthesis_level := parenthesis_level + 1; | |
END IF; | |
WHEN tokens[i] = ')' THEN | |
IF inside_quotes THEN | |
tokens[i] := '<->'; -- ignore any tsquery operators when inside double quotes, and replace them with <-> (as websearch_to_tsquery() does) | |
ELSE | |
parenthesis_level := parenthesis_level - 1; | |
IF parenthesis_level < 0 THEN | |
parenthesis_level := 0; | |
tokens[i] := ''; -- delete the extraneous parenthesis to avoid a syntax error | |
END IF; | |
END IF; | |
WHEN substring(tokens[i] from length(tokens[i])-1) = ':*' THEN | |
IF length(tokens[i]) > 2 THEN | |
-- plain word (without operators), with prefix matching ":*": | |
tokens[i] := to_tsquery(config, quote_literal(REPLACE(tokens[i], ':*' , '')) || ':*'); | |
ELSE | |
-- prefix matching operator alone (not associated to a word prefix) => ignore it | |
tokens[i] := ''; | |
END IF; | |
ELSE | |
-- plain word, without prefix matching: | |
tokens[i] := plainto_tsquery(config, quote_literal(tokens[i])); | |
END CASE; | |
END LOOP; | |
tsq := array_to_string(tokens, ' '); | |
-- handle parenthesis: | |
tsq := regexp_replace(tsq, '(\()\s+|\s+(\))', '\1\2', 'g'); | |
FOR i IN 1..parenthesis_level LOOP | |
-- add the missing parenthesis to avoid a syntax error | |
tsq := tsq || ')'; | |
END LOOP; | |
-- replace spaces within double quotes with operator <-> (as websearch_to_tsquery() does): | |
tsq := regexp_replace(tsq, '(<")\s+|\s+(">)', '\1\2', 'g'); | |
tsq := regexp_replace(tsq, '<"([^"]*)\s+', '<"\1<->', 'g'); | |
tsq := regexp_replace(tsq, '<"|">', '', 'g'); -- Important: clean up all double quotes before applying the others regexp_replace() | |
-- filter out empty tokens (e.g. tool words like prepositions, articles, etc. that were removed by plainto_tsquery) and their operators: | |
tsq := regexp_replace(tsq, '(!)(\s{2,})', '\2', 'g'); -- removes 'NOT' operators followed by an empty token | |
tsq := regexp_replace(tsq, '(?:(\s{2,})<->)+|(?:<->(\s{2,}))+', '\1\2', 'g'); -- removes '<->' operators associated to an empty token | |
tsq := regexp_replace(tsq, '[|](\s*[|&])+|([|&]\s*)+[|]', ' | ', 'g'); -- operator '|' has priority over operator '&', typically when empty tokens are in-between. | |
tsq := regexp_replace(tsq, '&(\s*&+)+', '&', 'g'); -- removes extraneous 'AND' operators | |
-- tsquery operators' priority (only keep one operator between each word - the one with the highest priority): | |
tsq := regexp_replace(tsq, '(<->|[|]|&|\s)*(<->)(<->|[|]|&|\s)*', '\2', 'g'); | |
tsq := regexp_replace(tsq, '([|]|&|\s)*([|])([|]|&|\s)*', '\2', 'g'); | |
tsq := regexp_replace(tsq, '(&|\s)*([&])(&|\s)*', '\2', 'g'); | |
tsq := regexp_replace(tsq, '\s+', '&', 'g'); -- If not specified by the user, defaut logical operator between words is '&' | |
tsq := TRIM(BOTH ' |&' FROM tsq); -- removes remaining extraneous operators at end and beginning | |
RETURN tsq::tsquery; -- IMPORTANT: casting to tsquery must be explicitely declared otherwise the EXCEPTION won't be catched; | |
EXCEPTION WHEN others THEN | |
-- extra security, that guarantees that this function will never fail: | |
DECLARE | |
autocompletion boolean; | |
BEGIN | |
tsq := query_text; -- IMPORTANT! This forces conversion to the deterministic collation specified above | |
autocompletion := (regexp_matches(tsq , '[^\s|&!()<->]:*\s*$')); | |
tsq := REPLACE(tsq, ' OR ', ' "OR" '); | |
tsq := REPLACE(tsq, '|', ' OR '); | |
tsq := regexp_replace(tsq, '(\S*-\S*)', '"\1"'); | |
tsq := REPLACE(tsq, '!', '-'); | |
IF autocompletion THEN | |
RETURN to_tsquery(websearch_to_tsquery(config, tsq)::text || ':*'); | |
ELSE | |
RETURN websearch_to_tsquery(config, tsq); | |
END IF; | |
EXCEPTION WHEN others THEN | |
RETURN websearch_to_tsquery(config, tsq); | |
END; | |
END; | |
$$ LANGUAGE plpgsql IMMUTABLE RETURNS NULL ON NULL INPUT PARALLEL SAFE; | |
-- ADDITIONAL NOTES ON PERFORMANCE OPTIMIZATION: | |
-- ============================================= | |
-- You may find it useful to wrap above function within another one, | |
-- in order to avoid a performance penalty on very simple queries: | |
-- (but note that the wrapping function below is too naive and may occasionally give different | |
-- results than expected on certain simple queries containing special punctuation characters) | |
CREATE OR REPLACE FUNCTION web_to_tsquery(config regconfig, query_text text) | |
RETURNS tsquery AS $$ | |
BEGIN | |
return to_tsquery(config, query_text); | |
-- based on our tests, PostgreSQL's internal to_tsquery() is 2x to 3x times faster than our custom function above, | |
-- but it will fail every time user's query is badly formatted... | |
EXCEPTION WHEN others THEN | |
-- ... so, in those cases we switch to our slower but sturdier custom function: | |
return websearch_to_tsquery_with_prefix_matching(config, query_text); | |
-- NB: during our tests, we saw no visible performance penalty when trying | |
-- first to_tsquery(), then having an exception, then reverting to our custom function. | |
END; | |
$$ LANGUAGE plpgsql IMMUTABLE RETURNS NULL ON NULL INPUT PARALLEL SAFE; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment