Skip to content

Instantly share code, notes, and snippets.

@Pigeo
Created June 2, 2023 11:15
Show Gist options
  • Save Pigeo/058aafeb3c53d68dcd6ba4cab68e7421 to your computer and use it in GitHub Desktop.
Save Pigeo/058aafeb3c53d68dcd6ba4cab68e7421 to your computer and use it in GitHub Desktop.
Extension of PostgreSQL's websearch_to_tsquery() to add prefix_matching :* and more operators
-- This function behaves similarly to PostgreSQL's websearch_to_tsquery()
-- but it also accepts prefix matching operator :*
-- and it uses the standard tsquery operators <-> & | ! instead of websearch_to_tsquery()'s operators 'OR' and '-'
-- and it accepts parenthesis for grouping the operands
-- see: https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
CREATE OR REPLACE FUNCTION websearch_to_tsquery_with_prefix_matching(config regconfig, query_text text)
RETURNS tsquery AS $$
DECLARE
tsq text COLLATE "C"; -- string functions such as regexp_replace() and substring() requires a deterministic collation.
tokens text[] COLLATE "C";
i integer;
inside_quotes boolean := false;
parenthesis_level integer := 0;
BEGIN
tsq := query_text; -- IMPORTANT! This forces conversion to the deterministic collation specified above
tsq := regexp_replace(tsq, '(<->|&|!|[|]|[()]|")', ' \1 ', 'g'); -- isolate tsquery operators
tokens := regexp_split_to_array(tsq, '\s+'); -- tokenize words and tsquery operators
FOR i IN 1..array_length(tokens, 1) LOOP
CASE
WHEN tokens[i] = '"' THEN
inside_quotes := NOT inside_quotes;
IF inside_quotes THEN
tokens[i] := '<"';
ELSE
tokens[i] := '">';
END IF;
WHEN tokens[i] IN ('<->', '&', '!', '|') THEN
IF inside_quotes THEN
tokens[i] := '<->'; -- ignore any tsquery operators when inside double quotes, and replace them with <-> (as websearch_to_tsquery() does)
END IF;
-- ELSE standard tsquery operator => keep it unchanged.
WHEN tokens[i] = '(' THEN
IF inside_quotes THEN
tokens[i] := '<->'; -- ignore any tsquery operators when inside double quotes, and replace them with <-> (as websearch_to_tsquery() does)
ELSE
parenthesis_level := parenthesis_level + 1;
END IF;
WHEN tokens[i] = ')' THEN
IF inside_quotes THEN
tokens[i] := '<->'; -- ignore any tsquery operators when inside double quotes, and replace them with <-> (as websearch_to_tsquery() does)
ELSE
parenthesis_level := parenthesis_level - 1;
IF parenthesis_level < 0 THEN
parenthesis_level := 0;
tokens[i] := ''; -- delete the extraneous parenthesis to avoid a syntax error
END IF;
END IF;
WHEN substring(tokens[i] from length(tokens[i])-1) = ':*' THEN
IF length(tokens[i]) > 2 THEN
-- plain word (without operators), with prefix matching ":*":
tokens[i] := to_tsquery(config, quote_literal(REPLACE(tokens[i], ':*' , '')) || ':*');
ELSE
-- prefix matching operator alone (not associated to a word prefix) => ignore it
tokens[i] := '';
END IF;
ELSE
-- plain word, without prefix matching:
tokens[i] := plainto_tsquery(config, quote_literal(tokens[i]));
END CASE;
END LOOP;
tsq := array_to_string(tokens, ' ');
-- handle parenthesis:
tsq := regexp_replace(tsq, '(\()\s+|\s+(\))', '\1\2', 'g');
FOR i IN 1..parenthesis_level LOOP
-- add the missing parenthesis to avoid a syntax error
tsq := tsq || ')';
END LOOP;
-- replace spaces within double quotes with operator <-> (as websearch_to_tsquery() does):
tsq := regexp_replace(tsq, '(<")\s+|\s+(">)', '\1\2', 'g');
tsq := regexp_replace(tsq, '<"([^"]*)\s+', '<"\1<->', 'g');
tsq := regexp_replace(tsq, '<"|">', '', 'g'); -- Important: clean up all double quotes before applying the others regexp_replace()
-- filter out empty tokens (e.g. tool words like prepositions, articles, etc. that were removed by plainto_tsquery) and their operators:
tsq := regexp_replace(tsq, '(!)(\s{2,})', '\2', 'g'); -- removes 'NOT' operators followed by an empty token
tsq := regexp_replace(tsq, '(?:(\s{2,})<->)+|(?:<->(\s{2,}))+', '\1\2', 'g'); -- removes '<->' operators associated to an empty token
tsq := regexp_replace(tsq, '[|](\s*[|&])+|([|&]\s*)+[|]', ' | ', 'g'); -- operator '|' has priority over operator '&', typically when empty tokens are in-between.
tsq := regexp_replace(tsq, '&(\s*&+)+', '&', 'g'); -- removes extraneous 'AND' operators
-- tsquery operators' priority (only keep one operator between each word - the one with the highest priority):
tsq := regexp_replace(tsq, '(<->|[|]|&|\s)*(<->)(<->|[|]|&|\s)*', '\2', 'g');
tsq := regexp_replace(tsq, '([|]|&|\s)*([|])([|]|&|\s)*', '\2', 'g');
tsq := regexp_replace(tsq, '(&|\s)*([&])(&|\s)*', '\2', 'g');
tsq := regexp_replace(tsq, '\s+', '&', 'g'); -- If not specified by the user, defaut logical operator between words is '&'
tsq := TRIM(BOTH ' |&' FROM tsq); -- removes remaining extraneous operators at end and beginning
RETURN tsq::tsquery; -- IMPORTANT: casting to tsquery must be explicitely declared otherwise the EXCEPTION won't be catched;
EXCEPTION WHEN others THEN
-- extra security, that guarantees that this function will never fail:
DECLARE
autocompletion boolean;
BEGIN
tsq := query_text; -- IMPORTANT! This forces conversion to the deterministic collation specified above
autocompletion := (regexp_matches(tsq , '[^\s|&!()<->]:*\s*$'));
tsq := REPLACE(tsq, ' OR ', ' "OR" ');
tsq := REPLACE(tsq, '|', ' OR ');
tsq := regexp_replace(tsq, '(\S*-\S*)', '"\1"');
tsq := REPLACE(tsq, '!', '-');
IF autocompletion THEN
RETURN to_tsquery(websearch_to_tsquery(config, tsq)::text || ':*');
ELSE
RETURN websearch_to_tsquery(config, tsq);
END IF;
EXCEPTION WHEN others THEN
RETURN websearch_to_tsquery(config, tsq);
END;
END;
$$ LANGUAGE plpgsql IMMUTABLE RETURNS NULL ON NULL INPUT PARALLEL SAFE;
-- ADDITIONAL NOTES ON PERFORMANCE OPTIMIZATION:
-- =============================================
-- You may find it useful to wrap above function within another one,
-- in order to avoid a performance penalty on very simple queries:
-- (but note that the wrapping function below is too naive and may occasionally give different
-- results than expected on certain simple queries containing special punctuation characters)
CREATE OR REPLACE FUNCTION web_to_tsquery(config regconfig, query_text text)
RETURNS tsquery AS $$
BEGIN
return to_tsquery(config, query_text);
-- based on our tests, PostgreSQL's internal to_tsquery() is 2x to 3x times faster than our custom function above,
-- but it will fail every time user's query is badly formatted...
EXCEPTION WHEN others THEN
-- ... so, in those cases we switch to our slower but sturdier custom function:
return websearch_to_tsquery_with_prefix_matching(config, query_text);
-- NB: during our tests, we saw no visible performance penalty when trying
-- first to_tsquery(), then having an exception, then reverting to our custom function.
END;
$$ LANGUAGE plpgsql IMMUTABLE RETURNS NULL ON NULL INPUT PARALLEL SAFE;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment