Skip to content

Instantly share code, notes, and snippets.

@zeroDivisible
Last active December 28, 2015 01:29
Show Gist options
  • Save zeroDivisible/7420615 to your computer and use it in GitHub Desktop.
Save zeroDivisible/7420615 to your computer and use it in GitHub Desktop.
Function using plpgsql which tokenizes given word. As an example, for input ``word`` it returns ``word wor ord``
-- returns word divided into smaller tokens, not shorter than value specified in
-- MINIMUM_LENGTH.
--
-- calling this with param `bobcat` will return 'bobcat bob obc bca cat bobc obca bcat bobca obcat'
--
-- Please note that changing this function, especially the minimum length which it uses
-- means that each INDEX which depends on it needs to be recreated
create or replace function tokenize(varchar, integer, integer) returns varchar as
$BODY$ declare
word alias for $1; -- alias for passed param
min_length alias for $2; -- only strings longer than this will be included in the output
max_length alias for $3; -- only strings shorter than this (inclusive) will be included
counter int; -- temp variable
strings varchar[]; -- identified substrings will be included here as they are found
tempstring varchar; -- temp value
tempwordlen int; -- temp value for word length
begin
-- add word which we are looking for to the result
strings := array_append(strings, word);
if length(word) <= min_length then
-- return if string is too short
return array_to_string(strings, ' ');
end if;
-- minimum extracted substring will be of this length
counter := min_length;
-- as long as current substring is not longer than current word
while counter < length(word) and counter <= max_length loop
-- find new substring length
tempwordlen := length(word) - counter + 1;
-- extract all substrings of given length
for i in 1..tempwordlen loop
tempstring := substr(word, i, counter);
-- ... if the are not already in the resulting array
if not(array[tempstring] <@ strings) then
strings := array_append(strings, tempstring);
end if;
end loop;
-- increase substring length by 1.
counter := counter + 1;
end loop;
-- return result
return array_to_string(strings, ' ');
end;
$BODY$ language 'plpgsql' immutable;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment