Last active
December 28, 2015 01:29
-
-
Save zeroDivisible/7420615 to your computer and use it in GitHub Desktop.
Function using plpgsql which tokenizes given word. As an example, for input ``word`` it returns ``word wor ord``
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- returns word divided into smaller tokens, not shorter than value specified in | |
-- MINIMUM_LENGTH. | |
-- | |
-- calling this with param `bobcat` will return 'bobcat bob obc bca cat bobc obca bcat bobca obcat' | |
-- | |
-- Please note that changing this function, especially the minimum length which it uses | |
-- means that each INDEX which depends on it needs to be recreated | |
create or replace function tokenize(varchar, integer, integer) returns varchar as | |
$BODY$ declare | |
word alias for $1; -- alias for passed param | |
min_length alias for $2; -- only strings longer than this will be included in the output | |
max_length alias for $3; -- only strings shorter than this (inclusive) will be included | |
counter int; -- temp variable | |
strings varchar[]; -- identified substrings will be included here as they are found | |
tempstring varchar; -- temp value | |
tempwordlen int; -- temp value for word length | |
begin | |
-- add word which we are looking for to the result | |
strings := array_append(strings, word); | |
if length(word) <= min_length then | |
-- return if string is too short | |
return array_to_string(strings, ' '); | |
end if; | |
-- minimum extracted substring will be of this length | |
counter := min_length; | |
-- as long as current substring is not longer than current word | |
while counter < length(word) and counter <= max_length loop | |
-- find new substring length | |
tempwordlen := length(word) - counter + 1; | |
-- extract all substrings of given length | |
for i in 1..tempwordlen loop | |
tempstring := substr(word, i, counter); | |
-- ... if the are not already in the resulting array | |
if not(array[tempstring] <@ strings) then | |
strings := array_append(strings, tempstring); | |
end if; | |
end loop; | |
-- increase substring length by 1. | |
counter := counter + 1; | |
end loop; | |
-- return result | |
return array_to_string(strings, ' '); | |
end; | |
$BODY$ language 'plpgsql' immutable; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment