Last active
July 8, 2021 17:53
-
-
Save paul-english/3e75ceb1f450f5ff16516bda186fb952 to your computer and use it in GitHub Desktop.
Redshift Levenshtein UDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE FUNCTION levenshtein(s1 varchar, s2 varchar) RETURNS integer IMMUTABLE AS $$ | |
import numpy as np | |
# https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python | |
def levenshtein(source, target): | |
source = source or "" | |
target = target or "" | |
if len(source) < len(target): | |
return levenshtein(target, source) | |
# So now we have len(source) >= len(target). | |
if len(target) == 0: | |
return len(source) | |
# We call tuple() to force strings to be used as sequences | |
# ('c', 'a', 't', 's') - numpy uses them as values by default. | |
source = np.array(tuple(source)) | |
target = np.array(tuple(target)) | |
# We use a dynamic programming algorithm, but with the | |
# added optimization that we only need the last two rows | |
# of the matrix. | |
previous_row = np.arange(target.size + 1) | |
for s in source: | |
# Insertion (target grows longer than source): | |
current_row = previous_row + 1 | |
# Substitution or matching: | |
# Target and source items are aligned, and either | |
# are different (cost of 1), or are the same (cost of 0). | |
current_row[1:] = np.minimum( | |
current_row[1:], | |
np.add(previous_row[:-1], target != s)) | |
# Deletion (target grows shorter than source): | |
current_row[1:] = np.minimum( | |
current_row[1:], | |
current_row[0:-1] + 1) | |
previous_row = current_row | |
return previous_row[-1] | |
return levenshtein(s1, s2) | |
$$ language plpythonu; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note that the source and target strings will be limited to 256 characters as that's the default when unspecified