Last active
September 11, 2018 13:17
-
-
Save atronah/e8de8ffbe22b9fabb4d65387884fae0c to your computer and use it in GitHub Desktop.
MySQL function for calculating Domerau-Levenshtein distance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE FUNCTION `getDamerauLevenshtein`(s1 VARCHAR(128) CHARSET utf8, s2 VARCHAR(128) CHARSET utf8) RETURNS int(11) | |
DETERMINISTIC | |
COMMENT 'Returns Domerau-Levenshtein distance, i.e. minimal number of char operations (cut, paste, replace, transposition) for two string' | |
BEGIN | |
DECLARE resultDistance INT(11); | |
DECLARE currRow INT; -- current row of matrix (0 .. LENGTH(s1)) | |
DECLARE currColumn INT; -- current column of matrix (0 .. LENGTH(s2)) | |
DECLARE s1Length INT; -- first\left\s1 string length | |
DECLARE s2Length INT; -- second\right\s2 string length | |
DECLARE replaceDistance INT; -- current distance for replace operation | |
DECLARE insertDistance INT; -- current distance for insert operation | |
DECLARE deleteDistance INT; -- current distance for delete operation | |
DECLARE tmpString VARCHAR(128); -- for swapping values of string | |
DECLARE tmpLength VARCHAR(128); -- for swapping values of string length | |
DECLARE currDistsList VARCHAR(128); -- previous row of matrix in string format (column values separated by comma) | |
DECLARE prevDistsList VARCHaR(128); -- current row of matrix (column values separated by comma) | |
-- for equivalent strings number of operations is zero | |
IF s1 LIKE s2 THEN | |
RETURN 0; | |
END IF; | |
SET s1Length = CHAR_LENGTH(s1); | |
SET s2Length = CHAR_LENGTH(s2); | |
-- for empty string number of operations is equal number of characters other string | |
IF s1Length = 0 THEN | |
RETURN s2Length; | |
END IF; | |
-- for empty string number of operations is equal number of characters other string | |
IF s2Length = 0 THEN | |
RETURN s1Length; | |
END IF; | |
-- optimization | |
IF s1Length > s2Length THEN | |
SET tmpString = s1; | |
SET s1 = s2; | |
SET s2 = tmpString; | |
SET tmpLength = s1Length; | |
SET s1Length = s2Length; | |
SET s2Length = tmpLength; | |
END IF; | |
SET currColumn = 0; | |
-- first row of distance value matrix contain number from 0 to length second string | |
WHILE (currColumn <= s2Length) DO | |
SET prevDistsList = CONCAT_WS(',', prevDistsList, currColumn); | |
SET currColumn = currColumn + 1; | |
END WHILE; | |
SET currRow = 1; | |
-- for reference: | |
-- "SUBSTRING_INDEX(SUBSTRING_INDEX(prevDistsList, ',', (currColumn + 1))" | |
-- equivalent | |
-- "prevDistsList[currColumn]" in python (type(prevDistsList) == list) | |
-- currColumn + 1 because start index for SUBSTRING_INDEX is 1 (but 0 as in standart list). | |
WHILE (currRow <= s1Length) DO | |
SET currDistsList = concat_ws(',', currRow); | |
SET currColumn = 1; | |
WHILE (currColumn <= s2Length) DO | |
-- if s1[row] = s2[column] then do not increment the counter operations (D[row, column] = D[row - 1, column -1]) | |
-- or if transposition detected | |
IF SUBSTRING(s1, currRow, 1) = SUBSTRING(s2, currColumn, 1) | |
OR (SUBSTRING(s1, currRow - 1, 1) = SUBSTRING(s2, currColumn, 1) | |
AND SUBSTRING(s1, currRow, 1) = SUBSTRING(s2, currColumn - 1, 1)) THEN | |
SET currDistsList = concat_ws(',', currDistsList, SUBSTRING_INDEX(SUBSTRING_INDEX(prevDistsList, ',', (currColumn + 1) - 1), ',', -1)); | |
ELSE | |
-- previous distance for "replace" operation in D[row - 1, column -1] | |
SET replaceDistance = SUBSTRING_INDEX(SUBSTRING_INDEX(prevDistsList, ',', (currColumn + 1) - 1), ',', -1) + 1; | |
-- previous distance for "insert" operation in D[row, column -1] | |
SET insertDistance = SUBSTRING_INDEX(SUBSTRING_INDEX(currDistsList, ',', (currColumn + 1) - 1), ',', -1) + 1; | |
-- previous distance for "delete" operation in D[row - 1, column] | |
SET deleteDistance = SUBSTRING_INDEX(SUBSTRING_INDEX(prevDistsList, ',', (currColumn + 1)), ',', -1) + 1; | |
SET currDistsList = concat_ws(',', currDistsList, LEAST(replaceDistance, insertDistance, deleteDistance)); | |
END IF; | |
SET currColumn = currColumn + 1; | |
END WHILE; | |
SET currRow = currRow + 1; | |
SET prevDistsList = currDistsList; | |
END WHILE; | |
SET resultDistance = SUBSTRING_INDEX(SUBSTRING_INDEX(currDistsList, ',', s2Length + 1), ',', -1); | |
RETURN resultDistance; | |
END |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment