Last active
January 25, 2021 19:23
-
-
Save lopespm/2362a77e7bd230a4622a43709c195826 to your computer and use it in GitHub Desktop.
Levenshtein distance between regex expression and target string - Dynamic Programming (Python)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (Variant #4 for exercise 16.2 on EPI (Elements of Programming Interviews)) (September 2018 edition) | |
# The core idea is calculate the levenshtein distance, while taking into account the special cases of the regex expression | |
# *, +, ? and . were taken into account for the regex expression. Expression blocks are not supported | |
# This algorithm uses dynamic programming, yielding a O(mn) time complexity, O(m) auxiliary space for cache | |
# (m and n are the lengths of regex and target strings respectively) | |
# | |
# Version using recursion with memoization: https://gist.github.com/lopespm/53a215d0b2b0518b52b6bb6687bdaff6 | |
def regex_dist(regex: str, target: str): | |
current = [None] * (len(regex) + 1) | |
for t_i in range(len(target) + 1): | |
prev = [item for item in current] | |
for r_i in range(len(regex) + 1): | |
if (r_i == 0 and t_i == 0): | |
current[r_i] = 0 | |
elif (r_i == 0): | |
current[r_i] = t_i | |
elif (t_i == 0): | |
i, counter = r_i - 1, 0 | |
while i >= 0: | |
char = regex[i] | |
i -= 2 if (char == '?' or char == '*' or char == '+') else 1 | |
counter += 1 if (not (char == '?' or char == '*')) else 0 | |
current[r_i] = counter | |
# Regex special cases | |
elif (regex[r_i-1] == '.'): | |
current[r_i] = prev[r_i - 1] | |
elif (regex[r_i-1] == '+' or regex[r_i-1] == '*' or regex[r_i-1] == '?'): | |
if (regex[r_i-1-1] == target[t_i-1]): | |
if (regex[r_i-1] == '?'): | |
current[r_i] = prev[r_i - 2] | |
else: | |
current[r_i] = min(prev[r_i - 2], prev[r_i]) | |
else: | |
additional_cost = 1 if (regex[r_i-1] == '+') else 0 | |
current[r_i] = min(prev[r_i - 2] + 1, | |
prev[r_i] + 1, | |
current[r_i - 2] + additional_cost) | |
# Other characters | |
elif (regex[r_i-1] == target[t_i-1]): | |
current[r_i] = prev[r_i - 1] | |
else: | |
current[r_i] = min(prev[r_i - 1] + 1, | |
prev[r_i] + 1, | |
current[r_i - 1] + 1) | |
return current[-1] | |
# Some examples | |
print(regex_dist("OrchestraaaQA+a", "CarthorseQAAAA")) | |
print(regex_dist("A+b", "AAb")) | |
print(regex_dist("A+b", "AAAAAb")) | |
print(regex_dist("A+b", "AAAAAb03b")) | |
print(regex_dist("A..b", "AAAAAb03b")) | |
print(regex_dist("q+", "A")) | |
print(regex_dist("q+a?a+", "A")) | |
print(regex_dist("q+a?a+A+", "A")) | |
print(regex_dist("q+a?a+A+.", "A")) | |
print(regex_dist("q+A", "AAAAAb03b")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
lopespm, I think your algorithm is incorrect. For example, consider the regular expression is "aa?" and the plain string is "a". The minimal Levenshtein distance should be 0, since "aa?" could represent "aa" and "a", and the latter is exactly the same as the plain string. However, the output of your program is 1.