Last active
December 28, 2021 09:25
-
-
Save lopespm/53a215d0b2b0518b52b6bb6687bdaff6 to your computer and use it in GitHub Desktop.
Levenshtein distance between regex expression and target string - Recursion with memoization (Python)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (Variant #4 for exercise 16.2 on EPI (Elements of Programming Interviews)) (September 2018 edition) | |
# The core idea is calculate the levenshtein distance, while taking into account the special cases of the regex expression | |
# *, +, ? and . were taken into account for the regex expression. Expression blocks are not supported | |
# This algorithm uses recursion with memoization (could be transposed to a DP solution), yielding a O(mn) time complexity, O(mn) auxiliary space for cache and O(max(m,n)) function call stack | |
# (m and n are the lengths of regex and target strings respectively) | |
# | |
# Version using dynamic programming: https://gist.github.com/lopespm/2362a77e7bd230a4622a43709c195826 | |
def regex_dist(regex: str, target: str): | |
def regex_dist_aux(r_i, t_i): | |
if (r_i == -1 and t_i == -1): | |
return 0 | |
if (r_i == -1): | |
return t_i + 1 | |
if (t_i == -1): | |
i, counter = r_i, 0 | |
while i >= 0: | |
char = regex[i] | |
i -= 2 if (char == '?' or char == '*' or char == '+') else 1 | |
counter += 1 if (not (char == '?' or char == '*')) else 0 | |
return counter | |
if (memo[r_i][t_i] is not None): | |
return memo[r_i][t_i] | |
# Regex special cases | |
if (regex[r_i] == '.'): | |
memo[r_i][t_i] = regex_dist_aux(r_i - 1, t_i - 1) | |
return memo[r_i][t_i] | |
if (regex[r_i] == '+' or regex[r_i] == '*' or regex[r_i] == '?'): | |
if (regex[r_i-1] == target[t_i]): | |
if (regex[r_i] == '?'): | |
memo[r_i][t_i] = regex_dist_aux(r_i - 2, t_i - 1) | |
else: | |
memo[r_i][t_i] = min(regex_dist_aux(r_i - 2, t_i - 1), regex_dist_aux(r_i, t_i - 1)) | |
else: | |
additional_cost = 1 if (regex[r_i] == '+') else 0 | |
memo[r_i][t_i] = min(regex_dist_aux(r_i - 2, t_i - 1) + 1, | |
regex_dist_aux(r_i, t_i - 1) + 1, | |
regex_dist_aux(r_i - 2, t_i) + additional_cost) | |
return memo[r_i][t_i] | |
# Other characters | |
if (regex[r_i] == target[t_i]): | |
memo[r_i][t_i] = regex_dist_aux(r_i - 1, t_i - 1) | |
else: | |
memo[r_i][t_i] = min(regex_dist_aux(r_i - 1, t_i - 1) + 1, | |
regex_dist_aux(r_i, t_i - 1) + 1, | |
regex_dist_aux(r_i - 1, t_i) + 1) | |
return memo[r_i][t_i] | |
memo = [[None] * (len(target) + 1) for _ in range(len(regex) + 1)] | |
return regex_dist_aux(len(regex) - 1, len(target) -1) | |
# Some examples | |
print(regex_dist("OrchestraaaQA+a", "CarthorseQAAAA")) | |
print(regex_dist("A+b", "AAb")) | |
print(regex_dist("A+b", "AAAAAb")) | |
print(regex_dist("A+b", "AAAAAb03b")) | |
print(regex_dist("A..b", "AAAAAb03b")) | |
print(regex_dist("q+", "A")) | |
print(regex_dist("q+a?a+", "A")) | |
print(regex_dist("q+a?a+A+", "A")) | |
print(regex_dist("q+a?a+A+.", "A")) | |
print(regex_dist("q+A", "AAAAAb03b")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment