Skip to content

Instantly share code, notes, and snippets.

@lopespm
Last active December 28, 2021 09:25
Show Gist options
  • Save lopespm/53a215d0b2b0518b52b6bb6687bdaff6 to your computer and use it in GitHub Desktop.
Save lopespm/53a215d0b2b0518b52b6bb6687bdaff6 to your computer and use it in GitHub Desktop.
Levenshtein distance between regex expression and target string - Recursion with memoization (Python)
# (Variant #4 for exercise 16.2 on EPI (Elements of Programming Interviews)) (September 2018 edition)
# The core idea is calculate the levenshtein distance, while taking into account the special cases of the regex expression
# *, +, ? and . were taken into account for the regex expression. Expression blocks are not supported
# This algorithm uses recursion with memoization (could be transposed to a DP solution), yielding a O(mn) time complexity, O(mn) auxiliary space for cache and O(max(m,n)) function call stack
# (m and n are the lengths of regex and target strings respectively)
#
# Version using dynamic programming: https://gist.github.com/lopespm/2362a77e7bd230a4622a43709c195826
def regex_dist(regex: str, target: str):
def regex_dist_aux(r_i, t_i):
if (r_i == -1 and t_i == -1):
return 0
if (r_i == -1):
return t_i + 1
if (t_i == -1):
i, counter = r_i, 0
while i >= 0:
char = regex[i]
i -= 2 if (char == '?' or char == '*' or char == '+') else 1
counter += 1 if (not (char == '?' or char == '*')) else 0
return counter
if (memo[r_i][t_i] is not None):
return memo[r_i][t_i]
# Regex special cases
if (regex[r_i] == '.'):
memo[r_i][t_i] = regex_dist_aux(r_i - 1, t_i - 1)
return memo[r_i][t_i]
if (regex[r_i] == '+' or regex[r_i] == '*' or regex[r_i] == '?'):
if (regex[r_i-1] == target[t_i]):
if (regex[r_i] == '?'):
memo[r_i][t_i] = regex_dist_aux(r_i - 2, t_i - 1)
else:
memo[r_i][t_i] = min(regex_dist_aux(r_i - 2, t_i - 1), regex_dist_aux(r_i, t_i - 1))
else:
additional_cost = 1 if (regex[r_i] == '+') else 0
memo[r_i][t_i] = min(regex_dist_aux(r_i - 2, t_i - 1) + 1,
regex_dist_aux(r_i, t_i - 1) + 1,
regex_dist_aux(r_i - 2, t_i) + additional_cost)
return memo[r_i][t_i]
# Other characters
if (regex[r_i] == target[t_i]):
memo[r_i][t_i] = regex_dist_aux(r_i - 1, t_i - 1)
else:
memo[r_i][t_i] = min(regex_dist_aux(r_i - 1, t_i - 1) + 1,
regex_dist_aux(r_i, t_i - 1) + 1,
regex_dist_aux(r_i - 1, t_i) + 1)
return memo[r_i][t_i]
memo = [[None] * (len(target) + 1) for _ in range(len(regex) + 1)]
return regex_dist_aux(len(regex) - 1, len(target) -1)
# Some examples
print(regex_dist("OrchestraaaQA+a", "CarthorseQAAAA"))
print(regex_dist("A+b", "AAb"))
print(regex_dist("A+b", "AAAAAb"))
print(regex_dist("A+b", "AAAAAb03b"))
print(regex_dist("A..b", "AAAAAb03b"))
print(regex_dist("q+", "A"))
print(regex_dist("q+a?a+", "A"))
print(regex_dist("q+a?a+A+", "A"))
print(regex_dist("q+a?a+A+.", "A"))
print(regex_dist("q+A", "AAAAAb03b"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment