lopespm · January 25, 2021 19:23 · drcxd · Jan 25, 2021
diff --git a/levenshtein_regex_dp.py b/levenshtein_regex_dp.py
 # (Variant #4 for exercise 16.2 on EPI (Elements of Programming Interviews)) (September 2018 edition)
 # The core idea is calculate the levenshtein distance, while taking into account the special cases of the regex expression
 # *, +, ? and . were taken into account for the regex expression. Expression blocks are not supported
 # This algorithm uses dynamic programming, yielding a O(mn) time complexity, O(m) auxiliary space for cache
 # (m and n are the lengths of regex and target strings respectively)
 #
 # Version using recursion with memoization: https://gist.github.com/lopespm/53a215d0b2b0518b52b6bb6687bdaff6

 def regex_dist(regex: str, target: str):

    current = [None] * (len(regex) + 1)
    
    for t_i in range(len(target) + 1):
        prev = [item for item in current]
        for r_i in range(len(regex) + 1):
            if (r_i == 0 and t_i == 0):
                current[r_i] = 0
            elif (r_i == 0):
                current[r_i] = t_i
            elif (t_i == 0):
                i, counter = r_i - 1, 0 
                while i >= 0:
                    char = regex[i]
                    i -= 2 if (char == '?' or char == '*' or char == '+') else 1
                    counter += 1 if (not (char == '?' or char == '*')) else 0
                current[r_i] = counter
                
            # Regex special cases
            elif (regex[r_i-1] == '.'):
                current[r_i] = prev[r_i - 1]

            elif (regex[r_i-1] == '+' or regex[r_i-1] == '*' or regex[r_i-1] == '?'):
                if (regex[r_i-1-1] == target[t_i-1]):
                    if (regex[r_i-1] == '?'):
                        current[r_i] = prev[r_i - 2]
                    else:
                        current[r_i] = min(prev[r_i - 2], prev[r_i])
                else:
                    additional_cost = 1 if (regex[r_i-1] == '+') else 0
                    current[r_i] = min(prev[r_i - 2] + 1,
                               prev[r_i] + 1,
                               current[r_i - 2] + additional_cost)

            # Other characters
            elif (regex[r_i-1] == target[t_i-1]):
                current[r_i] = prev[r_i - 1]
            else:
                current[r_i] = min(prev[r_i - 1] + 1,
                       prev[r_i] + 1,
                       current[r_i - 1] + 1)
    
    return current[-1]

    


 # Some examples
 print(regex_dist("OrchestraaaQA+a", "CarthorseQAAAA"))
 print(regex_dist("A+b", "AAb"))
 print(regex_dist("A+b", "AAAAAb"))
 print(regex_dist("A+b", "AAAAAb03b"))
 print(regex_dist("A..b", "AAAAAb03b"))
 print(regex_dist("q+", "A"))
 print(regex_dist("q+a?a+", "A"))
 print(regex_dist("q+a?a+A+", "A"))
 print(regex_dist("q+a?a+A+.", "A"))
 print(regex_dist("q+A", "AAAAAb03b"))
	# (Variant #4 for exercise 16.2 on EPI (Elements of Programming Interviews)) (September 2018 edition)
	# The core idea is calculate the levenshtein distance, while taking into account the special cases of the regex expression
	# *, +, ? and . were taken into account for the regex expression. Expression blocks are not supported
	# This algorithm uses dynamic programming, yielding a O(mn) time complexity, O(m) auxiliary space for cache
	# (m and n are the lengths of regex and target strings respectively)
	#
	# Version using recursion with memoization: https://gist.github.com/lopespm/53a215d0b2b0518b52b6bb6687bdaff6

	def regex_dist(regex: str, target: str):

	current = [None] * (len(regex) + 1)

	for t_i in range(len(target) + 1):
	prev = [item for item in current]
	for r_i in range(len(regex) + 1):
	if (r_i == 0 and t_i == 0):
	current[r_i] = 0
	elif (r_i == 0):
	current[r_i] = t_i
	elif (t_i == 0):
	i, counter = r_i - 1, 0
	while i >= 0:
	char = regex[i]
	i -= 2 if (char == '?' or char == '*' or char == '+') else 1
	counter += 1 if (not (char == '?' or char == '*')) else 0
	current[r_i] = counter

	# Regex special cases
	elif (regex[r_i-1] == '.'):
	current[r_i] = prev[r_i - 1]

	elif (regex[r_i-1] == '+' or regex[r_i-1] == '*' or regex[r_i-1] == '?'):
	if (regex[r_i-1-1] == target[t_i-1]):
	if (regex[r_i-1] == '?'):
	current[r_i] = prev[r_i - 2]
	else:
	current[r_i] = min(prev[r_i - 2], prev[r_i])
	else:
	additional_cost = 1 if (regex[r_i-1] == '+') else 0
	current[r_i] = min(prev[r_i - 2] + 1,
	prev[r_i] + 1,
	current[r_i - 2] + additional_cost)

	# Other characters
	elif (regex[r_i-1] == target[t_i-1]):
	current[r_i] = prev[r_i - 1]
	else:
	current[r_i] = min(prev[r_i - 1] + 1,
	prev[r_i] + 1,
	current[r_i - 1] + 1)

	return current[-1]




	# Some examples
	print(regex_dist("OrchestraaaQA+a", "CarthorseQAAAA"))
	print(regex_dist("A+b", "AAb"))
	print(regex_dist("A+b", "AAAAAb"))
	print(regex_dist("A+b", "AAAAAb03b"))
	print(regex_dist("A..b", "AAAAAb03b"))
	print(regex_dist("q+", "A"))
	print(regex_dist("q+a?a+", "A"))
	print(regex_dist("q+a?a+A+", "A"))
	print(regex_dist("q+a?a+A+.", "A"))
	print(regex_dist("q+A", "AAAAAb03b"))