Last active
October 22, 2025 09:16
-
-
Save guilherme-miyake/c32052c8f28a7c1591f5b319504cf4fa to your computer and use it in GitHub Desktop.
Levenshtein Distance Options
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Refactor of: https://medium.com/@hebozhe/optimizing-levenshtein-distance-in-python-using-built-ins-only-8f6b37e9de89 | |
| def calculate_distance_to_last_position(matching_positions: list[tuple[int, int]]) -> int: | |
| ''' | |
| summary: Calculate the minimum distance from (0, 0) to the end of | |
| a list of Levenshtein coordinates. | |
| params: | |
| coords: list[tuple[int, int]] of (row, column) coordinates where | |
| the characters of two strings were identical. | |
| return: int of the Levenshtein distance. | |
| ex: calculate_distance_to_last_position([(1, 1), (2, 5), (4, 2), | |
| (6, 4), (7, 5), (8, 6), (9, 7)]) -> 3 | |
| {(1, 1): 0, (2, 5): 4, (4, 2): 3, (6, 4): 5, (7, 5): 6, | |
| (8, 6): 7, (9, 7): 8} | |
| {(2, 5): 3, (4, 2): 2, (6, 4): 4, (7, 5): 5, (8, 6): 6, (9, 7): 7} | |
| {(4, 2): 2, (6, 4): 4, (7, 5): 5, (8, 6): 6, (9, 7): 7} | |
| {(6, 4): 3, (7, 5): 4, (8, 6): 5, (9, 7): 6} | |
| {(7, 5): 3, (8, 6): 4, (9, 7): 5} | |
| {(8, 6): 3, (9, 7): 4} | |
| {(9, 7): 3} | |
| distances[final_position] == 3 | |
| chars: 58 + 733 = 791 | |
| ''' | |
| distances = {position: max(position) for position in matching_positions} | |
| final_position = matching_positions[-1] | |
| current_min_distance = distances[final_position] | |
| matching_position_discount = -1 | |
| for reference in matching_positions[:-1]: | |
| (reference_x, reference_y), reference_distance = reference, distances.pop(reference) | |
| if reference_distance >= current_min_distance: | |
| continue | |
| positions_ahead = { | |
| (position_x, position_y): position_distance | |
| for (position_x, position_y), position_distance in distances.items() | |
| if position_x > reference_x and position_y > reference_y and position_distance > reference_distance | |
| } | |
| distances_through_reference = { | |
| (position_x, position_y): | |
| min(position_distance, (reference_distance + matching_position_discount + max(position_x - reference_x, position_y - reference_y))) | |
| for (position_x, position_y), position_distance in positions_ahead.items() | |
| } | |
| distances |= distances_through_reference | |
| current_min_distance = min(distances[final_position], current_min_distance) | |
| if min(distances.values()) == current_min_distance: | |
| return current_min_distance | |
| return distances[final_position] | |
| def levshtein_distance_by_matching_positions(of_str_a: str, of_str_b: str) -> int: | |
| ''' | |
| summary: Get the Levenshtein distance of two strings by finding | |
| an optimal path from a list of character-matching coordinates. | |
| params: | |
| of_str_a: str used to measure Levenshtein distance. | |
| of_str_b: str used to measure Levenshtein distance. | |
| return: int of the Levenshtein distance between of_str_a and of_str_b. | |
| chars: 60 + 404 = 464 | |
| ''' | |
| matching_positions = [(index_a, index_b) | |
| for index_a, char_a in enumerate(of_str_a) | |
| for index_b, char_b in enumerate(of_str_b) | |
| if char_a == char_b] | |
| matching_positions.append((len(of_str_a), len(of_str_b))) | |
| return calculate_distance_to_last_position(matching_positions=matching_positions) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| @lru_cache(maxsize=5000) | |
| def edit_distance(a: str, b: str): | |
| return recursive_edit_distance(a, b) | |
| @lru_cache(maxsize=5000) | |
| def recursive_edit_distance(a: str, b: str, distance: int = 0): | |
| # no more characters in one substring | |
| if len(a) == 0 or len(b) == 0: | |
| return len(a) + len(b) + distance | |
| # no change required | |
| elif a[0] == b[0]: | |
| return recursive_edit_distance(a[1:], b[1:], distance) | |
| # change detected | |
| return min( | |
| recursive_edit_distance(a, b[1:], distance + 1), # insert character | |
| recursive_edit_distance(a[1:], b, distance + 1), # delete character | |
| recursive_edit_distance(a[1:], b[1:], distance + 1), # replace character | |
| ) | |
| def a_star_edit_distance(starting_a: str, starting_b: str): | |
| max_distance = max(len(starting_a), len(starting_b)) | |
| next_step = (max_distance, 0, starting_a, starting_b) | |
| priority_heap = [] | |
| def a_star_reduce_distance(current_distance, steps, substring_a, substring_b): | |
| if current_distance + steps - max(len(starting_a), len(starting_b)) > 2: | |
| return None | |
| if len(substring_a) == 0 or len(substring_b) == 0: | |
| return current_distance | |
| elif substring_a == substring_b: | |
| return current_distance - len(substring_a) | |
| elif substring_a[0] == substring_b[0]: | |
| next_step = heapq.heappushpop( | |
| priority_heap, | |
| (current_distance - 1, steps, substring_a[1:], substring_b[1:]) # no change required | |
| ) | |
| return next_step | |
| else: | |
| heapq.heappush( | |
| priority_heap, | |
| (current_distance, steps + 1, substring_a, substring_b[1:]) # skip 1 b = insert character | |
| ) | |
| heapq.heappush( | |
| priority_heap, | |
| (current_distance, steps + 1, substring_a[1:], substring_b) # delete character | |
| ) | |
| next_step = heapq.heappushpop( | |
| priority_heap, | |
| (current_distance, steps + 1, substring_a[1:], substring_b[1:]) # replace character | |
| ) | |
| return next_step | |
| while True: | |
| if isinstance(next_step, int): | |
| return next_step | |
| next_step = a_star_reduce_distance(*next_step) | |
| class BaseCoordinates: | |
| x: int | |
| y: int | |
| distance_from_origin: int | |
| class StringCoordinates(BaseCoordinates): | |
| def __init__(self, x, y): | |
| self.x = x | |
| self.y = y | |
| self.distance_from_origin = max(x, y) - 1 | |
| def update_distance_using_reference(self, reference: BaseCoordinates): | |
| x_distance = self.x - reference.x | |
| y_distance = self.y - reference.y | |
| distance_from_reference = max(x_distance, y_distance) | |
| # for string positions, we need subtract 1 to find edit_distance | |
| distance_through_reference = reference.distance_from_origin + distance_from_reference - 1 | |
| if self.distance_from_origin > distance_through_reference: | |
| self.distance_from_origin = distance_through_reference | |
| return self | |
| def __gt__(self, other: BaseCoordinates): | |
| return self.x > other.x and self.y > other.y and self.distance_from_origin > other.distance_from_origin | |
| def calculate_distance_to_last_coordinate_2(matching_positions: list[StringCoordinates]) -> int: | |
| ''' | |
| summary: Calculate the minimum distance from (0, 0) to the end of | |
| a list of Levenshtein coordinates. | |
| params: | |
| coords: list[tuple[int, int]] of (row, column) coordinates where | |
| the characters of two strings were identical. | |
| return: int of the Levenshtein distance. | |
| ex: calculate_distance_to_last_position([(1, 1), (2, 5), (4, 2), | |
| (6, 4), (7, 5), (8, 6), (9, 7)]) -> 3 | |
| {(1, 1): 0, (2, 5): 4, (4, 2): 3, (6, 4): 5, (7, 5): 6, | |
| (8, 6): 7, (9, 7): 8} | |
| {(2, 5): 3, (4, 2): 2, (6, 4): 4, (7, 5): 5, (8, 6): 6, (9, 7): 7} | |
| {(4, 2): 2, (6, 4): 4, (7, 5): 5, (8, 6): 6, (9, 7): 7} | |
| {(6, 4): 3, (7, 5): 4, (8, 6): 5, (9, 7): 6} | |
| {(7, 5): 3, (8, 6): 4, (9, 7): 5} | |
| {(8, 6): 3, (9, 7): 4} | |
| {(9, 7): 3} | |
| distances[final_position] == 3 | |
| chars: 58 + 733 = 791 | |
| ''' | |
| distances = [position for position in matching_positions] | |
| final_position = matching_positions[-1] | |
| current_min_distance = final_position.distance_from_origin | |
| for position in range(len(matching_positions)): | |
| reference = matching_positions[position] | |
| if reference.distance_from_origin >= current_min_distance: | |
| continue | |
| new_distances = [ | |
| position.update_distance_using_reference(reference).distance_from_origin | |
| for position in distances if position > reference | |
| ] | |
| current_min_distance = min(final_position.distance_from_origin, current_min_distance) | |
| if min(new_distances) == current_min_distance: | |
| return current_min_distance | |
| return final_position.distance_from_origin |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment