Last active
August 29, 2015 14:08
-
-
Save claymcleod/f9ce092cfed501408ee6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Title: Edit Distance | |
# Authors: Clay McLeod | |
# Description: Python edit distance problem | |
# Section: Python | |
# Subsection: Interesting Problems | |
import sys | |
from tabulate import tabulate | |
def edit_distance(x, y): | |
"""--------------------------------------------------------------------- | |
> Documentation | |
Author: Clay McLeod | |
Course: CSCI 533 | |
Date: November 8, 2014 | |
Assignment: 2 | |
Method: edit_distance(x, y) | |
Description: Finds the shortest possible path between two strings using | |
dynamic programming techniques. | |
Input: 1) the starting word | |
2) the finishing word | |
Output: 1) the solution matrix of the dynamic programming technique | |
2) the corresponding operation matrix | |
NOTE: You will need to install the tabulate library for python to format | |
the output correctly. Tabulate is a library that will take a | |
matrix and format is correctly to Standard Out. You can install | |
tabulate by one of the following commands in your shell: | |
1) pip install tabulate | |
2) check the documentation at https://pypi.python.org/pypi/tabulate | |
Credit: | |
1) http://en.m.wikipedia.org/wiki/Levenshtein_distance helped clarify | |
how a dynamic solution looks for this problem. However, | |
all the code for this project was written by myself. | |
--------------------------------------------------------------------- | |
""" | |
# preferences | |
COPY_COST = 0 | |
TWIDDLE_COST = 1 | |
REPLACE_COST = 1 | |
INSERT_COST = 2 | |
DELETE_COST = 2 | |
solution_matrix = [[0 for i in range(0, len(y)+1)] for i in range(0, len(x)+1)] | |
operation_matrix = [['-' for i in range(0, len(y)+1)] for i in range(0, len(x)+1)] | |
for i in range(1, len(x)+1): | |
solution_matrix[i][0] = i | |
for j in range(1, len(y)+1): | |
solution_matrix[0][j] = j | |
# find length of input string | |
m = len(x) | |
n = len(y) | |
twiddles = [] | |
# loop | |
for j in range(1, n+1): | |
for i in range(1, m+1): | |
if (i, j) in twiddles: | |
solution_matrix[i][j] = solution_matrix[i-1][j-1] | |
operation_matrix[i][j] = 'T' | |
twiddles.remove((i, j)) | |
continue | |
elif x[i-1] == y[j-1]: | |
solution_matrix[i][j] = solution_matrix[i-1][j-1] + COPY_COST | |
else: | |
twiddle = solution_matrix[i-1][j-1] if i < m and j < n and x[i-1] == y[j] and x[i] == y[j-1] else 9999; | |
solution_matrix[i][j] = min([ | |
solution_matrix[i-1][j-1] + REPLACE_COST, | |
twiddle + TWIDDLE_COST, | |
solution_matrix[i-1][j] + DELETE_COST, | |
solution_matrix[i][j-1] + INSERT_COST, | |
]) | |
if twiddle != 9999: | |
twiddles.append((i+1, j+1)) | |
# populate operation matrix | |
cost = solution_matrix[i][j] | |
if cost == solution_matrix[i-1][j-1]: | |
operation_matrix[i][j] = 'C' | |
elif cost == twiddle + TWIDDLE_COST: | |
operation_matrix[i][j] = 'T' | |
elif cost == solution_matrix[i-1][j-1] + REPLACE_COST: | |
operation_matrix[i][j] = 'R' | |
elif cost == solution_matrix[i-1][j-1] + DELETE_COST: | |
operation_matrix[i][j] = 'D' | |
elif cost == solution_matrix[i-1][j-1] + INSERT_COST: | |
operation_matrix[i][j] = 'I' | |
return solution_matrix, operation_matrix | |
def move_to_results_array(move): | |
if move == "T": | |
return "Twiddle" | |
elif move == "C": | |
return "Copy" | |
elif move == "R": | |
return "Replace" | |
elif move == "I": | |
return "Insert" | |
elif move == "D": | |
return "Delete" | |
else: | |
raise RuntimeError("Unknown Input!") | |
# boilerplate | |
if __name__ == "__main__": | |
# print documentation | |
print edit_distance.__doc__ | |
# Input | |
original = list(str(raw_input('Enter is the original sequence: '))) | |
ending = list(str(raw_input('Enter is the ending sequence: '))) | |
# Calculation | |
solution_matrix, operation_matrix = edit_distance(original, ending) | |
print '' | |
# Output | |
print 'Solution Matrix:' | |
print tabulate(solution_matrix, tablefmt='plain') | |
print '' | |
print 'Operation Matrix:' | |
print tabulate(operation_matrix, tablefmt='plain') | |
# Messy solution for getting the order of operations, etc. | |
# Since this is not the real substance of the assignment, I | |
# wans't concerned about making it pretty. | |
i = 1 | |
j = 1 | |
results = [] | |
results.append(move_to_results_array(operation_matrix[i][j])) | |
while i < len(original) and j < len(ending): | |
next_i = i + 1 | |
next_j = j + 1 | |
best_move = 9999; | |
if best_move > operation_matrix[i][j+1]: | |
next_i = i | |
next_j = j | |
best_move = operation_matrix[i][j+1] | |
next_j = j + 1 | |
if best_move > operation_matrix[i+1][j]: | |
next_i = i | |
next_j = j | |
best_move =operation_matrix[i+1][j] | |
next_i = i + 1 | |
if best_move > operation_matrix[i+1][j+1]: | |
next_i = i | |
next_j = j | |
best_move =operation_matrix[i+1][j+1] | |
i = i + 1 | |
j = j + 1 | |
i = next_i | |
j = next_j | |
results.append(move_to_results_array(operation_matrix[i][j])) | |
i = 0 | |
print '' | |
print 'Solution steps:' | |
while i < len(results): | |
if results[i] == "Twiddle": | |
print '%d. %s | %s' % (i+1, results[i], "".join(ending[:i+2])) | |
del results[i] | |
else: | |
print '%d. %s | %s' % (i+1, results[i], "".join(ending[:i+1])) | |
i = i + 1 | |
print '%d. Kill | %s' % (i+1, "".join(ending)) | |
print '' | |
print 'Edit distance = %d' % solution_matrix[len(original)][len(ending)] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--------------------------------------------------------------------- | |
> Documentation | |
Author: Clay McLeod | |
Course: CSCI 533 | |
Date: November 8, 2014 | |
Assignment: 2 | |
Method: edit_distance(x, y) | |
Description: Finds the shortest possible path between two strings using | |
dynamic programming techniques. | |
Input: 1) the starting word | |
2) the finishing word | |
Output: 1) the solution matrix of the dynamic programming technique | |
2) the corresponding operation matrix | |
NOTE: You will need to install the tabulate library for python to format | |
the output correctly. Tabulate is a library that will take a | |
matrix and format is correctly to Standard Out. You can install | |
tabulate by one of the following commands in your shell: | |
1) pip install tabulate | |
2) check the documentation at https://pypi.python.org/pypi/tabulate | |
--------------------------------------------------------------------- | |
Enter is the original sequence: hello | |
Enter is the ending sequence: helol | |
Solution Matrix: | |
0 1 2 3 4 5 | |
1 0 2 3 4 5 | |
2 2 0 2 4 5 | |
3 3 2 0 2 4 | |
4 4 4 2 1 2 | |
5 5 5 4 2 1 | |
Operation Matrix: | |
- - - - - - | |
- C R R R R | |
- R C C R R | |
- R C C C C | |
- R R C T C | |
- R R C C T | |
Solution steps: | |
1. Copy | h | |
2. Copy | he | |
3. Copy | hel | |
4. Twiddle | helol | |
5. Kill | helol | |
Edit distance = 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment