Created
July 10, 2011 17:45
-
-
Save kylebgorman/1074747 to your computer and use it in GitHub Desktop.
Demonstration of using the difflib built-in class in Python to compute approximately Levenshtein-optimal alignments, with examples from my past-tense learning experiments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# difflib_demo.py | |
# Kyle Gorman <[email protected]> | |
from difflib import SequenceMatcher | |
if __name__ == '__main__': | |
from sys import argv | |
for file in argv[1:]: | |
for line in open(file, 'r'): | |
(input, output) = line.rstrip().split() | |
q = SequenceMatcher(None, input, output) | |
x_string = [] | |
y_string = [] | |
change_string = [] | |
for (tag, i1, i2, j1, j2) in q.get_opcodes(): | |
if tag == 'insert': | |
change_len = j2 - j1 | |
x_string.append('*' * change_len) | |
y_string.append(output[j1:j2]) | |
change_string.append('v' * change_len) | |
elif tag == 'delete': | |
change_len = i2 - i1 | |
x_string.append(input[i1:i2]) | |
y_string.append('*' * change_len) | |
change_string.append('^' * change_len) | |
elif tag == 'replace': | |
i_len = i2 - i1 | |
diff_len = (j2 - j1) - (i_len) | |
x_string.append(input[i1:i2]) | |
y_string.append(output[j1:j2]) | |
if diff_len == 0: | |
# i think this case is the most common, hence this | |
# unorthodox arrangement of test cases | |
change_string.append('!' * i_len) | |
elif diff_len > 0: # j is longer than i | |
# in the next two cases, however, the sequence-matcher | |
# doesn't tell us how to align "replaced" sequences that | |
# aren't the same length on the input and output side. | |
# how should we take care of this? i can think of a few | |
# linguistic cases of interest. One is ablaut verbs in | |
# English, which are a mixed bag: | |
# | |
# laIt 'light' | |
# l It 'lit' | |
# | |
# wITst&nd 'withstand' | |
# wItstU d 'withstood' | |
# | |
# bIsitS 'beseech' | |
# besOt 'besought' | |
# | |
# 'beseech' should be fine. withstand and withstood are | |
# opposites. i suggest aligning actual material to the | |
# left, mark it all as '=', and call it a day. | |
x_string.append('*' * diff_len) | |
change_string.append('!' * i_len) | |
change_string.append('v' * diff_len) | |
else: # diff_len < 0: # i is longer than j | |
diff_len = -diff_len | |
y_string.append('*' * diff_len) | |
change_string.append('!' * (i_len - diff_len)) | |
change_string.append('^' * diff_len) | |
else: # tag == 'equal' | |
x_string.append(input[i1:i2]) | |
y_string.append(output[j1:j2]) | |
change_string.append('=' * (i2 - i1)) | |
print ''.join(x_string) | |
print ''.join(change_string) | |
print ''.join(y_string) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment