Last active
March 19, 2017 21:51
-
-
Save SeijiEmery/fdf251f015f77cbb7127289b6856ba24 to your computer and use it in GitHub Desktop.
Python levenshtein distance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Note: based on levenshtein2 from https://gist.github.com/SeijiEmery/8ea00d3a33b00a74305c | |
| # which is itself a simple implementation of levenshtein edit distance w/ dynamic programming | |
| # and 2 rows. https://en.wikipedia.org/wiki/Levenshtein_distance | |
| # (Note: the java version was a CS problem in a java class I took a while ago, so no, I didn't | |
| # just copy this from wikipedia, though you might as well since it's literally the same algorithm). | |
| def lev (a, b): | |
| n, m = len(a), len(b) | |
| row, prev = [0] * (n + 1), [0] * (n + 1) | |
| for i in range(n): | |
| prev[i] = i | |
| for j in range(m): | |
| x = j | |
| for i in range(n): | |
| x = row[i + 1] = min( | |
| min(x, prev[i + 1]) + 1, | |
| prev[i] + 1 if a[i] != b[j] else 0) | |
| row, prev = prev, row | |
| return prev[n] | |
| def fuzzy_match_levenshtein (items): | |
| def match (word): | |
| return sorted([ (lev(word, item), item) for item in items ], | |
| key = lambda pair: pair[0], | |
| reverse = False)[0][1] | |
| return match | |
| if __name__ == '__main__': | |
| assert(fuzzy_match_levenshtein(["cat", "kitten", "cats"])("kitn") == "kitten") | |
| assert(fuzzy_match_levenshtein(["cat", "kitten", "cats"])("cts") == "cats") | |
| assert(fuzzy_match_levenshtein(["cat", "kitten", "cats"])("ctz") == "cat") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment