Created
February 18, 2011 01:25
-
-
Save fitzgen/833090 to your computer and use it in GitHub Desktop.
Uses a slightly modified version of the Levenshtien Distance algorithm (also known as "edit distance") to calculate the set of the fewest operations which will change string s in to string t.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Output of the example which is console.logged at the bottom of operations.js: | |
| $ node operations.js | |
| To change 'kitten' in to 'sitting': | |
| [ [ 'delete', 'k' ] | |
| , [ 'insert', 's' ] | |
| , [ 'retain', 1 ] | |
| , [ 'retain', 1 ] | |
| , [ 'retain', 1 ] | |
| , [ 'delete', 'e' ] | |
| , [ 'insert', 'i' ] | |
| , [ 'retain', 1 ] | |
| , [ 'insert', 'g' ] | |
| ] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (function () { | |
| // Simple operation constructors. | |
| function insert (chars) { | |
| return ["insert", chars]; | |
| } | |
| function del (chars) { | |
| return ["delete", chars]; | |
| } | |
| function retain (n) { | |
| return ["retain", n]; | |
| } | |
| // We don't want to copy arrays all the time, aren't mutating lists, and | |
| // only need O(1) prepend and length, we can get away with a custom singly | |
| // linked list implementation. | |
| var theEmptyList = { | |
| length: 0, | |
| toArray: function () { | |
| return []; | |
| } | |
| }; | |
| function toArray () { | |
| return [this.car].concat(this.cdr.toArray()); | |
| } | |
| function cons (car, cdr) { | |
| return { | |
| car: car, | |
| cdr: cdr, | |
| length: 1 + cdr.length, | |
| toArray: toArray | |
| }; | |
| } | |
| // Abstract out the table in case I want to change the implementation to | |
| // arrays of arrays or something. | |
| function put (table, x, y, ops) { | |
| return (table[String(x) + "," + String(y)] = ops); | |
| } | |
| function get (table, x, y) { | |
| var ops = table[String(x) + "," + String(y)]; | |
| if ( ops ) { | |
| return ops; | |
| } else { | |
| throw new TypeError("No operations at " + String(x) + "," + String(y)); | |
| } | |
| } | |
| function makeOperationsTable (s, t) { | |
| var table = {}, | |
| n = s.length, | |
| m = t.length, | |
| i, | |
| j; | |
| put(table, 0, 0, theEmptyList); | |
| for ( i = 1; i <= m; i += 1 ) { | |
| put(table, i, 0, cons(insert(t.charAt(i-1)), | |
| get(table, i-1, 0))); | |
| } | |
| for ( j = 1; j <= n; j += 1 ) { | |
| put(table, 0, j, cons(del(s.charAt(j-1)), | |
| get(table, 0, j-1))); | |
| } | |
| return table; | |
| } | |
| function chooseCell (table, x, y, k) { | |
| var prevOps = get(table, x, y-1), | |
| min = prevOps.length, | |
| direction = "up"; | |
| if ( get(table, x-1, y).length < min ) { | |
| prevOps = get(table, x-1, y); | |
| min = prevOps.length; | |
| direction = "left"; | |
| } | |
| if ( get(table, x-1, y-1).length < min ) { | |
| prevOps = get(table, x-1, y-1); | |
| min = prevOps.length; | |
| direction = "diagonal"; | |
| } | |
| return k(direction, prevOps); | |
| } | |
| function operations (s, t) { | |
| var n = s.length, | |
| m = t.length, | |
| i, | |
| j, | |
| ops = makeOperationsTable(s, t); | |
| for ( i = 1; i <= m; i += 1 ) { | |
| for ( j = 1; j <= n; j += 1 ) { | |
| chooseCell(ops, i, j, function (direction, prevOps) { | |
| switch ( direction ) { | |
| case "left": | |
| put(ops, i, j, cons(insert(t.charAt(i-1)), prevOps)); | |
| break; | |
| case "up": | |
| put(ops, i, j, cons(del(s.charAt(j-1)), prevOps)); | |
| break; | |
| case "diagonal": | |
| if ( s.charAt(j-1) === t.charAt(i-1) ) { | |
| put(ops, i, j, cons(retain(1), prevOps)); | |
| } else { | |
| put(ops, i, j, cons(insert(t.charAt(i-1)), | |
| cons(del(s.charAt(j-1)), | |
| prevOps))); | |
| } | |
| break; | |
| default: | |
| throw new TypeError("Unknown direction."); | |
| } | |
| }); | |
| } | |
| } | |
| return get(ops, i-1, j-1).toArray().reverse(); | |
| } | |
| exports = typeof exports !== "undefined" | |
| ? exports | |
| : this; | |
| exports.operations = operations; | |
| console.log("To change 'kitten' in to 'sitting':"); | |
| console.log(operations("kitten", "sitting")); | |
| }()); |
Author
Thanks for those links!
I'm aware that you only need 2 columns of the matrix at any given time, I just wasn't sure it was worth the hassle. In my defense, I did only just throw this together :)
I'm actually much more interested in what the operations are, rather than what the edit distance is. I think this could be a promising technique for generating the operations for Operational Transformation (which is also why I split substitution in to insert and delete, and have retain all over the place).
no prob :)
this is also a cool demo w/cost adjustments but all done server-side:
http://odur.let.rug.nl/~kleiweg/lev/
Author
And if Lisp is your thing: https://github.com/vsedach/vas-string-metrics
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
check this one out:
https://gist.github.com/527658
also:
http://snippets.dzone.com/posts/show/6942
would be pretty cool to add a Damerau–Levenshtein distance variation where swapping characters is also considered a single-cost operation. it takes bit more mem for the optmizied version since you need to store a history of 2 back. i think both are more efficient than this one since it looks like you pre-build the entire table.
some speed optimization for just getting distance is a quick check against source and target string length and returning the full length of the other when either is an empty string.
var l1 = str1.length, l2 = str2.length; if (Math.min(l1, l2) === 0) { return Math.max(l1, l2); }http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#JavaScript