Created
October 27, 2014 22:20
-
-
Save yveszoundi/5a10ebbc13ea15192bce to your computer and use it in GitHub Desktop.
Levenshtein distance with a full matrix.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn levenshtein-distance [w1 w2] | |
(letfn [(distance [matrix] | |
(last (last matrix))) | |
(make-matrix [word1 word2] | |
(let [matrix (into [] | |
(for [i (range (inc (count word1)))] | |
(into [] (take (inc (count word2)) (iterate inc 0)))))] | |
;; new matrix | |
(reduce (fn [acc idx] | |
(assoc acc idx | |
(assoc (acc idx) 0 idx))) | |
matrix | |
(range (count matrix))))) | |
(cost [ch1 ch2] | |
(if (= ch1 ch2) 0 1)) | |
(cell-value | |
[ch-w1 ch-w2 matrix row-idx col-idx] | |
(let [top (inc ((matrix col-idx) (dec row-idx))) | |
top-left (+ (cost ch-w1 ch-w2) ((matrix (dec col-idx)) (dec row-idx))) | |
left (inc ((matrix (dec col-idx)) row-idx))] | |
(min top top-left left))) | |
(compute-distance [word1 word2 matrix] | |
(loop [row-idx 1, max-row (inc (count word2)), m matrix] | |
(if (= row-idx max-row) | |
(distance m) | |
(recur (inc row-idx) | |
max-row | |
(loop [col-idx 1, max-col-idx (inc (count word1)), m-mod m] | |
(if (>= col-idx max-col-idx) | |
m-mod | |
(let [ch-w1 (word1 (dec col-idx)) | |
ch-w2 (word2 (dec row-idx)) | |
cell-data (cell-value ch-w1 ch-w2 m-mod row-idx col-idx) | |
next-matrix (assoc m-mod | |
col-idx | |
(assoc (m-mod col-idx) | |
row-idx | |
cell-data))] | |
(recur (inc col-idx) | |
max-col-idx | |
next-matrix | |
))))))))] | |
(let [word1 (into [] w1), word2 (into [] w2) | |
matrix (make-matrix word1 word2)] | |
(compute-distance word1 word2 matrix)))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment