Created
November 7, 2010 13:01
-
-
Save djpowell/666114 to your computer and use it in GitHub Desktop.
lcs diff
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns net.djpowell.lcsdiff | |
(:use [clojure.test]) | |
(:require [clojure.data])) | |
(defn- find-lcs-comm | |
"Take a calculated matrix, and track back through it to extract a | |
list of matching elements. Returns [left, right, both]." | |
[matrix a b i j] | |
(loop [i i j j left nil right nil both nil] | |
(cond | |
;; empty left seq | |
(= i 0) | |
[left (concat (take j b) right) both] | |
;; empty right seq | |
(= j 0) | |
[(concat (take i a) left) right both] | |
:else | |
(let [curr-len (aget matrix i j)] | |
(cond | |
;; shorten vertically | |
(= (aget matrix i (dec j)) curr-len) | |
(recur i (dec j) left (cons (nth b (dec j)) right) both) | |
;; shorten horizontally | |
(= (aget matrix (dec i) j) curr-len) | |
(recur (dec i) j (cons (nth a (dec i)) left) right both) | |
;; match at current position, add the current pair | |
:else | |
(recur (dec i) (dec j) left right (cons (nth a (dec i)) both))))))) | |
(defn- compute-matrix | |
"Compute matrix representing longest common subsequence using the | |
'dynamic-programming' method" | |
[a b] | |
(let [matrix (make-array Integer/TYPE (inc (count a)) (inc (count b)))] | |
(dotimes [x (inc (count a))] | |
(dotimes [y (inc (count b))] | |
(aset-int matrix x y | |
(cond | |
;; left or top edge | |
(or (= x 0) (= y 0)) | |
0 | |
;; compute length of match | |
(= (nth a (dec x)) (nth b (dec y))) | |
(inc (aget matrix (dec x) (dec y))) | |
;; compute length of non-match | |
:else | |
(max (aget matrix (dec x) y) (aget matrix x (dec y))))))) | |
matrix)) | |
(defn lcs-diff | |
"Take two sequences, and calculate the elements that can match by | |
return the elements that would align if the sequences were padded" | |
[a b] | |
(find-lcs-comm (compute-matrix a b) a b (count a) (count b))) | |
;; ---------------------------------------- | |
;; tests | |
(deftest threes-and-fours | |
(is (= | |
(lcs-diff (range 0 30 3) (range 0 30 4)) | |
[ [3 6 9 15 18 21 27] [4 8 16 20 28] [0 12 24] ]))) | |
(deftest human-chimp | |
(is (= | |
(lcs-diff (seq "human") (seq "chimpanzee")) | |
[ [\u] [\c \i \p \z \e \e] [\h \m \a \n] ]))) | |
;; ---------------------------------------- | |
;; hack to override diff for demonstration | |
;; override the definition from clojure.data | |
(in-ns 'clojure.data) | |
(extend-protocol Diff | |
java.util.List | |
(diff-similar [a b] | |
(net.djpowell.lcsdiff/lcs-diff a b))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
An alternative implementation of clojure.data/diff for sequences.
The implementation in clojure.data/diff treats sequences as indexed tuples, and compares the elements recursively position-by-position.
This implementaiton treats sequences as sequences, and compares the elements non-recursively, using a 'Longest Common Subsequence' algorithm, much like Unix diff on the lines of a file, returning an optimised result that matches as many elements as possible while retaining the order of the elements.
For example, given the [1 2 3] and [2 3], this algorithm returns [1] to be unique to the first list, no elements to be unique to the second list, and [2 3] common to both lists.
The clojure.data/diff algorithm returns that the two lists have nothing in common because they don't share identical elements at any position.
clojure.data/diff:
this implementation: