-
-
Save fogus/1125874 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns fraud.detectors.bktree) | |
(defn root [distance-fn] {:distance-fn distance-fn :children {} :matches []}) | |
(defn new-node [term] {:term term :children {} :matches []}) | |
(defn empty-node? [node] | |
(= nil (:term node))) | |
(defn insert | |
([element node] | |
(insert element node (:distance-fn node))) | |
([element node distance-fn] | |
(if (empty-node? node) | |
(assoc node :term element) | |
(let [dist (distance-fn (:term node) element) | |
children (:children node) | |
child (get children dist)] | |
(if (= 0 dist) | |
(merge-with conj node {:matches element}) | |
(if child | |
(assoc-in node [:children dist] (insert element child distance-fn)) | |
(assoc-in node [:children dist] (new-node element))))) | |
))) | |
(defn query | |
([term threshold node] | |
(flatten (query term threshold node (:distance-fn node) []))) | |
([term threshold node distance-fn matches] | |
(let [dist (distance-fn (:term node) term) | |
query-range (range (- dist threshold) (+ 1 (+ dist threshold))) | |
to-query (remove nil? (map (fn [score] (get (:children node) score)) query-range))] | |
(concat (if (<= dist threshold) [node]) | |
matches | |
(map (fn [n] (query term threshold n distance-fn matches)) to-query))))) | |
(defn treemap [a-fn root] | |
(if (:term root) | |
(map a-fn (tree-seq (fn is-branch? [node] (seq (:children node))) | |
(fn children [node] (vals (:children node))) | |
root)) | |
(list))) | |
;; Specs | |
(ns fraud.detectors.bktree-spec | |
(:use | |
[fraud.spec-helper] | |
[fraud.detectors.bktree :as bk] | |
[fraud.detectors.hamming] | |
; [fraud.detectors.levenshtein] | |
; [fraud.detectors.lcss] | |
; [fraud.detectors.n_grams] | |
[speclj.core])) | |
(defn should-have-same-contents [expected actual] | |
(should= (sort expected) (sort actual))) | |
(def chosen-distance-function hamming-distance) | |
(describe "building a tree" | |
(it "uses the first term as the root" | |
(let [tree (bk/insert "a" (bk/root chosen-distance-function))] | |
(should= "a" (:term tree)) | |
(should= 0 (count (:children tree))))) | |
(with tree (->> (bk/root chosen-distance-function) | |
(bk/insert "a") | |
(bk/insert "b") | |
(bk/insert "b"))) | |
(it "collects matches in a matched node" | |
(should-have-same-contents ["b"] (:matches (get (:children @tree) 1)))) | |
(it "inserts terms as child nodes in a sub-tree based on the node distance from the current node" | |
(should= "b" (:term (get (:children @tree) 1)))) | |
) | |
(describe "mapping a tree" | |
(with tree (bk/insert "b" (bk/insert "b" (bk/insert "a" (bk/root chosen-distance-function))))) | |
(it "maps the tree into a vector of nodes" | |
(should-have-same-contents ["a" "b"] (bk/treemap (fn [n] (:term n)) @tree))) | |
(it "returns an empty list when the tree is empty" | |
(should= true (empty? (bk/treemap (fn [n] n) (bk/root chosen-distance-function)))))) | |
(describe "querying a tree" | |
(with tree (->> (bk/root chosen-distance-function) | |
(bk/insert "a") | |
(bk/insert "b") | |
(bk/insert "c") | |
(bk/insert "bb"))) | |
(it "finds all nodes with a distance of 0 from the query term" | |
(let [result (bk/query "a" 0 @tree)] | |
(should-have-same-contents ["a"] (map :term result))) | |
(let [result (bk/query "c" 0 @tree)] | |
(should-have-same-contents ["c"] (map :term result)))) | |
(it "finds all nodes with a distance of 1 from the query term" | |
(let [result (bk/query "a" 1 @tree)] | |
(should-have-same-contents ["a" "b" "c"] (map :term result)))) | |
(it "works for Bo's experiment" | |
(let [tree (->> (bk/root chosen-distance-function) | |
(bk/insert "aa") | |
(bk/insert "ab") | |
(bk/insert "ba")) | |
result (bk/query "aa" 1 tree)] | |
(should-have-same-contents ["aa" "ab" "ba"] (map :term result)))) | |
(describe "with Kevin's big tree" | |
(with tree (->> (bk/root chosen-distance-function) | |
(bk/insert "book") | |
(bk/insert "rook") | |
(bk/insert "nook") | |
(bk/insert "took") | |
(bk/insert "look") | |
(bk/insert "shook") | |
(bk/insert "hand") | |
(bk/insert "sand") | |
(bk/insert "handle") | |
(bk/insert "handler") | |
(bk/insert "handles") | |
(bk/insert "handlers") | |
(bk/insert "bland"))) | |
(it "finds all terms 1 step from the root" | |
(let [result (bk/query "book" 1 @tree)] | |
(should-have-same-contents ["book" "rook" "nook" "took" "look"] (map :term result)))) | |
(it "finds all terms including root 1 step from the leaf of a branch" | |
(let [result (bk/query "look" 1 @tree)] | |
(should-have-same-contents ["book" "rook" "nook" "took" "look"] (map :term result)))) | |
(it "finds all terms including root 1 step from the query, but on an alternate branch" | |
(let [result (bk/query "sand" 1 @tree)] | |
(should-have-same-contents ["sand" "hand"] (map :term result)))) | |
(it "finds all terms including root 1 step from a leaf node on an alternate subtree" | |
(let [result (bk/query "hand" 1 @tree)] | |
(should-have-same-contents ["sand" "hand"] (map :term result)))) | |
; FAILS because we're using a padded hamming distance as the query fn, we should use something else | |
; (it "finds all terms that are two steps away from a node" | |
; (let [result (bk/query "shook" 2 @tree)] | |
; (should-have-same-contents ["book" "rook" "nook" "took" "look" "shook"] (map :term result)))) | |
; (it "finds results that have common suffix within threshold" | |
; (let [result (bk/query "bland" 2 @tree)] | |
; (should-have-same-contents ["bland" "sand" "hand"] (map :term result)))) | |
; (it "finds results that have common prefix within threshold" | |
; (let [result (bk/query "handle" 2 @tree)] | |
; (should-have-same-contents ["hand" "handle" "handler" "handles" "handlers"] (map :term result)))) | |
)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment