Last active
August 29, 2015 14:03
-
-
Save yogthos/110dc9dcc94831266632 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns bloom.core | |
(:require [com.github.kyleburton.clj-bloom | |
:refer [make-optimal-filter | |
optimal-n-and-k | |
make-permuted-hash-fn | |
make-hash-fn-crc32 | |
add! | |
include?]]) | |
(:import [java.util BitSet])) | |
(def threshold 0.2) | |
(def filters (atom {})) | |
(defn get-words [string] | |
(-> string | |
clojure.string/lower-case | |
(clojure.string/split #"\W"))) | |
(defn add-filter! [id string] | |
(let [words (get-words string) | |
num-words (count words)] | |
(if (pos? num-words) | |
(let [f (make-optimal-filter num-words threshold)] | |
(doseq [word words] (add! f word)) | |
(swap! filters assoc id f))))) | |
(defn contains-words? [words f] | |
(not-empty (filter (partial include? f) words))) | |
(defn search [search-string] | |
(let [search-terms (get-words search-string)] | |
(reduce | |
(fn [matches [id bloom-filter]] | |
(if (contains-words? search-terms bloom-filter) | |
(conj matches id) | |
matches)) | |
[] @filters))) | |
(defn long->bitset [value] | |
(let [bits (BitSet.)] | |
(loop [cur-val value | |
index 0] | |
(if (pos? cur-val) | |
(do | |
(when (pos? (mod cur-val 2)) | |
(.set bits index)) | |
(recur (unsigned-bit-shift-right cur-val 1) (inc index))) | |
bits)))) | |
(defn bitset->long [bits] | |
(loop [n 0 | |
i (.length bits)] | |
(if (not= i -1) | |
(recur (+ n (if (.get bits i) (bit-shift-left 1 i) 0)) (dec i)) | |
n))) | |
(defn serialize [] | |
(for [[id {:keys [num-bits bitarray insertions]}] @filters] | |
{:id (name id) | |
:size num-bits | |
:bitarray (bitset->long bitarray) | |
:insertions @insertions})) | |
(defn deserialize [filters] | |
(reduce | |
(fn [m {:keys [id size bitarray insertions]}] | |
(let [[_ k] (optimal-n-and-k size threshold)] | |
(assoc m (keyword id) | |
{:hash-fn (make-permuted-hash-fn make-hash-fn-crc32 (map str (range 0 k))) | |
:num-bits size | |
:bitarray (long->bitset bitarray) | |
:insertions (atom insertions)}))) | |
{} filters)) | |
;; examples | |
(reset! filters {}) | |
;; add some support notes | |
(add-filter! :ws-json "hosted on http://somehost01d.uhn.ca:19480/ws-json blah blah") | |
(add-filter! :tracker "tracker is deployed to http://somehost01d.uhn.ca:19480/tracker something something") | |
;; search for stuff in them | |
(search "hosted somehost01d") | |
(search "foo bar") | |
(search "tracker") | |
;; serialize and deserialize and test that stuff still works | |
;; serialized format will have the following format | |
;; id -> string | |
;; size long | |
;; bitarray long | |
;; insertions long | |
(deserialize (serialize)) | |
(reset! filters (deserialize (serialize))) | |
(search "hosted somehost01d") | |
(search "foo bar") | |
(search "tracker") | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defproject bloom "0.1.0-SNAPSHOT" | |
:description "FIXME: write description" | |
:url "http://example.com/FIXME" | |
:license {:name "Eclipse Public License" | |
:url "http://www.eclipse.org/legal/epl-v10.html"} | |
:dependencies [[org.clojure/clojure "1.6.0"] | |
[com.github.kyleburton/clj-bloom "1.0.1"]]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment