Skip to content

Instantly share code, notes, and snippets.

@yogthos
Last active August 29, 2015 14:03
Show Gist options
  • Save yogthos/110dc9dcc94831266632 to your computer and use it in GitHub Desktop.
Save yogthos/110dc9dcc94831266632 to your computer and use it in GitHub Desktop.
(ns bloom.core
(:require [com.github.kyleburton.clj-bloom
:refer [make-optimal-filter
optimal-n-and-k
make-permuted-hash-fn
make-hash-fn-crc32
add!
include?]])
(:import [java.util BitSet]))
(def threshold 0.2)
(def filters (atom {}))
(defn get-words [string]
(-> string
clojure.string/lower-case
(clojure.string/split #"\W")))
(defn add-filter! [id string]
(let [words (get-words string)
num-words (count words)]
(if (pos? num-words)
(let [f (make-optimal-filter num-words threshold)]
(doseq [word words] (add! f word))
(swap! filters assoc id f)))))
(defn contains-words? [words f]
(not-empty (filter (partial include? f) words)))
(defn search [search-string]
(let [search-terms (get-words search-string)]
(reduce
(fn [matches [id bloom-filter]]
(if (contains-words? search-terms bloom-filter)
(conj matches id)
matches))
[] @filters)))
(defn long->bitset [value]
(let [bits (BitSet.)]
(loop [cur-val value
index 0]
(if (pos? cur-val)
(do
(when (pos? (mod cur-val 2))
(.set bits index))
(recur (unsigned-bit-shift-right cur-val 1) (inc index)))
bits))))
(defn bitset->long [bits]
(loop [n 0
i (.length bits)]
(if (not= i -1)
(recur (+ n (if (.get bits i) (bit-shift-left 1 i) 0)) (dec i))
n)))
(defn serialize []
(for [[id {:keys [num-bits bitarray insertions]}] @filters]
{:id (name id)
:size num-bits
:bitarray (bitset->long bitarray)
:insertions @insertions}))
(defn deserialize [filters]
(reduce
(fn [m {:keys [id size bitarray insertions]}]
(let [[_ k] (optimal-n-and-k size threshold)]
(assoc m (keyword id)
{:hash-fn (make-permuted-hash-fn make-hash-fn-crc32 (map str (range 0 k)))
:num-bits size
:bitarray (long->bitset bitarray)
:insertions (atom insertions)})))
{} filters))
;; examples
(reset! filters {})
;; add some support notes
(add-filter! :ws-json "hosted on http://somehost01d.uhn.ca:19480/ws-json blah blah")
(add-filter! :tracker "tracker is deployed to http://somehost01d.uhn.ca:19480/tracker something something")
;; search for stuff in them
(search "hosted somehost01d")
(search "foo bar")
(search "tracker")
;; serialize and deserialize and test that stuff still works
;; serialized format will have the following format
;; id -> string
;; size long
;; bitarray long
;; insertions long
(deserialize (serialize))
(reset! filters (deserialize (serialize)))
(search "hosted somehost01d")
(search "foo bar")
(search "tracker")
(defproject bloom "0.1.0-SNAPSHOT"
:description "FIXME: write description"
:url "http://example.com/FIXME"
:license {:name "Eclipse Public License"
:url "http://www.eclipse.org/legal/epl-v10.html"}
:dependencies [[org.clojure/clojure "1.6.0"]
[com.github.kyleburton/clj-bloom "1.0.1"]])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment