Created
October 21, 2025 21:01
-
-
Save abailly/556d326ae3ab2217c07bc196dc3c0591 to your computer and use it in GitHub Desktop.
A simple bloom filter with some interesting properties
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| module BloomSpec where | |
| import Data.Bits (setBit, (.&.), (.|.)) | |
| import Data.Function ((&)) | |
| import Test.Hspec (Spec) | |
| import Test.Hspec.QuickCheck (modifyMaxDiscardRatio, modifyMaxSuccess, prop) | |
| import Test.QuickCheck | |
| ( Confidence (..), | |
| Gen, | |
| Property, | |
| checkCoverageWith, | |
| counterexample, | |
| coverTable, | |
| elements, | |
| forAll, | |
| listOf1, | |
| property, | |
| stdConfidence, | |
| suchThat, | |
| tabulate, | |
| vectorOf, | |
| (==>), | |
| ) | |
| import Prelude hiding (Word) | |
| spec :: Spec | |
| spec = do | |
| prop "always return true if word in filter" $ prop_alwaysTrueIfInFilter | |
| prop "hash function has consistent collision rate" $ prop_has_consistent_collision_rate | |
| modifyMaxSuccess (const 1) $ | |
| modifyMaxDiscardRatio (const 10) $ | |
| prop "return true if word not in filter with some probability" $ | |
| prop_mayReturnTrueIfNotInFilter | |
| prop_has_consistent_collision_rate :: Property | |
| prop_has_consistent_collision_rate = | |
| forAll anyWord $ \w -> | |
| forAll anyWord $ \w' -> | |
| (w /= w') ==> | |
| let hash = hashWithSalt 1 w `mod` 100 | |
| hash' = hashWithSalt 1 w' `mod` 100 | |
| in property True | |
| & tabulate "uniqueHashes" [show (hash == hash')] | |
| & coverTable "uniqueHashes" [("True", 1), ("False", 99)] | |
| & checkCoverageWith (stdConfidence {tolerance = 0.8}) | |
| prop_mayReturnTrueIfNotInFilter :: Property | |
| prop_mayReturnTrueIfNotInFilter = | |
| forAll dictionary $ \dict -> | |
| let bloom = makeBloomFilter m k dict | |
| n = fromIntegral $ length dict | |
| m = 2 ^ 13 | |
| k = 3 | |
| in forAll (anyWord `suchThat` (`notElem` dict)) $ \w -> | |
| let result = bloom `contains` w | |
| falsePositiveProbability = 100.0 * (1 - exp (negate (fromIntegral k * n / fromIntegral m))) ** fromIntegral k | |
| in property True | |
| & tabulate "result" [show result] | |
| & coverTable "result" [("True", falsePositiveProbability), ("False", 100 - falsePositiveProbability)] | |
| & counterexample | |
| ( "False positive probability: " | |
| ++ show falsePositiveProbability | |
| ++ "%, m: " | |
| ++ show m | |
| ++ ", n: " | |
| ++ show n | |
| ++ ", k: " | |
| ++ show k | |
| ) | |
| & checkCoverageWith (stdConfidence {tolerance = 0.5}) | |
| prop_alwaysTrueIfInFilter :: Property | |
| prop_alwaysTrueIfInFilter = | |
| forAll dictionary $ \dict -> | |
| let bloom = makeBloomFilter m k dict | |
| m = 2 ^ 12 | |
| k = 1 | |
| in forAll (elements dict) $ \w -> | |
| bloom `contains` w | |
| -- * The Code | |
| contains :: BloomFilter -> Word -> Bool | |
| contains (BloomFilter m k bitset) word = | |
| let wordBits = bitsForWord m k word bitset | |
| in (bitset .&. wordBits) == wordBits | |
| anyWord :: Gen Word | |
| anyWord = fmap Word (listOf1 (elements ['a' .. 'z'])) | |
| newtype Word = Word String | |
| deriving (Show, Eq) | |
| data BloomFilter = BloomFilter {m :: Int, k :: Int, bitset :: Integer} | |
| deriving (Show, Eq) | |
| makeBloomFilter :: Int -> Int -> [Word] -> BloomFilter | |
| makeBloomFilter m k = go 0 | |
| where | |
| go bitset [] = BloomFilter m k bitset | |
| go bitset (w : ws) = | |
| let wordBits = bitsForWord m k w bitset | |
| newBitset = bitset .|. wordBits | |
| in go newBitset ws | |
| bitsForWord :: Int -> Int -> Word -> Integer -> Integer | |
| bitsForWord m k w bitset = | |
| foldl | |
| ( \bs i -> | |
| let h = hashWithSalt i w | |
| bit = h `mod` m | |
| in setBit bs bit | |
| ) | |
| bitset | |
| [0 .. k - 1] | |
| hashWithSalt :: Int -> Word -> Int | |
| hashWithSalt salt (Word s) = | |
| foldl (\h c -> h * 31 + fromEnum c + salt) 0 s | |
| dictionary :: Gen [Word] | |
| dictionary = vectorOf 1000 anyWord |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment