Created
April 1, 2014 04:56
-
-
Save TheRayTracer/9907950 to your computer and use it in GitHub Desktop.
A simple implementation of a Bloom Filter using two hash functions. A Bloom Filter is used to test whether an element is a member of a set. False positive matches are possible, but false negatives are not; i.e. a query returns either "possibly in set" or "definitely not in set". This sample shows an example of a false positive.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <bitset> | |
#include <vector> | |
#include <cassert> | |
namespace std { } | |
using namespace std; | |
typedef size_t (*pHashFunc)(const char*); | |
size_t hash_jenkins(const char* s) | |
{ | |
size_t hash = 0; | |
while (*s != '\0') | |
{ | |
hash += *s++; | |
hash += (hash << 10); | |
hash ^= (hash >> 6); | |
} | |
hash += (hash << 3); | |
hash ^= (hash >> 11); | |
hash += (hash << 15); | |
return hash; | |
} | |
size_t hash_pearson(const char* s) | |
{ | |
static const unsigned char T[256] = { | |
98, 6, 85,150, 36, 23,112,164,135,207,169, 5, 26, 64,165,219, // 1 | |
61, 20, 68, 89,130, 63, 52,102, 24,229,132,245, 80,216,195,115, // 2 | |
90,168,156,203,177,120, 2,190,188, 7,100,185,174,243,162, 10, // 3 | |
237, 18,253,225, 8,208,172,244,255,126,101, 79,145,235,228,121, // 4 | |
123,251, 67,250,161, 0,107, 97,241,111,181, 82,249, 33, 69, 55, // 5 | |
59,153, 29, 9,213,167, 84, 93, 30, 46, 94, 75,151,114, 73,222, // 6 | |
197, 96,210, 45, 16,227,248,202, 51,152,252,125, 81,206,215,186, // 7 | |
39,158,178,187,131,136, 1, 49, 50, 17,141, 91, 47,129, 60, 99, // 8 | |
154, 35, 86,171,105, 34, 38,200,147, 58, 77,118,173,246, 76,254, // 9 | |
133,232,196,144,198,124, 53, 4,108, 74,223,234,134,230,157,139, // 10 | |
189,205,199,128,176, 19,211,236,127,192,231, 70,233, 88,146, 44, // 11 | |
183,201, 22, 83, 13,214,116,109,159, 32, 95,226,140,220, 57, 12, // 12 | |
221, 31,209,182,143, 92,149,184,148, 62,113, 65, 37, 27,106,166, // 13 | |
3, 14,204, 72, 21, 41, 56, 66, 28,193, 40,217, 25, 54,179,117, // 14 | |
238, 87,240,155,180,170,242,212,191,163, 78,218,137,194,175,110, // 15 | |
43,119,224, 71,122,142, 42,160,104, 48,247,103, 15, 11,138,239 // 16 | |
}; | |
size_t hash = 0; | |
while (*s != '\0') | |
{ | |
size_t index = hash ^ *s++; | |
hash = T[index]; | |
} | |
return hash; | |
} | |
template<size_t NumberOfBits> | |
class bloom_filter | |
{ | |
public: | |
void add_hash_function(pHashFunc hash); | |
void clear_hash_functions(); | |
void add(const char* element); | |
bool contains(const char* element); | |
private: | |
vector<pHashFunc> hash_functions; | |
bitset<NumberOfBits> store; | |
}; | |
template<size_t NumberOfBits> | |
void bloom_filter<NumberOfBits>::add_hash_function(pHashFunc hash) | |
{ | |
hash_functions.push_back(hash); | |
return; | |
} | |
template<size_t NumberOfBits> | |
void bloom_filter<NumberOfBits>::clear_hash_functions() | |
{ | |
hash_functions.clear(); | |
return; | |
} | |
template<size_t NumberOfBits> | |
void bloom_filter<NumberOfBits>::add(const char* element) | |
{ | |
for (size_t i = 0; i < hash_functions.size(); ++i) | |
{ | |
size_t hash = hash_functions[i](element); | |
size_t bit = hash % NumberOfBits; | |
store.set(bit); | |
} | |
return; | |
} | |
template<size_t NumberOfBits> | |
bool bloom_filter<NumberOfBits>::contains(const char* element) | |
{ | |
for (size_t i = 0; i < hash_functions.size(); ++i) | |
{ | |
size_t hash = hash_functions[i](element); | |
size_t bit = hash % NumberOfBits; | |
if (store.test(bit) == false) | |
{ | |
return false; | |
} | |
} | |
return true; // Possibly in set. | |
} | |
int main(int argc, char* argv[]) | |
{ | |
bloom_filter<10000> bloom; | |
bloom.add_hash_function(hash_jenkins); | |
bloom.add_hash_function(hash_pearson); | |
bloom.add("afa"); | |
bloom.add("arm"); | |
bloom.add("cat"); | |
bloom.add("dig"); | |
bloom.add("dog"); | |
assert(bloom.contains("owl") == false); // Not in the list (TN). | |
assert(bloom.contains("cat") != false); // In the list (TP). | |
assert(bloom.contains("icv") != false); // Not in the list (FP) - "afa" and "icv" set the same bits using both the Jenkin Hash function, and the Pearson Hash function. | |
cout << "Bloom filter checks were successful." << endl; | |
cin.get(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment