Created
October 29, 2020 02:44
-
-
Save boyter/0cffa9ff8e2e7259d455594d744f1164 to your computer and use it in GitHub Desktop.
Bloom filter example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Primitive hash function that for a string returns a positive 32 bit int | |
// Do not use in production, use murmur3 or fnv1 | |
// You can improve this by changing 5 to 31 | |
Object.defineProperty(String.prototype, 'hashCode', { | |
value: function() { | |
var hash = 0, i, chr; | |
for (i = 0; i < this.length; i++) { | |
chr = this.charCodeAt(i); | |
hash = ((hash << 5) - hash) + chr; | |
hash |= 0; // Convert to 32bit integer | |
} | |
return Math.abs(hash); | |
} | |
}); | |
// start out filter out with 16 0's indicating its empty | |
// note that this is wasteful because we are using numbers | |
// ideally you would use boolean or bits to reduce the space | |
// and using bits allows you to do all sorts of interesting things... | |
var bloom = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]; | |
// add some words to the filter using a single hash | |
// we use % to convert our 32 bit number into one of 16 values | |
// for the length of our bloom filter | |
bloom["something".hashCode() % bloom.length] = 1; | |
bloom["another".hashCode() % bloom.length] = 1; | |
// lets check if notin is in the filter | |
if (bloom["notin".hashCode() % bloom.length] == 1) { | |
console.log("notin may be in set..."); | |
} else { | |
console.log("notin is not in set"); | |
} | |
// check if something is in the filter | |
if (bloom["something".hashCode() % bloom.length] == 1) { | |
console.log("something may be in set..."); | |
} else { | |
console.log("something is not in set"); | |
} | |
// check if boyter is in the filter | |
// this is a false positive! boyter was never added | |
if (bloom["boyter".hashCode() % bloom.length] == 1) { | |
console.log("boyter may be in set... FALSE POSITIVE it was never added"); | |
} else { | |
console.log("boyter is not in set"); | |
} | |
// lets reset and try a two hash filter where each term is hashed twice to produce different output | |
var bloom = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]; | |
var term = "something"; | |
bloom[(term + "-one").hashCode() % bloom.length] = 1; | |
bloom[(term + "-two").hashCode() % bloom.length] = 1; | |
// note that two bits are now set | |
console.log(bloom); | |
var term = "another"; | |
bloom[(term + "-one").hashCode() % bloom.length] = 1; | |
bloom[(term + "-two").hashCode() % bloom.length] = 1; | |
// note that we can set the same bits | |
console.log(bloom) | |
// lets check if boyter still gives us a false positive | |
var term = "boyter"; | |
var bit1 = (term + "-one").hashCode() % bloom.length; | |
var bit2 = (term + "-two").hashCode() % bloom.length; | |
if (bloom[bit1] == 1 && bloom[bit2] == 1) { | |
console.log("boyter may be in set... FALSE POSITIVE"); | |
} else { | |
console.log("boyter is not in set"); | |
} | |
// lets reset and try a two hash filter with a much bigger bloom filter, and then try checking with many tests | |
var bloom = []; | |
for (var i=0; i<100; i++) { | |
bloom.push(0) | |
} | |
// add two terms | |
var term = "something"; | |
bloom[(term + "-one").hashCode() % bloom.length] = 1; | |
bloom[(term + "-two").hashCode() % bloom.length] = 1; | |
var term = "another"; | |
bloom[(term + "-one").hashCode() % bloom.length] = 1; | |
bloom[(term + "-two").hashCode() % bloom.length] = 1; | |
// lets try 1000 terms and see how many false positives we get | |
var falsePositive = 0; | |
for (var i=0; i<1000; i++) { | |
var term = "" + i; | |
var bit1 = (term + "-one").hashCode() % bloom.length; | |
var bit2 = (term + "-two").hashCode() % bloom.length; | |
if (bloom[bit1] == 1 && bloom[bit2] == 1) { | |
falsePositive++; | |
} | |
} | |
console.log("falsePositive:", falsePositive); | |
// lets reset with a much much bigger bloom filter and a lot more terms and see what happens | |
var bloom = []; | |
for (var i=0; i<100000; i++) { | |
bloom.push(0) | |
} | |
for (var i=0; i<1000; i++) { | |
var term = "something" + i; | |
var bit1 = (term + "-one").hashCode() % bloom.length; | |
var bit2 = (term + "-two").hashCode() % bloom.length; | |
bloom[bit1] = 1; | |
bloom[bit2] = 1; | |
} | |
var falsePositive = 0; | |
for (var i=0; i<1000; i++) { | |
var term = "" + i; | |
var bit1 = (term + "-one").hashCode() % bloom.length; | |
var bit2 = (term + "-two").hashCode() % bloom.length; | |
if (bloom[bit1] == 1 && bloom[bit2] == 1) { | |
falsePositive++; | |
} | |
} | |
console.log("falsePositive:", falsePositive); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment