Created
January 7, 2011 02:26
-
-
Save techbelly/769015 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
from BitVector import BitVector | |
class BloomFilter: | |
def __init__(self,bits,hashes,digest=hashlib.md5): | |
self.bits = bits | |
self.hashes = hashes | |
self.digest = digest | |
self.check_digest(bits,hashes,digest) | |
self.bloom = BitVector(intVal = 0, size = (2**bits)) | |
# self.bloom = [False] * (2**bits) | |
def check_digest(self,bits,hashes,digest): | |
if digest().digest_size*8 < bits * hashes: | |
raise ValueError( \ | |
'Digest size of %d bytes, for %d hashes of %d bits not enough' % \ | |
(digest().digest_size, hashes, bits)) | |
def hashes_for(self,string): | |
def chunks(l, n): | |
for i in xrange(0, len(l), n): | |
yield l[i:i+n] | |
def first(n,iter): | |
return (x for i,x in enumerate(iter) if i < n) | |
m = self.digest() | |
m.update(string.lower()) | |
as_int = int(m.hexdigest(),16) | |
as_bin = bin(as_int).lstrip('0b') | |
return [int(chunk,2) for chunk in first(self.hashes,chunks(as_bin,self.bits))] | |
def insert(self,string): | |
for i in self.hashes_for(string): | |
self.bloom[i] = 1 | |
def contains(self,string): | |
return all(self.bloom[i] for i in self.hashes_for(string)) | |
bits = 20 # | |
hashes = 4 | |
digest = hashlib.md5 | |
bloom = BloomFilter(bits, hashes, digest) | |
print "LOADING DICTIONARY" | |
for l in open('/usr/share/dict/words'): | |
bloom.insert(l.strip()) | |
def misspelt_words_in(text): | |
return [word for word in text.split() if not bloom.contains(word)] | |
text = """The bits of a bit array are stored in 16-bit unsigned ints. After | |
resolving the argument with which the constructor is called (which | |
happens in lines (A2) through (A70) of the file BitVector.py), the | |
very first thing that the constructor does is to figure out in line | |
(A78) as to how many of those 2-byte ints it needs for the bits. | |
For example, if you wanted to store a 64-bit array, the variable | |
'two_byte_ints_needed' in line (A78) would be set to 4. (This does | |
not mean that the size of a bit vector must be a multiple of 16. | |
Any sized bit vectors can constructed using the required number of | |
two-byte ints.) Line (A79) then creates an array of 2-byte ints and | |
initializes it with the required number of zeros. Lines (A80) then | |
shifts the bits into the array of two-byte ints. | |
This is a taest of sume mispelling and zarf""" | |
# More a test of words in /usr/share/dict/words than a spelling test, unfortunately. | |
print misspelt_words_in(text) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment