Created
June 29, 2012 17:17
-
-
Save warmist/3019332 to your computer and use it in GitHub Desktop.
Bloom implementacija
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <vector> | |
#include <iostream> | |
#include <fstream> | |
using namespace std; | |
vector<bool> bloom; | |
const unsigned int bloomsize=1024; | |
//----------------------------------------------------------------------------- | |
// MurmurHash2, by Austin Appleby | |
// Note - This code makes a few assumptions about how your machine behaves - | |
// 1. We can read a 4-byte value from any address without crashing | |
// 2. sizeof(int) == 4 | |
// And it has a few limitations - | |
// 1. It will not work incrementally. | |
// 2. It will not produce the same results on little-endian and big-endian | |
// machines. | |
struct StringHash | |
{ | |
unsigned int v1,v2,v3; | |
StringHash(const string &a) | |
{ | |
/*v1=sumhash(a,1337)%bloomsize; | |
v2=sumhash(a,13)%bloomsize; | |
v3=sumhash(a,109321)%bloomsize;*/ | |
//* | |
v1=Hash1(a,1337)%bloomsize; | |
v2=Hash1(a,13)%bloomsize; | |
v3=Hash1(a,109321)%bloomsize; | |
//*/ | |
} | |
unsigned int sumhash(const string &a,int seed) | |
{ | |
unsigned int sum=seed; | |
for(size_t i=0;i<a.size();i++) | |
{ | |
sum+= (seed^ a[i])*seed+ (! a[i]); | |
} | |
return sum; | |
} | |
unsigned int Hash1(const string& a,int seed) | |
{ | |
string b=a; | |
if(b.size()%4!=0) | |
b.append(" ",4-b.size()%4); | |
return MurmurHash2(b.c_str(),b.size(),seed); | |
} | |
unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) | |
{ | |
// 'm' and 'r' are mixing constants generated offline. | |
// They're not really 'magic', they just happen to work well. | |
const unsigned int m = 0x5bd1e995; | |
const int r = 24; | |
// Initialize the hash to a 'random' value | |
unsigned int h = seed ^ len; | |
// Mix 4 bytes at a time into the hash | |
const unsigned char * data = (const unsigned char *)key; | |
while(len >= 4) | |
{ | |
unsigned int k = *(unsigned int *)data; | |
k *= m; | |
k ^= k >> r; | |
k *= m; | |
h *= m; | |
h ^= k; | |
data += 4; | |
len -= 4; | |
} | |
// Handle the last few bytes of the input array | |
switch(len) | |
{ | |
case 3: | |
h ^= data[2] << 16; | |
case 2: | |
h ^= data[1] << 8; | |
case 1: | |
h ^= data[0]; | |
h *= m; | |
}; | |
// Do a few final mixes of the hash to ensure the last few | |
// bytes are well-incorporated. | |
h ^= h >> 13; | |
h *= m; | |
h ^= h >> 15; | |
return h; | |
} | |
bool Contains(vector<bool>& vec) | |
{ | |
return vec[v1] && vec[v2] && vec[v3]; | |
} | |
bool Put(vector<bool>&vec) | |
{ | |
vec[v1]=true; | |
vec[v2]=true; | |
vec[v3]=true; | |
} | |
}; | |
int main() | |
{ | |
bloom.resize(bloomsize,false); | |
fstream input("bloom-words.txt",ios::in); | |
while(!input.eof()) | |
{ | |
string in; | |
input>>in; | |
StringHash H(in); | |
H.Put(bloom); | |
//cout<<in<<" :"<<H.v1<<" "<<H.v2<<" "<<H.v3<<endl; | |
} | |
fstream input2("bloom-test.txt"); | |
while(!input2.eof()) | |
{ | |
string in; | |
input2>>in; | |
StringHash H(in); | |
cout<<in<<" is in:"<<H.Contains(bloom)<<endl; | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment