Created
March 6, 2010 20:14
-
-
Save fberger/323911 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <fstream> | |
#include <algorithm> | |
#include <map> | |
#include <vector> | |
#include <iterator> | |
#include <numeric> | |
#include <string.h> | |
using namespace std; | |
class BKNode; | |
/** | |
* Implementation of levenshtein distance that avoids any heap allocations and | |
* only needs O(2 * min(length(one), length(two))) space. | |
*/ | |
int levenshtein(const string& one, const string& two) { | |
if (one.length() > two.length()) { | |
return levenshtein(two, one); | |
} | |
int length1 = one.length() + 1; | |
int length2 = two.length() + 1; | |
int data1[length1]; | |
int data2[length1]; | |
int* current = data1; | |
int* previous = data2; | |
memset(data1, 0, length1 * sizeof(int)); | |
for (int i = 0; i < length1; i++) { | |
data2[i] = i; | |
} | |
for (int i = 1; i < length2; i++) { | |
if (i > 1) { | |
memset(previous, 0, length1 * sizeof(int)); | |
int* tmp = previous; | |
previous = current; | |
current = tmp; | |
} | |
current[0] = i; | |
for (int j = 1; j < length1; j++) { | |
current[j] = min(previous[j] + 1, min(current[j-1] + 1, previous[j-1] + | |
(one[j-1] != two[i-1] ? 1 : 0))); | |
} | |
} | |
return current[length1-1]; | |
} | |
/** | |
* Node structure of the BK-tree. | |
*/ | |
class BKNode { | |
private: | |
string value; | |
map<int, BKNode> children; | |
static int count; | |
public: | |
BKNode(const string& value) | |
: value(value) { | |
} | |
BKNode() { | |
} | |
/** | |
* Inserts a value into the tree. | |
*/ | |
void insert(const string& value) { | |
int distance = levenshtein(this->value, value); | |
if (distance == 0) { | |
return; | |
} | |
map<int,BKNode>::const_iterator i(children.find(distance)); | |
if (i != children.end()) { | |
// use [] operator to avoid copy constructor | |
children[distance].insert(value); | |
} else { | |
// only copy constructor call of BKNode | |
children[distance] = BKNode(value); | |
} | |
} | |
/** | |
* Computes minimum distance between value and this node or its children. | |
*/ | |
int minimumTreeDistance(const string& value, int currentMin) { | |
++count; // count invocations | |
int distance = levenshtein(this->value, value); | |
if (distance == 0 || children.empty()) { | |
return distance; | |
} | |
currentMin = min(distance, currentMin); | |
map<int,BKNode>::const_iterator i(children.find(distance)); | |
if (i != children.end()) { | |
currentMin = min(children[distance].minimumTreeDistance(value, currentMin), | |
currentMin); | |
} | |
if (currentMin <= 1) { | |
return currentMin; | |
} | |
for (int i = distance - currentMin + 1; i < distance + currentMin; ++i) { | |
if (i != distance && children.find(i) != children.end()) { | |
currentMin = min(children[i].minimumTreeDistance(value, currentMin), | |
currentMin); | |
if (currentMin <= 1) { | |
return currentMin; | |
} | |
} | |
} | |
return currentMin; | |
} | |
}; | |
int BKNode::count = 0; | |
/** | |
* Implementation of a Burkhard Keller tree for efficient retrieval | |
* of string values based on the Levenshtein distance as metric. | |
* | |
* Most logic is contained in BKNode class. | |
*/ | |
class BKTree : public unary_function<int, const string&> { | |
private: | |
BKNode root; | |
int count; | |
public: | |
BKTree(const string& rootValue) | |
: root(BKNode(rootValue)) { | |
} | |
void insert(const string& value) { | |
root.insert(value); | |
++count; | |
} | |
/** | |
* Computes minimal distance between value and any value in the BK-tree. | |
*/ | |
int operator() (const string& value) { | |
int distance = root.minimumTreeDistance(value, value.length()); | |
#ifdef VERBOSE | |
cout << value << " " << BKNode::count << endl; | |
BKNode::count = 0; | |
#endif | |
return distance; | |
} | |
int size() const { | |
return count; | |
} | |
}; | |
/** | |
* Trims white space on the right hand side of string. Manipulates | |
* string argument. | |
*/ | |
string& rtrim(string& value) { | |
size_t last = value.find_last_not_of(" \n\r\t"); | |
if (last != string::npos) { | |
value.erase(last+1); | |
} | |
return value; | |
} | |
int main(int argc, char** argv) { | |
//ifstream file("/tmp/twl06.txt"); | |
ifstream file("/var/tmp/twl06.txt"); | |
// build tree from correct words | |
BKTree tree(""); | |
if (file.is_open() && !file.eof()) { | |
string line; | |
getline(file, line); | |
tree = BKTree(rtrim(line)); | |
while (!file.eof()) { | |
getline(file, line); | |
tree.insert(rtrim(line)); | |
} | |
file.close(); | |
} | |
ifstream input(argv[1]); | |
if (input.is_open()) { | |
vector<string> tokens; | |
// read words from input file | |
copy(istream_iterator<string>(input), istream_iterator<string>(), | |
back_inserter<vector <string> >(tokens)); | |
input.close(); | |
// trim potential \r's | |
transform(tokens.begin(), tokens.end(), tokens.begin(), rtrim); | |
// convert tokens to upper case | |
for (vector<string>::iterator i = tokens.begin(); i != tokens.end(); i++) { | |
transform((*i).begin(), (*i).end(), (*i).begin(), (int(*)(int))toupper); | |
} | |
// mapp to distances from tree | |
vector<int> distances; | |
transform(tokens.begin(), tokens.end(), back_inserter<vector <int> >(distances), tree); | |
// sum distances | |
cout << accumulate(distances.begin(), distances.end(), 0) << endl; | |
} | |
} | |
/* | |
* Local Variables: | |
* compile-command: "g++ -O3 -o breathalyzer breathalyzer.cpp" | |
* End: | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment