Skip to content

Instantly share code, notes, and snippets.

@fberger
Created March 6, 2010 20:14
Show Gist options
  • Save fberger/323911 to your computer and use it in GitHub Desktop.
Save fberger/323911 to your computer and use it in GitHub Desktop.
#include <iostream>
#include <fstream>
#include <algorithm>
#include <map>
#include <vector>
#include <iterator>
#include <numeric>
#include <string.h>
using namespace std;
class BKNode;
/**
* Implementation of levenshtein distance that avoids any heap allocations and
* only needs O(2 * min(length(one), length(two))) space.
*/
int levenshtein(const string& one, const string& two) {
if (one.length() > two.length()) {
return levenshtein(two, one);
}
int length1 = one.length() + 1;
int length2 = two.length() + 1;
int data1[length1];
int data2[length1];
int* current = data1;
int* previous = data2;
memset(data1, 0, length1 * sizeof(int));
for (int i = 0; i < length1; i++) {
data2[i] = i;
}
for (int i = 1; i < length2; i++) {
if (i > 1) {
memset(previous, 0, length1 * sizeof(int));
int* tmp = previous;
previous = current;
current = tmp;
}
current[0] = i;
for (int j = 1; j < length1; j++) {
current[j] = min(previous[j] + 1, min(current[j-1] + 1, previous[j-1] +
(one[j-1] != two[i-1] ? 1 : 0)));
}
}
return current[length1-1];
}
/**
* Node structure of the BK-tree.
*/
class BKNode {
private:
string value;
map<int, BKNode> children;
static int count;
public:
BKNode(const string& value)
: value(value) {
}
BKNode() {
}
/**
* Inserts a value into the tree.
*/
void insert(const string& value) {
int distance = levenshtein(this->value, value);
if (distance == 0) {
return;
}
map<int,BKNode>::const_iterator i(children.find(distance));
if (i != children.end()) {
// use [] operator to avoid copy constructor
children[distance].insert(value);
} else {
// only copy constructor call of BKNode
children[distance] = BKNode(value);
}
}
/**
* Computes minimum distance between value and this node or its children.
*/
int minimumTreeDistance(const string& value, int currentMin) {
++count; // count invocations
int distance = levenshtein(this->value, value);
if (distance == 0 || children.empty()) {
return distance;
}
currentMin = min(distance, currentMin);
map<int,BKNode>::const_iterator i(children.find(distance));
if (i != children.end()) {
currentMin = min(children[distance].minimumTreeDistance(value, currentMin),
currentMin);
}
if (currentMin <= 1) {
return currentMin;
}
for (int i = distance - currentMin + 1; i < distance + currentMin; ++i) {
if (i != distance && children.find(i) != children.end()) {
currentMin = min(children[i].minimumTreeDistance(value, currentMin),
currentMin);
if (currentMin <= 1) {
return currentMin;
}
}
}
return currentMin;
}
};
int BKNode::count = 0;
/**
* Implementation of a Burkhard Keller tree for efficient retrieval
* of string values based on the Levenshtein distance as metric.
*
* Most logic is contained in BKNode class.
*/
class BKTree : public unary_function<int, const string&> {
private:
BKNode root;
int count;
public:
BKTree(const string& rootValue)
: root(BKNode(rootValue)) {
}
void insert(const string& value) {
root.insert(value);
++count;
}
/**
* Computes minimal distance between value and any value in the BK-tree.
*/
int operator() (const string& value) {
int distance = root.minimumTreeDistance(value, value.length());
#ifdef VERBOSE
cout << value << " " << BKNode::count << endl;
BKNode::count = 0;
#endif
return distance;
}
int size() const {
return count;
}
};
/**
* Trims white space on the right hand side of string. Manipulates
* string argument.
*/
string& rtrim(string& value) {
size_t last = value.find_last_not_of(" \n\r\t");
if (last != string::npos) {
value.erase(last+1);
}
return value;
}
int main(int argc, char** argv) {
//ifstream file("/tmp/twl06.txt");
ifstream file("/var/tmp/twl06.txt");
// build tree from correct words
BKTree tree("");
if (file.is_open() && !file.eof()) {
string line;
getline(file, line);
tree = BKTree(rtrim(line));
while (!file.eof()) {
getline(file, line);
tree.insert(rtrim(line));
}
file.close();
}
ifstream input(argv[1]);
if (input.is_open()) {
vector<string> tokens;
// read words from input file
copy(istream_iterator<string>(input), istream_iterator<string>(),
back_inserter<vector <string> >(tokens));
input.close();
// trim potential \r's
transform(tokens.begin(), tokens.end(), tokens.begin(), rtrim);
// convert tokens to upper case
for (vector<string>::iterator i = tokens.begin(); i != tokens.end(); i++) {
transform((*i).begin(), (*i).end(), (*i).begin(), (int(*)(int))toupper);
}
// mapp to distances from tree
vector<int> distances;
transform(tokens.begin(), tokens.end(), back_inserter<vector <int> >(distances), tree);
// sum distances
cout << accumulate(distances.begin(), distances.end(), 0) << endl;
}
}
/*
* Local Variables:
* compile-command: "g++ -O3 -o breathalyzer breathalyzer.cpp"
* End:
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment