Last active
July 19, 2018 09:56
-
-
Save kumailxp/b728edeaf79753ed227e7eb0455cd3fe to your computer and use it in GitHub Desktop.
calculate levenschtein distance and find possible match in vector
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<iostream> | |
#include<vector> | |
#include<map> | |
#include<algorithm> | |
#include<iterator> | |
#include <fstream> | |
#include <cstdlib> | |
#include <chrono> | |
using namespace std; | |
class Timer | |
{ | |
// make things readable | |
using clk = std::chrono::steady_clock; | |
clk::time_point b; // begin | |
clk::time_point e; // end | |
public: | |
void clear() { b = e = clk::now(); } | |
void start() { b = clk::now(); } | |
void stop() { e = clk::now(); } | |
friend std::ostream& operator<<(std::ostream& o, const Timer& timer) | |
{ | |
return o << timer.secs(); | |
} | |
// return time difference in seconds | |
double secs() const | |
{ | |
if(e <= b) | |
return 0.0; | |
auto d = std::chrono::duration_cast<std::chrono::microseconds>(e - b); | |
return d.count() / 1000000.0; | |
} | |
}; | |
// len_s and len_t are the number of characters in string s and t respectively | |
unsigned int LevenshteinDistance(string s, unsigned int len_s, string t, unsigned int len_t) | |
{ | |
/* base case: empty strings */ | |
if (len_s == 0) return len_t; | |
if (len_t == 0) return len_s; | |
/* test if last characters of the strings match */ | |
int cost = (s[len_s-1] == t[len_t-1]) ? 0 : 1; | |
/* return minimum of delete char from s, delete char from t, and delete char from both */ | |
return min(LevenshteinDistance(s, len_s - 1, t, len_t ) + 1, | |
min(LevenshteinDistance(s, len_s , t, len_t - 1) + 1, | |
LevenshteinDistance(s, len_s - 1, t, len_t - 1) + cost)); | |
} | |
// len_s and len_t are the number of characters in string s and t respectively | |
unsigned int LevenshteinDistance_optmized(string s, unsigned int len_s, string t, unsigned int len_t) | |
{ | |
/* base case: empty strings */ | |
if (len_s == 0) return len_t; | |
if (len_t == 0) return len_s; | |
len_s = s.size(); | |
len_t = t.size(); | |
unsigned int d[len_s][len_t] = {0}; | |
for(unsigned int i = 1; i < len_s; i++) { | |
d[i][0] = i; | |
} | |
for(unsigned int j = 1; j < len_t; j++) { | |
d[0][j] = j; | |
} | |
unsigned int subCost = 0; | |
for(unsigned j = 1; j < len_t; j++) { | |
for(unsigned int i = 1; i < len_s; i++) { | |
if(s.at(i) == t.at(j)) | |
subCost = 0; | |
else | |
subCost = 1; | |
d[i][j] = min(d[i-1][j] + 1, min(d[i][j-1] + 1, d[i-1][j-1] + subCost)); | |
} | |
} | |
return d[len_s-1][len_t-1]; | |
} | |
static bool validateUserInput(string input) { | |
unsigned int i = 0; | |
while(i < input.size()) { | |
if(!isalnum(input.at(i))) { | |
cout << "invalid characters!" << endl; | |
return false; | |
} | |
i++; | |
} | |
if(input.empty()) | |
return false; | |
return true; | |
} | |
static void loadWordListFile(const string filename, vector<string>& listofWords) { | |
ifstream fin(filename.c_str()); | |
if(!fin) { | |
cout << "error: can\'t open the file!" << endl; | |
abort(); | |
} | |
string data; | |
while(getline(fin, data)) { | |
listofWords.emplace_back(data); | |
} | |
} | |
int main() { | |
//cout << "Lev distance: " | |
//<< LevenshteinDistance("Kitten", 6, "sitting", 7) << endl; | |
// create a vector of random | |
//vector<string> listofWords = {"hello", "might", "power", "wonder"}; | |
vector<string> listofWords; | |
loadWordListFile("words.txt", listofWords); | |
//cout << "Select one of the following words:" << endl; | |
//copy(listofWords.begin(), listofWords.end(), ostream_iterator<string>(cout, "\n")); | |
string userChoice; | |
cout << endl << endl <<"please type a word: " << endl; | |
if (!std::getline(std::cin, userChoice)) { | |
cout << "Invalid input" << endl; | |
return -1; | |
} | |
if(!validateUserInput(userChoice)) { | |
cout << "invalid input" << endl; | |
return -1; | |
} | |
if (find (listofWords.begin(), listofWords.end(), userChoice) != listofWords.end()) { | |
cout << "You entered an exactly correct word" << endl; | |
return 0; | |
} | |
Timer timer; | |
vector < pair<string, unsigned int> >lvDistMap; | |
timer.start(); | |
transform(listofWords.begin(), listofWords.end(), | |
back_inserter(lvDistMap), [userChoice](string I) { | |
unsigned int lvDist = | |
LevenshteinDistance_optmized(userChoice, userChoice.size(), I, I.size()); | |
return make_pair(I, lvDist); | |
}); | |
timer.stop(); | |
cout << "Time taken to find Levenshtein distance: " << timer << " secs" << endl; | |
cout << "====================================" << endl; | |
#if 0 | |
cout << "==============================" << endl; | |
cout << "Sort and print the whole table" << endl; | |
cout << "==============================" << endl; | |
sort(lvDistMap.begin(), lvDistMap.end(), | |
[](const pair<string, unsigned int>& lhs, | |
const pair<string, unsigned int>& rhs) { | |
return lhs.second < rhs.second; | |
}); | |
for(auto& I : lvDistMap) { | |
cout << "\'" << I.first << "\' has a Levenshtein distance of: " << I.second | |
<< " relative to \'" << userChoice << "\'" << endl; | |
} | |
cout << "==============================" << endl; | |
cout << "print minimum distance only" << endl; | |
cout << "==============================" << endl; | |
#endif | |
auto result = min_element(lvDistMap.begin(), lvDistMap.end(), | |
[](const pair<string, unsigned int>& lhs, | |
const pair<string, unsigned int>& rhs) { | |
return lhs.second < rhs.second; | |
}); | |
if(result->second <= 5) | |
cout << "Perhaps you wanted to type \'" << result->first << "\'" << endl; | |
else | |
cout << "Can't find a close match" << endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment