Last active
December 20, 2015 22:09
-
-
Save PaulBGD/6202963 to your computer and use it in GitHub Desktop.
Simple Fuzzy Search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public String findMatch(String arg, List<String> words, boolean ignoreRepeat) { | |
// Where the final word will be stored. | |
String word = null; | |
/* | |
* Sometimes two words will equally match (rarely). If we find one that matches, we | |
* increase this by one. Just in case the person wants it to return null, we have the | |
* ignoreRepeat. | |
*/ | |
int repeated = 0; | |
for (String s : words) { | |
/* | |
* The magic behind this all. getLevenshteinDistance() compares two strings and | |
* returns how many different letters they have. So like fall and fell would | |
* return 1. The lower the number, the closer the match. | |
*/ | |
int low = getLevenshteinDistance(s, arg); | |
// We have to have something to compare to | |
int lowest = getLevenshteinDistance(word, arg); | |
/* | |
* If this word is lower than the last, set all the variables to it. We also | |
* reset the repeated, as there are no repeats for this number. | |
*/ | |
if (low < lowest) { | |
word = s; | |
repeated = 0; | |
lowest = low; | |
} else if (lowest == low) { | |
/* | |
* Now we check the lengths. Lets say you have the word "flop" and are | |
* matching it to "fly" and "op" (in a list in that order). Since they both | |
* are two letters away and op is last, it would pick op (which I believe fly | |
* would make more sense). We now compare which is the closest length to the | |
* word being checked. | |
*/ | |
int wLength = (word.length() > arg.length() ? word.length() - arg.length() : arg | |
.length() - word.length()); | |
int dLength = (s.length() > arg.length() ? s.length() - arg.length() : arg.length() | |
- s.length()); | |
if (dLength < wLength) { | |
word = s; | |
repeated = 0; | |
lowest = low; | |
} else { | |
/* | |
* Another check. This checks if the string we are checking contains the | |
* search directly. | |
*/ | |
if (s.contains(arg) && !word.contains(arg)) { | |
word = s; | |
repeated = 0; | |
lowest = low; | |
/* | |
* We switch it the other way around now. | |
*/ | |
} else if (arg.contains(s) && !arg.contains(word)) { | |
word = s; | |
repeated = 0; | |
lowest = low; | |
} else { | |
/* | |
* Worst comes to worst, this ties with another word. Just in case | |
* someone doesn't want this, we make sure to have repeated not equal to | |
* 0. | |
*/ | |
repeated++; | |
} | |
} | |
} | |
} | |
/* | |
* As I have said above, some people don't want a repeat. In this case, I return | |
* null. | |
*/ | |
if (repeated != 0 && ignoreRepeat) { | |
return null; | |
} else { | |
return word; | |
} | |
} | |
private int getLevenshteinDistance(String s, String t) { | |
if (s == null || t == null) { | |
return 0; | |
} | |
int n = s.length(); | |
int m = t.length(); | |
if (n == 0) { | |
return m; | |
} else if (m == 0) { | |
return n; | |
} | |
if (n > m) { | |
String tmp = s; | |
s = t; | |
t = tmp; | |
n = m; | |
m = t.length(); | |
} | |
int p[] = new int[n + 1]; | |
int d[] = new int[n + 1]; | |
int _d[]; | |
int i, j, cost; | |
char t_j; | |
for (i = 0; i <= n; i++) { | |
p[i] = i; | |
} | |
for (j = 1; j <= m; j++) { | |
t_j = t.charAt(j - 1); | |
d[0] = j; | |
for (i = 1; i <= n; i++) { | |
cost = s.charAt(i - 1) == t_j ? 0 : 1; | |
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); | |
} | |
_d = p; | |
p = d; | |
d = _d; | |
} | |
return p[n]; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment