Created
November 14, 2013 15:15
-
-
Save richardkundl/7468541 to your computer and use it in GitHub Desktop.
Get Sift3 distance from two words. Simmilar to the Levenshtein distance, but it's four time faster. Verifying that(http://norvig.com/ngrams/) data sets.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary> | |
/// Get Sift3 distance from two words. | |
/// (eg: spell checking) | |
/// </summary> | |
/// <param name="source"> source string </param> | |
/// <param name="actual"> actual string </param> | |
/// <param name="maxOffset"> max offset </param> | |
/// <returns> The distance of the two words. </returns> | |
public static float GetSift3Distance(string source, string actual, int maxOffset) | |
{ | |
if (string.IsNullOrEmpty(source)) | |
{ | |
return string.IsNullOrEmpty(actual) ? 0 : actual.Length; | |
} | |
if (string.IsNullOrEmpty(actual)) | |
{ | |
return source.Length; | |
} | |
int c = 0; | |
int offset1 = 0; | |
int offset2 = 0; | |
int lcs = 0; | |
while ((c + offset1 < source.Length) && (c + offset2 < actual.Length)) | |
{ | |
if (source[c + offset1] == actual[c + offset2]) | |
{ | |
lcs++; | |
} | |
else | |
{ | |
offset1 = 0; | |
offset2 = 0; | |
for (int i = 0; i < maxOffset; i++) | |
{ | |
if ((c + i < source.Length) && (source[c + i] == actual[c])) | |
{ | |
offset1 = i; | |
break; | |
} | |
if ((c + i < actual.Length) && (source[c] == actual[c + i])) | |
{ | |
offset2 = i; | |
break; | |
} | |
} | |
} | |
c++; | |
} | |
return ((source.Length + actual.Length) / 2) - lcs; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment