Created
June 10, 2015 20:17
-
-
Save freemo/c13f91b45658ef229243 to your computer and use it in GitHub Desktop.
A utility class that uses various methods to determine the similarity between two strings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.commons.lang3.StringUtils; | |
import java.util.ArrayList; | |
import java.util.List; | |
public final class StringSimilarityUtils { | |
private StringSimilarityUtils() { | |
throw new IllegalStateException("Should not be able to instantiate this class"); | |
} | |
public static int levenshteinDistance(final String original, final String modified) { | |
if(original == null) | |
throw new IllegalArgumentException("original can not be null"); | |
if(modified == null) | |
throw new IllegalArgumentException("modified can not be null"); | |
return StringUtils.getLevenshteinDistance(original, modified); | |
} | |
public static int levenshteinDistance(final String original, final String modified, final int threshold) throws ThresholdExceeded { | |
if(original == null) | |
throw new IllegalArgumentException("original can not be null"); | |
if(modified == null) | |
throw new IllegalArgumentException("modified can not be null"); | |
if(threshold < 0) | |
throw new IllegalArgumentException("threshold can not be a negative number"); | |
int retVal = StringUtils.getLevenshteinDistance(original, modified, threshold); | |
if(retVal < 0) | |
throw new ThresholdExceeded("Distance exceeded threshold limit"); | |
return retVal; | |
} | |
public static double levenshteinDistanceRatio(final String original, final String modified) { | |
if(original == null) | |
throw new IllegalArgumentException("original can not be null"); | |
if(modified == null) | |
throw new IllegalArgumentException("modified can not be null"); | |
return levenshteinDistance(original, modified) / ((double)original.length()); | |
} | |
public static double levenshteinDistanceRatio(final String original, final String modified, final double threshold) throws ThresholdExceeded { | |
if(original == null) | |
throw new IllegalArgumentException("original can not be null"); | |
if(modified == null) | |
throw new IllegalArgumentException("modified can not be null"); | |
final int absoluteThreshold = Double.valueOf(Math.ceil(original.length() * threshold)).intValue(); | |
final int distance = levenshteinDistance(original, modified, absoluteThreshold); | |
return distance / ((double)original.length()); | |
} | |
public static boolean isSimilarByWordGroup(final String original, final String modified, final int wordGroupThreshold) { | |
if(original == null) | |
throw new IllegalArgumentException("original can not be null"); | |
if(modified == null) | |
throw new IllegalArgumentException("modified can not be null"); | |
if(wordGroupThreshold <0) | |
throw new IllegalArgumentException("wordGroupThreshold can not be a negative value"); | |
//Since the Levenshtein distance between the two is above the threshold value lets compare word sequences. If a | |
//word sequence containing the number of words int he threshold (or more) appear in the original then it will | |
//be considered a derivative. | |
final String originalStripped = original.replaceAll("[^A-Za-z0-9 ]", "").replaceAll("[ ]+", " ").toLowerCase(); | |
final String[] modifiedWords = modified.replaceAll("[^A-Za-z0-9 ]", "").replaceAll("[ ]+", " ").toLowerCase().split(" "); | |
final List<String> modifiedWordGroups = new ArrayList<>(); | |
for(int wordIndex = 0; wordIndex < modifiedWords.length - (wordGroupThreshold - 1); wordIndex++) { | |
final StringBuilder wordGroup = new StringBuilder(modifiedWords[wordIndex]); | |
for(int wordCount = 0; wordCount < (wordGroupThreshold - 1); wordCount++) { | |
wordGroup.append(" ").append(modifiedWords[wordIndex+wordCount+1]); | |
} | |
modifiedWordGroups.add(wordGroup.toString()); | |
} | |
for(String modifiedWordGroup : modifiedWordGroups) | |
if(originalStripped.contains(modifiedWordGroup)) | |
return true; | |
return false; | |
} | |
/** | |
* Determines how similar two strings are using the Levenshtein distance between the two and comparing it | |
* against a threshold value, as well as checking contigous word blocks. | |
*/ | |
public static boolean isSimilarByDistanceAndWordGroup(final String original, final String modified, final double distanceThreshold, final int wordGroupThreshold) { | |
if(original == null) | |
throw new IllegalArgumentException("original can not be null"); | |
if(modified == null) | |
throw new IllegalArgumentException("modified can not be null"); | |
if(wordGroupThreshold <0) | |
throw new IllegalArgumentException("wordGroupThreshold can not be a negative value"); | |
if((distanceThreshold < 0d) || (distanceThreshold > 1d) ) | |
throw new IllegalArgumentException("distanceThreshold must be a value between 0.0 and 1.0 inclusive"); | |
//First lets use Levenshtein distance to calculate the distance (similarlity) between the two strings. A 0 | |
//inidcates they are identical, a number greater than 0 inidicates the number of edits needed to convert one | |
//string into the other. This is then converted into a percentage of the total number of characters in the | |
//original string. If this number is less thant he threshold then we can simply assume it is a derivative and | |
//return. Otherwise we need to resoort to a more rigerous comparison. | |
try { | |
levenshteinDistanceRatio(original, modified, distanceThreshold); | |
} catch (ThresholdExceeded thresholdExceeded) { | |
return isSimilarByWordGroup(original, modified, wordGroupThreshold); | |
} | |
return true; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public class ThresholdExceeded extends Exception { | |
public ThresholdExceeded() { | |
} | |
public ThresholdExceeded(String s) { | |
super(s); | |
} | |
public ThresholdExceeded(String s, Throwable throwable) { | |
super(s, throwable); | |
} | |
public ThresholdExceeded(Throwable throwable) { | |
super(throwable); | |
} | |
public ThresholdExceeded(String s, Throwable throwable, boolean b, boolean b1) { | |
super(s, throwable, b, b1); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment