Skip to content

Instantly share code, notes, and snippets.

@webserveis
Last active February 8, 2022 10:50
Show Gist options
  • Save webserveis/a599ba49cdb747ac3cfbc5959ef6815f to your computer and use it in GitHub Desktop.
Save webserveis/a599ba49cdb747ac3cfbc5959ef6815f to your computer and use it in GitHub Desktop.
String similary algoritms

https://github.com/tdebatty/java-string-similarity

metaphone

val pairsList: Array<Pair<String, String>> = arrayOf(
    "My String" to "my string", //comparison is case-insensitive
    "judge" to "juge", //typo
    "knock" to "nock", //silent letters
    "white" to "wite", //missing letters
    "record" to "record", //two different words in English but match the same
    "pair" to "pear", //	these match but are different words.
    "bookkeeper" to "book keeper", //spaces cause failures in comparison
    "test1" to "test123", //digits are not compared
    "the end." to "the end….", //punctuation differences do not matter.
    "a elephant" to "an elephant", //a and an are treated differently.
    "color" to "kolor"
)

pairsList.forEach {
    val (str1, str2) = it

    val metaPhone1 = SimilarityStrings.metaPhone(str1)
    val metaPhone2 = SimilarityStrings.metaPhone(str2)

    val doubleMetaphone = Pair(metaPhone1, metaPhone2)

    Log.d(TAG, "similarity metaphone: " + str1 + doubleMetaphone + str2 + " (" + SimilarityStrings.compareDoubleMetaphone(str1, str2) + ")")
    Log.d(TAG, "similarity levenshtein distance: " + SimilarityStrings.getLevenshteinDistance(str1, str2))
    Log.d(TAG, "similarity Jaron Winkler distance: " + SimilarityStrings.getJaroWinklerDistance(str1, str2))

}
public class JaroWinklerScore {
/**
* Applies the Jaro-Winkler distance algorithm to the given strings, providing information about the
* similarity of them.
*
* @param s1 The first string that gets compared. May be <code>null</node> or empty.
* @param s2 The second string that gets compared. May be <code>null</node> or empty.
* @return The Jaro-Winkler score (between 0.0 and 1.0), with a higher value indicating larger similarity.
*
* @author Thomas Trojer <[email protected]>
*/
public static double compute(final String s1, final String s2) {
// lowest score on empty strings
if (s1 == null || s2 == null || s1.isEmpty() || s2.isEmpty()) {
return 0;
}
// highest score on equal strings
if (s1.equals(s2)) {
return 1;
}
// some score on different strings
int prefixMatch = 0; // exact prefix matches
int matches = 0; // matches (including prefix and ones requiring transpostion)
int transpositions = 0; // matching characters that are not aligned but close together
int maxLength = Math.max(s1.length(), s2.length());
int maxMatchDistance = Math.max((int) Math.floor(maxLength / 2.0) - 1, 0); // look-ahead/-behind to limit transposed matches
// comparison
final String shorter = s1.length() < s2.length() ? s1 : s2;
final String longer = s1.length() >= s2.length() ? s1 : s2;
for (int i = 0; i < shorter.length(); i++) {
// check for exact matches
boolean match = shorter.charAt(i) == longer.charAt(i);
if (match) {
if (i < 4) {
// prefix match (of at most 4 characters, as described by the algorithm)
prefixMatch++;
}
matches++;
continue;
}
// check fro transposed matches
for (int j = Math.max(i - maxMatchDistance, 0); j < Math.min(i + maxMatchDistance, longer.length()); j++) {
if (i == j) {
// case already covered
continue;
}
// transposition required to match?
match = shorter.charAt(i) == longer.charAt(j);
if (match) {
transpositions++;
break;
}
}
}
// any matching characters?
if (matches == 0) {
return 0;
}
// modify transpositions (according to the algorithm)
transpositions = (int) (transpositions / 2.0);
// non prefix-boosted score
double score = 0.3334 * (matches / (double) longer.length() + matches / (double) shorter.length() + (matches - transpositions)
/ (double) matches);
if (score < 0.7) {
return score;
}
// we already have a good match, hence we boost the score proportional to the common prefix
double boostedScore = score + prefixMatch * 0.1 * (1.0 - score);
return boostedScore;
}
}
fun levenshtein(lhs : CharSequence, rhs : CharSequence) : Int {
val lhsLength = lhs.length
val rhsLength = rhs.length
var cost = Array(lhsLength) { it }
var newCost = Array(lhsLength) { 0 }
for (i in 1..rhsLength-1) {
newCost[0] = i
for (j in 1..lhsLength-1) {
val match = if(lhs[j - 1] == rhs[i - 1]) 0 else 1
val costReplace = cost[j - 1] + match
val costInsert = cost[j] + 1
val costDelete = newCost[j - 1] + 1
newCost[j] = Math.min(Math.min(costInsert, costDelete), costReplace)
}
val swap = cost
cost = newCost
newCost = swap
}
return cost[lhsLength - 1]
}
package com.webserveis.app.jsoneliners.core
import java.util.*
object SimilarityStrings {
private const val VOWELS = "AEIOU"
private const val FRONTV = "EIY"
private const val VARSON = "CSPTG"
fun metaPhone(word: String?, _maxPhonemes: Int = 4): String {
var maxPhonemes = 4
if (_maxPhonemes < 4) maxPhonemes = 4 else if (_maxPhonemes > 32) maxPhonemes = 32
var hard: Boolean
var str: String? = word ?: return ""
if (str.isNullOrBlank()) return ""
if (isInteger(str)) str = "NAN"
val strLength = str.length
// single character is itself
if (strLength == 1) {
return str.uppercase(Locale.ENGLISH)
}
val inwd: CharArray = str.uppercase(Locale.ENGLISH).toCharArray()
val local = StringBuilder(40) // manipulate
val code = StringBuilder(10) // output
when (inwd[0]) {
/* looking for KN, etc*/
'K', 'G', 'P' -> if (inwd[1] == 'N') {
local.append(inwd, 1, inwd.size - 1)
} else {
local.append(inwd)
}
/* looking for AE */
'A' -> if (inwd[1] == 'E') {
local.append(inwd, 1, inwd.size - 1)
} else {
local.append(inwd)
}
/* looking for WR or WH */
'W' -> {
when {
inwd[1] == 'R' -> { // WR -> R
local.append(inwd, 1, inwd.size - 1)
}
inwd[1] == 'H' -> {
local.append(inwd, 1, inwd.size - 1)
local.setCharAt(0, 'W') // WH -> W
}
else -> {
local.append(inwd)
}
}
}
/* initial X becomes S */
'X' -> {
inwd[0] = 'S'
local.append(inwd)
}
else -> {
local.append(inwd)
}
}
val wdsz = local.length
var n = 0
while (code.length < maxPhonemes && n < wdsz) { // max code size of 4 works well
val symb = local[n]
// remove duplicate letters except C
if (symb != 'C' && isPreviousChar(local, n, symb)) {
n++
} else { // not dup
when (symb) {
'A', 'E', 'I', 'O', 'U' -> if (n == 0) {
code.append(symb)
}
'B' -> {
if (!isPreviousChar(local, n, 'M') || !isLastChar(
wdsz,
n
)
) code.append(symb)
}
'C' -> {
/* discard if SCI, SCE or SCY */
if (!isPreviousChar(local, n, 'S') || isLastChar(wdsz, n) || FRONTV.indexOf(local[n + 1]) < 0) {
if (regionMatch(local, n, "CIA")) { // "CIA" -> X
code.append('X')
} else if (!isLastChar(wdsz, n) && FRONTV.indexOf(local[n + 1]) >= 0) {
code.append('S')
} else if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
code.append('K')
} else if (isNextChar(local, n, 'H')) { // detect CH
if (n == 0 && wdsz >= 3 && isVowel(local, 2)) { // CH consonant -> K consonant
code.append('K')
} else {
code.append('X') // CHvowel -> X
}
} else {
code.append('K')
}
}
}
'D' -> if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local[n + 2]) >= 0) { // DGE DGI DGY -> J
code.append('J')
n += 2
} else {
code.append('T')
}
'G' -> {
if (!isLastChar(wdsz, n + 1) || !isNextChar(local, n, 'H')) {
if (isLastChar(wdsz, n + 1) || !isNextChar(local, n, 'H') || isVowel(local, n + 2)) {
if (n <= 0 || !(regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
hard = isPreviousChar(local, n, 'G')
if (!isLastChar(wdsz, n) && FRONTV.indexOf(local[n + 1]) >= 0 && !hard) {
code.append('J')
} else {
code.append('K')
}
}
}
}
}
'H' -> {
if (!isLastChar(wdsz, n)) {
if (n <= 0 || VARSON.indexOf(local[n - 1]) < 0) {
if (isVowel(local, n + 1)) {
code.append('H') // Hvowel
}
}
}
}
'F', 'J', 'L', 'M', 'N', 'R' -> code.append(symb)
'K' -> if (n > 0) { // not initial
if (!isPreviousChar(local, n, 'C')) {
code.append(symb)
}
} else {
code.append(symb) // initial K
}
'P' -> if (isNextChar(local, n, 'H')) {
// PH -> F
code.append('F')
} else {
code.append(symb)
}
'Q' -> code.append('K')
'S' -> if (regionMatch(local, n, "SH") ||
regionMatch(local, n, "SIO") ||
regionMatch(local, n, "SIA")
) {
code.append('X')
} else {
code.append('S')
}
'T' -> {
if (regionMatch(local, n, "TIA") ||
regionMatch(local, n, "TIO")
) {
code.append('X')
} else if (regionMatch(local, n, "TCH")) {
// Silent if in "TCH"
} else if (regionMatch(local, n, "TH")) {
// substitute numeral 0 for TH (resembles theta after all)
code.append('0')
} else {
code.append('T')
}
}
'V' -> code.append('F')
'W', 'Y' -> if (!isLastChar(wdsz, n) &&
isVowel(local, n + 1)
) {
code.append(symb)
}
'X' -> {
code.append('K')
code.append('S')
}
'Z' -> code.append('S')
else -> {
}
}
n++
} // end else from symb != 'C'
if (code.length > maxPhonemes) {
code.setLength(maxPhonemes)
}
}
return code.toString()
}
private fun isInteger(str: String?) = str?.toIntOrNull()?.let { true } ?: false
private fun isVowel(string: java.lang.StringBuilder, index: Int): Boolean {
return VOWELS.indexOf(string[index]) >= 0
}
private fun isPreviousChar(string: java.lang.StringBuilder, index: Int, c: Char): Boolean {
var matches = false
if (index > 0 &&
index < string.length
) {
matches = string[index - 1] == c
}
return matches
}
private fun isNextChar(string: java.lang.StringBuilder, index: Int, c: Char): Boolean {
var matches = false
if (index >= 0 &&
index < string.length - 1
) {
matches = string[index + 1] == c
}
return matches
}
private fun regionMatch(string: java.lang.StringBuilder, index: Int, test: String): Boolean {
var matches = false
if (index >= 0 &&
index + test.length - 1 < string.length
) {
val substring = string.substring(index, index + test.length)
matches = substring == test
}
return matches
}
private fun isLastChar(wdsz: Int, n: Int): Boolean {
return n + 1 == wdsz
}
fun compareDoubleMetaphone(str1: String?, str2: String?, maxPhonemes: Int = 4): Boolean = metaPhone(str1, maxPhonemes) == metaPhone(str2, maxPhonemes)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment