|
package com.webserveis.app.jsoneliners.core |
|
|
|
import java.util.* |
|
|
|
/**
 * Phonetic comparison helpers built around a Metaphone-style encoding.
 *
 * [metaPhone] reduces a word to a short upper-case code made of consonant
 * sounds; [compareDoubleMetaphone] reports whether two words share the same code.
 */
object SimilarityStrings {

    /** Letters treated as vowels by the encoder. */
    private const val VOWELS = "AEIOU"

    /** Letters that soften a preceding C or G (C -> S, G -> J). */
    private const val FRONTV = "EIY"

    /** Letters after which a following H emits no code of its own (CH, SH, PH, TH, GH). */
    private const val VARSON = "CSPTG"

    /** Smallest code length [metaPhone] will produce a limit for. */
    private const val MIN_PHONEMES = 4

    /** Largest code length [metaPhone] will produce a limit for. */
    private const val MAX_PHONEMES = 32

    /**
     * Encodes [word] into its phonetic code.
     *
     * - `null` or blank input yields an empty string.
     * - Input that parses as an `Int` is encoded as if it were the literal text `"NAN"`.
     * - A single-character input is returned upper-cased, unchanged otherwise.
     *
     * @param word the text to encode; upper-cased with [Locale.ENGLISH] before processing.
     * @param _maxPhonemes maximum length of the returned code. Values outside `4..32`
     *   are clamped into that range.
     * @return the phonetic code, at most the clamped `_maxPhonemes` characters long.
     */
    fun metaPhone(word: String?, _maxPhonemes: Int = 4): String {
        // Clamp the requested length; in-range values must be honoured as given.
        val maxPhonemes = _maxPhonemes.coerceIn(MIN_PHONEMES, MAX_PHONEMES)

        if (word.isNullOrBlank()) return ""
        val str = if (isInteger(word)) "NAN" else word

        // A single character is its own code.
        if (str.length == 1) {
            return str.uppercase(Locale.ENGLISH)
        }

        val inwd: CharArray = str.uppercase(Locale.ENGLISH).toCharArray()
        val local = StringBuilder(40) // working copy with the initial-letter rules applied
        val code = StringBuilder(10) // output

        // Initial-letter exceptions.
        when (inwd[0]) {
            // KN, GN, PN: drop the first letter
            'K', 'G', 'P' -> {
                if (inwd[1] == 'N') {
                    local.append(inwd, 1, inwd.size - 1)
                } else {
                    local.append(inwd)
                }
            }
            // AE: drop the A
            'A' -> {
                if (inwd[1] == 'E') {
                    local.append(inwd, 1, inwd.size - 1)
                } else {
                    local.append(inwd)
                }
            }
            'W' -> {
                when (inwd[1]) {
                    // WR -> R
                    'R' -> {
                        local.append(inwd, 1, inwd.size - 1)
                    }
                    // WH -> W: copy from the H onwards, then overwrite that H with W
                    'H' -> {
                        local.append(inwd, 1, inwd.size - 1)
                        local.setCharAt(0, 'W')
                    }
                    else -> {
                        local.append(inwd)
                    }
                }
            }
            // initial X becomes S
            'X' -> {
                inwd[0] = 'S'
                local.append(inwd)
            }
            else -> {
                local.append(inwd)
            }
        }

        val wdsz = local.length
        var n = 0

        while (code.length < maxPhonemes && n < wdsz) {
            val symb = local[n]

            if (symb != 'C' && isPreviousChar(local, n, symb)) {
                // Doubled letter (other than C): skip the repeat.
                n++
            } else {
                when (symb) {
                    // Vowels are only kept when they start the word.
                    'A', 'E', 'I', 'O', 'U' -> {
                        if (n == 0) {
                            code.append(symb)
                        }
                    }
                    // B is dropped only in a word-final "MB".
                    'B' -> {
                        if (!isPreviousChar(local, n, 'M') || !isLastChar(wdsz, n)) {
                            code.append(symb)
                        }
                    }
                    'C' -> {
                        // Discard C in SCI, SCE, SCY.
                        if (!isPreviousChar(local, n, 'S') ||
                            isLastChar(wdsz, n) ||
                            FRONTV.indexOf(local[n + 1]) < 0
                        ) {
                            if (regionMatch(local, n, "CIA")) {
                                code.append('X') // CIA -> X
                            } else if (!isLastChar(wdsz, n) && FRONTV.indexOf(local[n + 1]) >= 0) {
                                code.append('S') // CE, CI, CY -> S
                            } else if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) {
                                code.append('K') // SCH -> SK
                            } else if (isNextChar(local, n, 'H')) {
                                // CH: K when it starts the word and the third letter is a vowel,
                                // X otherwise.
                                if (n == 0 && wdsz >= 3 && isVowel(local, 2)) {
                                    code.append('K')
                                } else {
                                    code.append('X')
                                }
                            } else {
                                code.append('K')
                            }
                        }
                    }
                    'D' -> {
                        if (!isLastChar(wdsz, n + 1) &&
                            isNextChar(local, n, 'G') &&
                            FRONTV.indexOf(local[n + 2]) >= 0
                        ) {
                            // DGE, DGI, DGY -> J; the G and the front vowel are consumed too.
                            code.append('J')
                            n += 2
                        } else {
                            code.append('T')
                        }
                    }
                    'G' -> {
                        // Word-final GH is silent.
                        if (!isLastChar(wdsz, n + 1) || !isNextChar(local, n, 'H')) {
                            // GH in the middle of a word is silent unless a vowel follows.
                            if (isLastChar(wdsz, n + 1) ||
                                !isNextChar(local, n, 'H') ||
                                isVowel(local, n + 2)
                            ) {
                                // Non-initial GN (and GNED) is silent.
                                if (n <= 0 ||
                                    !(regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))
                                ) {
                                    // A G preceded by another G never softens to J.
                                    val hard = isPreviousChar(local, n, 'G')
                                    if (!isLastChar(wdsz, n) &&
                                        FRONTV.indexOf(local[n + 1]) >= 0 &&
                                        !hard
                                    ) {
                                        code.append('J')
                                    } else {
                                        code.append('K')
                                    }
                                }
                            }
                        }
                    }
                    'H' -> {
                        // H is kept only before a vowel and not after C, S, P, T or G.
                        if (!isLastChar(wdsz, n)) {
                            if (n <= 0 || VARSON.indexOf(local[n - 1]) < 0) {
                                if (isVowel(local, n + 1)) {
                                    code.append('H')
                                }
                            }
                        }
                    }
                    'F', 'J', 'L', 'M', 'N', 'R' -> {
                        code.append(symb)
                    }
                    'K' -> {
                        if (n > 0) {
                            // CK -> the K is silent (the C already produced a code).
                            if (!isPreviousChar(local, n, 'C')) {
                                code.append(symb)
                            }
                        } else {
                            code.append(symb) // initial K
                        }
                    }
                    'P' -> {
                        if (isNextChar(local, n, 'H')) {
                            code.append('F') // PH -> F
                        } else {
                            code.append(symb)
                        }
                    }
                    'Q' -> {
                        code.append('K')
                    }
                    'S' -> {
                        if (regionMatch(local, n, "SH") ||
                            regionMatch(local, n, "SIO") ||
                            regionMatch(local, n, "SIA")
                        ) {
                            code.append('X')
                        } else {
                            code.append('S')
                        }
                    }
                    'T' -> {
                        if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
                            code.append('X')
                        } else if (regionMatch(local, n, "TCH")) {
                            // T is silent in "TCH".
                        } else if (regionMatch(local, n, "TH")) {
                            // The digit 0 stands in for the TH sound.
                            code.append('0')
                        } else {
                            code.append('T')
                        }
                    }
                    'V' -> {
                        code.append('F')
                    }
                    // W and Y are kept only before a vowel.
                    'W', 'Y' -> {
                        if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
                            code.append(symb)
                        }
                    }
                    'X' -> {
                        code.append('K')
                        code.append('S')
                    }
                    'Z' -> {
                        code.append('S')
                    }
                    else -> {
                        // Any other character contributes nothing to the code.
                    }
                }
                n++
            }

            // X appends two characters, so the code can overshoot the limit by one.
            if (code.length > maxPhonemes) {
                code.setLength(maxPhonemes)
            }
        }

        return code.toString()
    }

    /** Returns `true` when [str] is non-null and parses as an `Int`. */
    private fun isInteger(str: String?): Boolean = str?.toIntOrNull() != null

    /** Returns `true` when the character at [index] is one of [VOWELS]. */
    private fun isVowel(string: StringBuilder, index: Int): Boolean =
        VOWELS.indexOf(string[index]) >= 0

    /** Returns `true` when [index] has a predecessor inside [string] and that character is [c]. */
    private fun isPreviousChar(string: StringBuilder, index: Int, c: Char): Boolean =
        index > 0 && index < string.length && string[index - 1] == c

    /** Returns `true` when [index] has a successor inside [string] and that character is [c]. */
    private fun isNextChar(string: StringBuilder, index: Int, c: Char): Boolean =
        index >= 0 && index < string.length - 1 && string[index + 1] == c

    /** Returns `true` when [string] contains [test] starting exactly at [index]. */
    private fun regionMatch(string: StringBuilder, index: Int, test: String): Boolean {
        if (index < 0 || index + test.length - 1 >= string.length) return false
        return string.substring(index, index + test.length) == test
    }

    /** Returns `true` when position [n] is the last index of a sequence of length [wdsz]. */
    private fun isLastChar(wdsz: Int, n: Int): Boolean = n + 1 == wdsz

    /**
     * Returns `true` when [str1] and [str2] produce the same [metaPhone] code.
     *
     * Despite the name, the comparison uses the single code returned by [metaPhone].
     *
     * @param maxPhonemes code length limit forwarded to [metaPhone] for both inputs.
     */
    fun compareDoubleMetaphone(str1: String?, str2: String?, maxPhonemes: Int = 4): Boolean =
        metaPhone(str1, maxPhonemes) == metaPhone(str2, maxPhonemes)
}