webserveis · February 8, 2022 10:50
diff --git a/readme.md b/readme.md
diff --git a/JaroWinklerScore.java b/JaroWinklerScore.java
 public class JaroWinklerScore {
    /**
 	 * Applies the Jaro-Winkler distance algorithm to the given strings, providing information about the
 	 * similarity of them.
 	 * 
 	 * @param s1 The first string that gets compared. May be <code>null</node> or empty.
 	 * @param s2 The second string that gets compared. May be <code>null</node> or empty.
 	 * @return The Jaro-Winkler score (between 0.0 and 1.0), with a higher value indicating larger similarity.
 	 * 
 	 * @author Thomas Trojer <[email protected]>
 	 */
 	public static double compute(final String s1, final String s2) {
 		// lowest score on empty strings
 		if (s1 == null || s2 == null || s1.isEmpty() || s2.isEmpty()) {
 			return 0;
 		}
 		// highest score on equal strings
 		if (s1.equals(s2)) {
 			return 1;
 		}
 		// some score on different strings
 		int prefixMatch = 0; // exact prefix matches
 		int matches = 0; // matches (including prefix and ones requiring transpostion)
 		int transpositions = 0; // matching characters that are not aligned but close together 
 		int maxLength = Math.max(s1.length(), s2.length());
 		int maxMatchDistance = Math.max((int) Math.floor(maxLength / 2.0) - 1, 0); // look-ahead/-behind to limit transposed matches
 		// comparison
 		final String shorter = s1.length() < s2.length() ? s1 : s2;
 		final String longer = s1.length() >= s2.length() ? s1 : s2;
 		for (int i = 0; i < shorter.length(); i++) {
 		    // check for exact matches
 			boolean match = shorter.charAt(i) == longer.charAt(i);
 			if (match) {
 				if (i < 4) {
 					// prefix match (of at most 4 characters, as described by the algorithm)
 					prefixMatch++;
 				}
 				matches++;
 				continue;
 			}
 			// check fro transposed matches
 			for (int j = Math.max(i - maxMatchDistance, 0); j < Math.min(i + maxMatchDistance, longer.length()); j++) {
 				if (i == j) {
 					// case already covered
 					continue;
 				}
 				// transposition required to match?
 				match = shorter.charAt(i) == longer.charAt(j);
 				if (match) {
 					transpositions++;
 					break;
 				}
 			}
 		}
 		// any matching characters?
 		if (matches == 0) {
 			return 0;
 		}
 		// modify transpositions (according to the algorithm)
 		transpositions = (int) (transpositions / 2.0);
 		// non prefix-boosted score
 		double score = 0.3334 * (matches / (double) longer.length() + matches / (double) shorter.length() + (matches - transpositions)
 				/ (double) matches);
 		if (score < 0.7) {
 			return score;
 		}
 		// we already have a good match, hence we boost the score proportional to the common prefix
 		double boostedScore = score + prefixMatch * 0.1 * (1.0 - score);
 		return boostedScore;
 	}
 }

diff --git a/levenshtein.kt b/levenshtein.kt
 fun levenshtein(lhs : CharSequence, rhs : CharSequence) : Int {
    val lhsLength = lhs.length
    val rhsLength = rhs.length

    var cost = Array(lhsLength) { it }
    var newCost = Array(lhsLength) { 0 }

    for (i in 1..rhsLength-1) {
        newCost[0] = i

        for (j in 1..lhsLength-1) {
            val match = if(lhs[j - 1] == rhs[i - 1]) 0 else 1

            val costReplace = cost[j - 1] + match
            val costInsert = cost[j] + 1
            val costDelete = newCost[j - 1] + 1

            newCost[j] = Math.min(Math.min(costInsert, costDelete), costReplace)
        }

        val swap = cost
        cost = newCost
        newCost = swap
    }

    return cost[lhsLength - 1]
 }
diff --git a/SimilarityStrings.kt b/SimilarityStrings.kt
 package com.webserveis.app.jsoneliners.core

 import java.util.*

 object SimilarityStrings {

    private const val VOWELS = "AEIOU"
    private const val FRONTV = "EIY"
    private const val VARSON = "CSPTG"

    fun metaPhone(word: String?, _maxPhonemes: Int = 4): String {

        var maxPhonemes = 4
        if (_maxPhonemes < 4) maxPhonemes = 4 else if (_maxPhonemes > 32) maxPhonemes = 32

        var hard: Boolean
        var str: String? = word ?: return ""

        if (str.isNullOrBlank()) return ""
        if (isInteger(str)) str = "NAN"

        val strLength = str.length

        // single character is itself
        if (strLength == 1) {
            return str.uppercase(Locale.ENGLISH)
        }

        val inwd: CharArray = str.uppercase(Locale.ENGLISH).toCharArray()
        val local = StringBuilder(40) // manipulate
        val code = StringBuilder(10) //   output

        when (inwd[0]) {

            /* looking for KN, etc*/
            'K', 'G', 'P' -> if (inwd[1] == 'N') {
                local.append(inwd, 1, inwd.size - 1)
            } else {
                local.append(inwd)
            }
            /* looking for AE */
            'A' -> if (inwd[1] == 'E') {
                local.append(inwd, 1, inwd.size - 1)
            } else {
                local.append(inwd)
            }
            /* looking for WR or WH */
            'W' -> {
                when {
                    inwd[1] == 'R' -> {   // WR -> R
                        local.append(inwd, 1, inwd.size - 1)
                    }
                    inwd[1] == 'H' -> {
                        local.append(inwd, 1, inwd.size - 1)
                        local.setCharAt(0, 'W') // WH -> W
                    }
                    else -> {
                        local.append(inwd)
                    }
                }
            }
            /* initial X becomes S */
            'X' -> {
                inwd[0] = 'S'
                local.append(inwd)
            }
            else -> {
                local.append(inwd)
            }
        }

        val wdsz = local.length
        var n = 0

        while (code.length < maxPhonemes && n < wdsz) { // max code size of 4 works well
            val symb = local[n]
            // remove duplicate letters except C
            if (symb != 'C' && isPreviousChar(local, n, symb)) {
                n++
            } else { // not dup
                when (symb) {
                    'A', 'E', 'I', 'O', 'U' -> if (n == 0) {
                        code.append(symb)
                    }
                    'B' -> {
                        if (!isPreviousChar(local, n, 'M') || !isLastChar(
                                wdsz,
                                n
                            )
                        ) code.append(symb)
                    }
                    'C' -> {
                        /* discard if SCI, SCE or SCY */
                        if (!isPreviousChar(local, n, 'S') || isLastChar(wdsz, n) || FRONTV.indexOf(local[n + 1]) < 0) {
                            if (regionMatch(local, n, "CIA")) { // "CIA" -> X
                                code.append('X')
                            } else if (!isLastChar(wdsz, n) && FRONTV.indexOf(local[n + 1]) >= 0) {
                                code.append('S')
                            } else if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
                                code.append('K')
                            } else if (isNextChar(local, n, 'H')) { // detect CH
                                if (n == 0 && wdsz >= 3 && isVowel(local, 2)) { // CH consonant -> K consonant
                                    code.append('K')
                                } else {
                                    code.append('X') // CHvowel -> X
                                }
                            } else {
                                code.append('K')
                            }
                        }
                    }
                    'D' -> if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local[n + 2]) >= 0) { // DGE DGI DGY -> J
                        code.append('J')
                        n += 2
                    } else {
                        code.append('T')
                    }
                    'G' -> {
                        if (!isLastChar(wdsz, n + 1) || !isNextChar(local, n, 'H')) {
                            if (isLastChar(wdsz, n + 1) || !isNextChar(local, n, 'H') || isVowel(local, n + 2)) {
                                if (n <= 0 || !(regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
                                    hard = isPreviousChar(local, n, 'G')
                                    if (!isLastChar(wdsz, n) && FRONTV.indexOf(local[n + 1]) >= 0 && !hard) {
                                        code.append('J')
                                    } else {
                                        code.append('K')
                                    }
                                }
                            }
                        }
                    }
                    'H' -> {
                        if (!isLastChar(wdsz, n)) {
                            if (n <= 0 || VARSON.indexOf(local[n - 1]) < 0) {
                                if (isVowel(local, n + 1)) {
                                    code.append('H') // Hvowel
                                }
                            }

                        }
                    }
                    'F', 'J', 'L', 'M', 'N', 'R' -> code.append(symb)
                    'K' -> if (n > 0) { // not initial
                        if (!isPreviousChar(local, n, 'C')) {
                            code.append(symb)
                        }
                    } else {
                        code.append(symb) // initial K
                    }
                    'P' -> if (isNextChar(local, n, 'H')) {
                        // PH -> F
                        code.append('F')
                    } else {
                        code.append(symb)
                    }
                    'Q' -> code.append('K')
                    'S' -> if (regionMatch(local, n, "SH") ||
                        regionMatch(local, n, "SIO") ||
                        regionMatch(local, n, "SIA")
                    ) {
                        code.append('X')
                    } else {
                        code.append('S')
                    }
                    'T' -> {
                        if (regionMatch(local, n, "TIA") ||
                            regionMatch(local, n, "TIO")
                        ) {
                            code.append('X')

                        } else if (regionMatch(local, n, "TCH")) {
                            // Silent if in "TCH"
                        } else if (regionMatch(local, n, "TH")) {
                            // substitute numeral 0 for TH (resembles theta after all)
                            code.append('0')
                        } else {
                            code.append('T')
                        }
                    }
                    'V' -> code.append('F')
                    'W', 'Y' -> if (!isLastChar(wdsz, n) &&
                        isVowel(local, n + 1)
                    ) {
                        code.append(symb)
                    }
                    'X' -> {
                        code.append('K')
                        code.append('S')
                    }
                    'Z' -> code.append('S')
                    else -> {
                    }
                }
                n++
            } // end else from symb != 'C'
            if (code.length > maxPhonemes) {
                code.setLength(maxPhonemes)
            }
        }
        return code.toString()
    }

    private fun isInteger(str: String?) = str?.toIntOrNull()?.let { true } ?: false

    private fun isVowel(string: java.lang.StringBuilder, index: Int): Boolean {
        return VOWELS.indexOf(string[index]) >= 0
    }

    private fun isPreviousChar(string: java.lang.StringBuilder, index: Int, c: Char): Boolean {
        var matches = false
        if (index > 0 &&
            index < string.length
        ) {
            matches = string[index - 1] == c
        }
        return matches
    }

    private fun isNextChar(string: java.lang.StringBuilder, index: Int, c: Char): Boolean {
        var matches = false
        if (index >= 0 &&
            index < string.length - 1
        ) {
            matches = string[index + 1] == c
        }
        return matches
    }

    private fun regionMatch(string: java.lang.StringBuilder, index: Int, test: String): Boolean {
        var matches = false
        if (index >= 0 &&
            index + test.length - 1 < string.length
        ) {
            val substring = string.substring(index, index + test.length)
            matches = substring == test
        }
        return matches
    }

    private fun isLastChar(wdsz: Int, n: Int): Boolean {
        return n + 1 == wdsz
    }

    fun compareDoubleMetaphone(str1: String?, str2: String?, maxPhonemes: Int = 4): Boolean = metaPhone(str1, maxPhonemes) == metaPhone(str2, maxPhonemes)


 }
	public class JaroWinklerScore {
	/**
	* Applies the Jaro-Winkler distance algorithm to the given strings, providing information about the
	* similarity of them.
	*
	* @param s1 The first string that gets compared. May be <code>null</node> or empty.
	* @param s2 The second string that gets compared. May be <code>null</node> or empty.
	* @return The Jaro-Winkler score (between 0.0 and 1.0), with a higher value indicating larger similarity.
	*
	* @author Thomas Trojer <[email protected]>
	*/
	public static double compute(final String s1, final String s2) {
	// lowest score on empty strings
	if (s1 == null \|\| s2 == null \|\| s1.isEmpty() \|\| s2.isEmpty()) {
	return 0;
	}
	// highest score on equal strings
	if (s1.equals(s2)) {
	return 1;
	}
	// some score on different strings
	int prefixMatch = 0; // exact prefix matches
	int matches = 0; // matches (including prefix and ones requiring transpostion)
	int transpositions = 0; // matching characters that are not aligned but close together
	int maxLength = Math.max(s1.length(), s2.length());
	int maxMatchDistance = Math.max((int) Math.floor(maxLength / 2.0) - 1, 0); // look-ahead/-behind to limit transposed matches
	// comparison
	final String shorter = s1.length() < s2.length() ? s1 : s2;
	final String longer = s1.length() >= s2.length() ? s1 : s2;
	for (int i = 0; i < shorter.length(); i++) {
	// check for exact matches
	boolean match = shorter.charAt(i) == longer.charAt(i);
	if (match) {
	if (i < 4) {
	// prefix match (of at most 4 characters, as described by the algorithm)
	prefixMatch++;
	}
	matches++;
	continue;
	}
	// check fro transposed matches
	for (int j = Math.max(i - maxMatchDistance, 0); j < Math.min(i + maxMatchDistance, longer.length()); j++) {
	if (i == j) {
	// case already covered
	continue;
	}
	// transposition required to match?
	match = shorter.charAt(i) == longer.charAt(j);
	if (match) {
	transpositions++;
	break;
	}
	}
	}
	// any matching characters?
	if (matches == 0) {
	return 0;
	}
	// modify transpositions (according to the algorithm)
	transpositions = (int) (transpositions / 2.0);
	// non prefix-boosted score
	double score = 0.3334 * (matches / (double) longer.length() + matches / (double) shorter.length() + (matches - transpositions)
	/ (double) matches);
	if (score < 0.7) {
	return score;
	}
	// we already have a good match, hence we boost the score proportional to the common prefix
	double boostedScore = score + prefixMatch * 0.1 * (1.0 - score);
	return boostedScore;
	}
	}
	fun levenshtein(lhs : CharSequence, rhs : CharSequence) : Int {
	val lhsLength = lhs.length
	val rhsLength = rhs.length

	var cost = Array(lhsLength) { it }
	var newCost = Array(lhsLength) { 0 }

	for (i in 1..rhsLength-1) {
	newCost[0] = i

	for (j in 1..lhsLength-1) {
	val match = if(lhs[j - 1] == rhs[i - 1]) 0 else 1

	val costReplace = cost[j - 1] + match
	val costInsert = cost[j] + 1
	val costDelete = newCost[j - 1] + 1

	newCost[j] = Math.min(Math.min(costInsert, costDelete), costReplace)
	}

	val swap = cost
	cost = newCost
	newCost = swap
	}

	return cost[lhsLength - 1]
	}
	package com.webserveis.app.jsoneliners.core

	import java.util.*

	object SimilarityStrings {

	private const val VOWELS = "AEIOU"
	private const val FRONTV = "EIY"
	private const val VARSON = "CSPTG"

	fun metaPhone(word: String?, _maxPhonemes: Int = 4): String {

	var maxPhonemes = 4
	if (_maxPhonemes < 4) maxPhonemes = 4 else if (_maxPhonemes > 32) maxPhonemes = 32

	var hard: Boolean
	var str: String? = word ?: return ""

	if (str.isNullOrBlank()) return ""
	if (isInteger(str)) str = "NAN"

	val strLength = str.length

	// single character is itself
	if (strLength == 1) {
	return str.uppercase(Locale.ENGLISH)
	}

	val inwd: CharArray = str.uppercase(Locale.ENGLISH).toCharArray()
	val local = StringBuilder(40) // manipulate
	val code = StringBuilder(10) // output

	when (inwd[0]) {

	/* looking for KN, etc*/
	'K', 'G', 'P' -> if (inwd[1] == 'N') {
	local.append(inwd, 1, inwd.size - 1)
	} else {
	local.append(inwd)
	}
	/* looking for AE */
	'A' -> if (inwd[1] == 'E') {
	local.append(inwd, 1, inwd.size - 1)
	} else {
	local.append(inwd)
	}
	/* looking for WR or WH */
	'W' -> {
	when {
	inwd[1] == 'R' -> { // WR -> R
	local.append(inwd, 1, inwd.size - 1)
	}
	inwd[1] == 'H' -> {
	local.append(inwd, 1, inwd.size - 1)
	local.setCharAt(0, 'W') // WH -> W
	}
	else -> {
	local.append(inwd)
	}
	}
	}
	/* initial X becomes S */
	'X' -> {
	inwd[0] = 'S'
	local.append(inwd)
	}
	else -> {
	local.append(inwd)
	}
	}

	val wdsz = local.length
	var n = 0

	while (code.length < maxPhonemes && n < wdsz) { // max code size of 4 works well
	val symb = local[n]
	// remove duplicate letters except C
	if (symb != 'C' && isPreviousChar(local, n, symb)) {
	n++
	} else { // not dup
	when (symb) {
	'A', 'E', 'I', 'O', 'U' -> if (n == 0) {
	code.append(symb)
	}
	'B' -> {
	if (!isPreviousChar(local, n, 'M') \|\| !isLastChar(
	wdsz,
	n
	)
	) code.append(symb)
	}
	'C' -> {
	/* discard if SCI, SCE or SCY */
	if (!isPreviousChar(local, n, 'S') \|\| isLastChar(wdsz, n) \|\| FRONTV.indexOf(local[n + 1]) < 0) {
	if (regionMatch(local, n, "CIA")) { // "CIA" -> X
	code.append('X')
	} else if (!isLastChar(wdsz, n) && FRONTV.indexOf(local[n + 1]) >= 0) {
	code.append('S')
	} else if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk
	code.append('K')
	} else if (isNextChar(local, n, 'H')) { // detect CH
	if (n == 0 && wdsz >= 3 && isVowel(local, 2)) { // CH consonant -> K consonant
	code.append('K')
	} else {
	code.append('X') // CHvowel -> X
	}
	} else {
	code.append('K')
	}
	}
	}
	'D' -> if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local[n + 2]) >= 0) { // DGE DGI DGY -> J
	code.append('J')
	n += 2
	} else {
	code.append('T')
	}
	'G' -> {
	if (!isLastChar(wdsz, n + 1) \|\| !isNextChar(local, n, 'H')) {
	if (isLastChar(wdsz, n + 1) \|\| !isNextChar(local, n, 'H') \|\| isVowel(local, n + 2)) {
	if (n <= 0 \|\| !(regionMatch(local, n, "GN") \|\| regionMatch(local, n, "GNED"))) {
	hard = isPreviousChar(local, n, 'G')
	if (!isLastChar(wdsz, n) && FRONTV.indexOf(local[n + 1]) >= 0 && !hard) {
	code.append('J')
	} else {
	code.append('K')
	}
	}
	}
	}
	}
	'H' -> {
	if (!isLastChar(wdsz, n)) {
	if (n <= 0 \|\| VARSON.indexOf(local[n - 1]) < 0) {
	if (isVowel(local, n + 1)) {
	code.append('H') // Hvowel
	}
	}

	}
	}
	'F', 'J', 'L', 'M', 'N', 'R' -> code.append(symb)
	'K' -> if (n > 0) { // not initial
	if (!isPreviousChar(local, n, 'C')) {
	code.append(symb)
	}
	} else {
	code.append(symb) // initial K
	}
	'P' -> if (isNextChar(local, n, 'H')) {
	// PH -> F
	code.append('F')
	} else {
	code.append(symb)
	}
	'Q' -> code.append('K')
	'S' -> if (regionMatch(local, n, "SH") \|\|
	regionMatch(local, n, "SIO") \|\|
	regionMatch(local, n, "SIA")
	) {
	code.append('X')
	} else {
	code.append('S')
	}
	'T' -> {
	if (regionMatch(local, n, "TIA") \|\|
	regionMatch(local, n, "TIO")
	) {
	code.append('X')

	} else if (regionMatch(local, n, "TCH")) {
	// Silent if in "TCH"
	} else if (regionMatch(local, n, "TH")) {
	// substitute numeral 0 for TH (resembles theta after all)
	code.append('0')
	} else {
	code.append('T')
	}
	}
	'V' -> code.append('F')
	'W', 'Y' -> if (!isLastChar(wdsz, n) &&
	isVowel(local, n + 1)
	) {
	code.append(symb)
	}
	'X' -> {
	code.append('K')
	code.append('S')
	}
	'Z' -> code.append('S')
	else -> {
	}
	}
	n++
	} // end else from symb != 'C'
	if (code.length > maxPhonemes) {
	code.setLength(maxPhonemes)
	}
	}
	return code.toString()
	}

	private fun isInteger(str: String?) = str?.toIntOrNull()?.let { true } ?: false

	private fun isVowel(string: java.lang.StringBuilder, index: Int): Boolean {
	return VOWELS.indexOf(string[index]) >= 0
	}

	private fun isPreviousChar(string: java.lang.StringBuilder, index: Int, c: Char): Boolean {
	var matches = false
	if (index > 0 &&
	index < string.length
	) {
	matches = string[index - 1] == c
	}
	return matches
	}

	private fun isNextChar(string: java.lang.StringBuilder, index: Int, c: Char): Boolean {
	var matches = false
	if (index >= 0 &&
	index < string.length - 1
	) {
	matches = string[index + 1] == c
	}
	return matches
	}

	private fun regionMatch(string: java.lang.StringBuilder, index: Int, test: String): Boolean {
	var matches = false
	if (index >= 0 &&
	index + test.length - 1 < string.length
	) {
	val substring = string.substring(index, index + test.length)
	matches = substring == test
	}
	return matches
	}

	private fun isLastChar(wdsz: Int, n: Int): Boolean {
	return n + 1 == wdsz
	}

	fun compareDoubleMetaphone(str1: String?, str2: String?, maxPhonemes: Int = 4): Boolean = metaPhone(str1, maxPhonemes) == metaPhone(str2, maxPhonemes)


	}