Created
December 21, 2020 18:52
-
-
Save nicklockwood/27f45d8e8710bc53ad1fbbdfbeb7c68f to your computer and use it in GitHub Desktop.
Swift code to calculate the Jaro-Winkler edit distance between two strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// The Jaro-Winkler edit distance between two strings (0 - 1) | |
func editDistance(_ lhs: String, _ rhs: String) -> Double { | |
return 1 - jaroWinklerSimilarity(Array(lhs), Array(rhs)) | |
} | |
/// Jaro-Winkler similarity between two strings (0 - 1) | |
/// https://www.geeksforgeeks.org/jaro-and-jaro-winkler-similarity/ | |
private func jaroWinklerSimilarity(_ s1: [Character], _ s2: [Character]) -> Double { | |
var jaro = jaroSimilarity(s1, s2) | |
// If the jaro Similarity is above a threshold | |
if jaro > 0.7 { | |
// Find the length of common prefix | |
var prefix = 0.0 | |
for i in 0 ..< min(s1.count, s2.count) { | |
// If the characters match | |
if s1[i] == s2[i] { | |
prefix += 1 | |
} else { | |
break | |
} | |
} | |
// Maximum of 4 characters are allowed in prefix | |
prefix = min(4, prefix) | |
// Calculate jaro winkler Similarity | |
jaro += 0.1 * prefix * (1 - jaro) | |
} | |
return jaro | |
} | |
/// The Jaro similarity between two strings (0 - 1) | |
/// https://www.geeksforgeeks.org/jaro-and-jaro-winkler-similarity/ | |
private func jaroSimilarity(_ s1: [Character], _ s2: [Character]) -> Double { | |
// If the Strings are equal | |
if s1 == s2 { | |
return 1 | |
} | |
// Lengths | |
let len1 = s1.count | |
let len2 = s2.count | |
// Maximum distance up to which matching is allowed | |
let maxDist = Int(Double(max(len1, len2) / 2) - 1) | |
// Number of matches | |
var matches = 0.0 | |
// Hash for matches | |
var hash1 = [Int](repeating: 0, count: len1) | |
var hash2 = [Int](repeating: 0, count: len2) | |
// Traverse through the first String | |
for i in 0 ..< len1 { | |
// Check for matches | |
let start = max(0, i - maxDist) | |
for j in start ..< max(start, min(len2, i + maxDist + 1)) { | |
// If there is a match | |
if s1[i] == s2[j], hash2[j] == 0 { | |
hash1[i] = 1 | |
hash2[j] = 1 | |
matches += 1 | |
break | |
} | |
} | |
} | |
// If there is no match | |
if matches == 0 { | |
return 0 | |
} | |
// Number of transpositions | |
var t = 0.0 | |
// Count number of occurances where two characters match but | |
// there is a third matched character in between the indices | |
var j = 0 | |
for i in 0 ..< len1 where hash1[i] == 1 { | |
// Find the next matched character | |
// in second String | |
while hash2[j] == 0 { | |
j += 1 | |
} | |
if s1[i] != s2[j] { | |
j += 1 | |
t += 1 | |
} | |
} | |
// Return the Jaro Similarity | |
return (matches / Double(len1) + matches / Double(len2) + (matches - t / 2) / matches) / 3 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment