Created
May 9, 2017 18:29
-
-
Save TApicella/fd7a5ba8e48a36ffac5f3385695082c2 to your computer and use it in GitHub Desktop.
RosettaCode JaroDistance created by tapicella - https://repl.it/Hous/36
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
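'''
http://rosettacode.org/wiki/Jaro_distance
d = 0 if there are no matching characters
otherwise d = (1/3) * (m/len(s1) + m/len(s2) + (m - t)/m)
where m is the number of matching characters and t is half the number of transpositions
two equal characters match if their positions differ by at most (max(len(s1), len(s2))//2) - 1
matched characters that appear in a different order in the two strings count as transpositions
'''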
def JaroDistance(s1, s2):
    """Return the Jaro similarity of two strings, rounded to 5 decimals.

    The score is ``(m/len(s1) + m/len(s2) + (m - t)/m) / 3`` where ``m`` is
    the number of matching characters and ``t`` is half the number of
    matched characters that appear in a different order in the two strings.
    Two equal characters match when their positions differ by no more than
    ``max(len(s1), len(s2)) // 2 - 1`` and neither has been paired already.

    Args:
        s1: First string (any indexable sequence of comparable items works).
        s2: Second string.

    Returns:
        0 when the strings share no matching characters (including when
        either is empty); otherwise a float in (0, 1], where 1.0 means the
        strings are identical.

    Side effects:
        Prints one diagnostic line with the lengths, ``m`` and ``t``.
    """
    l1 = len(s1)
    l2 = len(s2)

    # Clamp the window at 0: for strings of length <= 1 the raw formula
    # yields -1, which would make even identical one-character strings
    # fail to match.
    window = max(max(l1, l2) // 2 - 1, 0)

    # paired2[j] is True once s2[j] has been matched, so a character in s2
    # can never be counted as a match for two different characters of s1.
    paired2 = [False] * l2
    matched1 = []
    for i, ch in enumerate(s1):
        lo = max(0, i - window)
        hi = min(l2, i + window + 1)
        for j in range(lo, hi):
            if not paired2[j] and s2[j] == ch:
                paired2[j] = True
                matched1.append(ch)
                break

    m = len(matched1)

    # Matched characters of s2 in s2 order; always the same length as
    # matched1 because every match pairs exactly one item from each side.
    matched2 = [s2[j] for j in range(l2) if paired2[j]]

    # Each out-of-order position is half of a transposition.
    t = sum(1 for a, b in zip(matched1, matched2) if a != b) // 2

    print("L1: %d, L2: %d, m: %d, t: %d" % (l1, l2, m, t))

    if m == 0:
        jdist = 0
    else:
        jdist = ((m / l1) + (m / l2) + ((m - t) / m)) / 3
    return round(jdist, 5)
# Demo: report the Jaro distance for each Rosetta Code sample word pair.
for first, second in (
    ("DWAYNE", "DUANE"),
    ("MARTHA", "MARHTA"),
    ("DIXON", "DICKSONX"),
    ("JELLYFISH", "SMELLYFISH"),
):
    print("Jaro Distance for %s and %s is %s" % (first, second, JaroDistance(first, second)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment