Last active
December 10, 2015 22:58
-
-
Save chaudum/4505916 to your computer and use it in GitHub Desktop.
The higher the Jaro–Winkler distance for two strings is, the more similar the strings are. The Jaro–Winkler distance metric is designed and best suited for short strings such as person names. The score is normalized such that 0 equates to no similarity and 1 is an exact match.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
## JARO-WINKLER-DISTANCE | |
## Ported from http://www.iugrina.com/files/JaroWinkler/JaroWinkler.phps | |
def get_common_chars(str1, str2, allowed_distance): | |
str1_len = len(str1) | |
str2_len = len(str2) | |
tmp_str2 = [str2[x] for x in xrange(len(str2))] | |
common_chars = [] | |
for i in xrange(str1_len): | |
no_match = True | |
range_min = int(max(0, i-allowed_distance)) | |
range_max = int(min(i+allowed_distance+1, str2_len)) | |
for j in xrange(range_min, range_max): | |
if no_match and tmp_str2[j] == str1[i]: | |
no_match = False | |
common_chars.append(str1[i]) | |
tmp_str2[j] = '' | |
res = "".join(common_chars) | |
return res | |
def jaro(str1, str2): | |
str1_len = len(str1) | |
str2_len = len(str2) | |
## theoretical distance | |
distance = math.floor(min(str1_len, str2_len) / 2.0) | |
## get common characters | |
commons1 = get_common_chars(str1, str2, distance) | |
commons2 = get_common_chars(str2, str1, distance) | |
commons1_len = len(commons1) | |
commons2_len = len(commons2) | |
if commons1_len == 0 or commons2_len == 0: | |
return 0 | |
## calculate transposition | |
transpositions = 0 | |
upper_bound = min(commons1_len, commons2_len) | |
for i in xrange(upper_bound): | |
if commons1[i] != commons2[i]: | |
transpositions += 1 | |
transpositions /= 2.0 | |
## return jaro distance | |
return (commons1_len/str1_len + commons2_len/str2_len + (commons1_len - transpositions)/commons1_len) / 3.0 | |
def get_prefix_length(str1, str2, MIN_PREFIX_LENGTH=4): | |
n = min(MIN_PREFIX_LENGTH, len(str1), len(str2)) | |
for i in xrange(n): | |
if str1[i] != str2[i]: | |
## return index of first occurrence of different characters | |
return i | |
return n | |
def jaro_winkler_distance(str1, str2, PREFIX_SCALE=0.1): | |
distance = jaro(str1, str2) | |
prefix_len = get_prefix_length(str1, str2) | |
return distance + prefix_len * PREFIX_SCALE * (1.0 - distance) | |
## Usage: python jaro_winkler_distance.py | |
if __name__ == "__main__": | |
str1 = raw_input('Sequence 1: ') | |
str2 = raw_input('Sequence 2: ') | |
d = jaro_winkler_distance(str1, str2) | |
print "Jaro-Winkler distance: %f" % d |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment