Skip to content

Instantly share code, notes, and snippets.

@francoiseprovencher
Forked from scotta/strikeamatch.py
Created August 18, 2014 17:05
Show Gist options
  • Save francoiseprovencher/e00e3df053e75e93c245 to your computer and use it in GitHub Desktop.
Save francoiseprovencher/e00e3df053e75e93c245 to your computer and use it in GitHub Desktop.
"""Implementation of the "Strike a match" algorithm presented in the article
http://www.catalysoft.com/articles/StrikeAMatch.html by Simon White.
Excerpt from the above URL: The similarity between two strings s1 and s2 is
twice the number of character pairs that are common to both strings divided by
the sum of the number of character pairs in the two strings. Note that the
formula rates completely dissimilar strings with a similarity value of 0, since
the size of the letter-pair intersection in the numerator of the fraction will
be zero. On the other hand, if you compare a (non-empty) string to itself, then
the similarity is 1.
"""
import collections
def _get_character_pairs(text):
"""Returns a dicttionary of adjacent character pair counts.
>>> _get_character_pairs('Test is')
defaultdict(<type 'int'>, {'IS': 1, 'TE': 1, 'ES': 1, 'ST': 1})
>>> _get_character_pairs('Test 123')
defaultdict(<type 'int'>, {'23': 1, '12': 1, 'TE': 1, 'ES': 1, 'ST': 1})
>>> _get_character_pairs('Test TEST')
defaultdict(<type 'int'>, {'TE': 2, 'ES': 2, 'ST': 2})
>>> _get_character_pairs('ai a al a')
defaultdict(<type 'int'>, {'AI': 1, 'AL': 1})
>>> _get_character_pairs('12345')
defaultdict(<type 'int'>, {'34': 1, '12': 1, '45': 1, '23': 1})
>>> _get_character_pairs('A')
defaultdict(<type 'int'>, {})
>>> _get_character_pairs('A B')
defaultdict(<type 'int'>, {})
>>> _get_character_pairs(123)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "strikeamatch.py", line 31, in _get_character_pairs
if not hasattr(text, "upper"): raise ValueError
ValueError: Invalid argument
"""
if not hasattr(text, "upper"):
raise ValueError("Invalid argument")
results = collections.defaultdict(int) # default value of 0
for word in text.upper().split():
for pair in [word[i]+word[i+1] for i in range(len(word)-1)]:
results[pair] += 1
return results
def compare_strings(string1, string2):
"""Returns a value between 0.0 and 1.0 indicating the similarity between the
two strings. A value of 1.0 is a perfect match and 0.0 is no similarity.
>>> for w in ('Sealed', 'Healthy', 'Heard', 'Herded', 'Help', 'Sold'):
... compare_strings('Healed', w)
...
0.8
0.5454545454545454
0.4444444444444444
0.4
0.25
0.0
>>> compare_strings("Horse", "Horse box")
0.8
>>> compare_strings("Horse BOX", "Horse box")
1.0
>>> compare_strings("ABCD", "AB") == compare_strings("AB", "ABCD")
True
"""
s1_pairs = _get_character_pairs(string1)
s2_pairs = _get_character_pairs(string2)
s1_size = sum(s1_pairs.values())
s2_size = sum(s2_pairs.values())
intersection_count = 0
# determine the smallest dict to optimise the calculation of the
# intersection.
if s1_size < s2_size:
smaller_dict = s1_pairs
larger_dict = s2_pairs
else:
smaller_dict = s2_pairs
larger_dict = s1_pairs
# determine the intersection by counting the subtractions we make from both
# dicts.
for pair, smaller_pair_count in smaller_dict.items():
if pair in larger_dict and larger_dict[pair] > 0:
if smaller_pair_count < larger_dict[pair]:
intersection_count += smaller_pair_count
else:
intersection_count += larger_dict[pair]
return (2.0 * intersection_count) / (s1_size + s2_size)
if __name__ == "__main__":
import doctest
doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment