Last active
August 25, 2021 21:13
-
-
Save scotta/1063364 to your computer and use it in GitHub Desktop.
Python implementation of the "Strike a match" algorithm presented in the article http://www.catalysoft.com/articles/StrikeAMatch.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Implementation of the "Strike a match" algorithm presented in the article | |
http://www.catalysoft.com/articles/StrikeAMatch.html by Simon White. | |
Excerpt from the above URL: The similarity between two strings s1 and s2 is | |
twice the number of character pairs that are common to both strings divided by | |
the sum of the number of character pairs in the two strings. Note that the | |
formula rates completely dissimilar strings with a similarity value of 0, since | |
the size of the letter-pair intersection in the numerator of the fraction will | |
be zero. On the other hand, if you compare a (non-empty) string to itself, then | |
the similarity is 1. | |
""" | |
import collections | |
def _get_character_pairs(text): | |
"""Returns a dicttionary of adjacent character pair counts. | |
>>> _get_character_pairs('Test is') | |
defaultdict(<type 'int'>, {'IS': 1, 'TE': 1, 'ES': 1, 'ST': 1}) | |
>>> _get_character_pairs('Test 123') | |
defaultdict(<type 'int'>, {'23': 1, '12': 1, 'TE': 1, 'ES': 1, 'ST': 1}) | |
>>> _get_character_pairs('Test TEST') | |
defaultdict(<type 'int'>, {'TE': 2, 'ES': 2, 'ST': 2}) | |
>>> _get_character_pairs('ai a al a') | |
defaultdict(<type 'int'>, {'AI': 1, 'AL': 1}) | |
>>> _get_character_pairs('12345') | |
defaultdict(<type 'int'>, {'34': 1, '12': 1, '45': 1, '23': 1}) | |
>>> _get_character_pairs('A') | |
defaultdict(<type 'int'>, {}) | |
>>> _get_character_pairs('A B') | |
defaultdict(<type 'int'>, {}) | |
>>> _get_character_pairs(123) | |
Traceback (most recent call last): | |
File "<stdin>", line 1, in <module> | |
File "strikeamatch.py", line 31, in _get_character_pairs | |
if not hasattr(text, "upper"): raise ValueError | |
ValueError: Invalid argument | |
""" | |
if not hasattr(text, "upper"): | |
raise ValueError("Invalid argument") | |
results = collections.defaultdict(int) # default value of 0 | |
for word in text.upper().split(): | |
for pair in [word[i]+word[i+1] for i in range(len(word)-1)]: | |
results[pair] += 1 | |
return results | |
def compare_strings(string1, string2): | |
"""Returns a value between 0.0 and 1.0 indicating the similarity between the | |
two strings. A value of 1.0 is a perfect match and 0.0 is no similarity. | |
>>> for w in ('Sealed', 'Healthy', 'Heard', 'Herded', 'Help', 'Sold'): | |
... compare_strings('Healed', w) | |
... | |
0.8 | |
0.5454545454545454 | |
0.4444444444444444 | |
0.4 | |
0.25 | |
0.0 | |
>>> compare_strings("Horse", "Horse box") | |
0.8 | |
>>> compare_strings("Horse BOX", "Horse box") | |
1.0 | |
>>> compare_strings("ABCD", "AB") == compare_strings("AB", "ABCD") | |
True | |
""" | |
s1_pairs = _get_character_pairs(string1) | |
s2_pairs = _get_character_pairs(string2) | |
s1_size = sum(s1_pairs.values()) | |
s2_size = sum(s2_pairs.values()) | |
intersection_count = 0 | |
# determine the smallest dict to optimise the calculation of the | |
# intersection. | |
if s1_size < s2_size: | |
smaller_dict = s1_pairs | |
larger_dict = s2_pairs | |
else: | |
smaller_dict = s2_pairs | |
larger_dict = s1_pairs | |
# determine the intersection by counting the subtractions we make from both | |
# dicts. | |
for pair, smaller_pair_count in smaller_dict.items(): | |
if pair in larger_dict and larger_dict[pair] > 0: | |
if smaller_pair_count < larger_dict[pair]: | |
intersection_count += smaller_pair_count | |
else: | |
intersection_count += larger_dict[pair] | |
return (2.0 * intersection_count) / (s1_size + s2_size) | |
if __name__ == "__main__": | |
import doctest | |
doctest.testmod() | |
Thanks, good spot. I've modified it to use a default dict.
Scotta,
Good work, but let me know how to run this script.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
results = dict()
We will have:
import collections
result = collections.defaultdict(lambda:1)