Skip to content

Instantly share code, notes, and snippets.

@scotta
Last active August 25, 2021 21:13
Show Gist options
  • Save scotta/1063364 to your computer and use it in GitHub Desktop.
Save scotta/1063364 to your computer and use it in GitHub Desktop.
Python implementation of the "Strike a match" algorithm presented in the article http://www.catalysoft.com/articles/StrikeAMatch.html
"""Implementation of the "Strike a match" algorithm presented in the article
http://www.catalysoft.com/articles/StrikeAMatch.html by Simon White.
Excerpt from the above URL: The similarity between two strings s1 and s2 is
twice the number of character pairs that are common to both strings divided by
the sum of the number of character pairs in the two strings. Note that the
formula rates completely dissimilar strings with a similarity value of 0, since
the size of the letter-pair intersection in the numerator of the fraction will
be zero. On the other hand, if you compare a (non-empty) string to itself, then
the similarity is 1.
"""
import collections
def _get_character_pairs(text):
"""Returns a dicttionary of adjacent character pair counts.
>>> _get_character_pairs('Test is')
defaultdict(<type 'int'>, {'IS': 1, 'TE': 1, 'ES': 1, 'ST': 1})
>>> _get_character_pairs('Test 123')
defaultdict(<type 'int'>, {'23': 1, '12': 1, 'TE': 1, 'ES': 1, 'ST': 1})
>>> _get_character_pairs('Test TEST')
defaultdict(<type 'int'>, {'TE': 2, 'ES': 2, 'ST': 2})
>>> _get_character_pairs('ai a al a')
defaultdict(<type 'int'>, {'AI': 1, 'AL': 1})
>>> _get_character_pairs('12345')
defaultdict(<type 'int'>, {'34': 1, '12': 1, '45': 1, '23': 1})
>>> _get_character_pairs('A')
defaultdict(<type 'int'>, {})
>>> _get_character_pairs('A B')
defaultdict(<type 'int'>, {})
>>> _get_character_pairs(123)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "strikeamatch.py", line 31, in _get_character_pairs
if not hasattr(text, "upper"): raise ValueError
ValueError: Invalid argument
"""
if not hasattr(text, "upper"):
raise ValueError("Invalid argument")
results = collections.defaultdict(int) # default value of 0
for word in text.upper().split():
for pair in [word[i]+word[i+1] for i in range(len(word)-1)]:
results[pair] += 1
return results
def compare_strings(string1, string2):
"""Returns a value between 0.0 and 1.0 indicating the similarity between the
two strings. A value of 1.0 is a perfect match and 0.0 is no similarity.
>>> for w in ('Sealed', 'Healthy', 'Heard', 'Herded', 'Help', 'Sold'):
... compare_strings('Healed', w)
...
0.8
0.5454545454545454
0.4444444444444444
0.4
0.25
0.0
>>> compare_strings("Horse", "Horse box")
0.8
>>> compare_strings("Horse BOX", "Horse box")
1.0
>>> compare_strings("ABCD", "AB") == compare_strings("AB", "ABCD")
True
"""
s1_pairs = _get_character_pairs(string1)
s2_pairs = _get_character_pairs(string2)
s1_size = sum(s1_pairs.values())
s2_size = sum(s2_pairs.values())
intersection_count = 0
# determine the smallest dict to optimise the calculation of the
# intersection.
if s1_size < s2_size:
smaller_dict = s1_pairs
larger_dict = s2_pairs
else:
smaller_dict = s2_pairs
larger_dict = s1_pairs
# determine the intersection by counting the subtractions we make from both
# dicts.
for pair, smaller_pair_count in smaller_dict.items():
if pair in larger_dict and larger_dict[pair] > 0:
if smaller_pair_count < larger_dict[pair]:
intersection_count += smaller_pair_count
else:
intersection_count += larger_dict[pair]
return (2.0 * intersection_count) / (s1_size + s2_size)
if __name__ == "__main__":
import doctest
doctest.testmod()
Copy link

ghost commented Feb 22, 2013

results = dict()

for word in text.upper().split():
    for pair in [word[i]+word[i+1] for i in range(len(word)-1)]:
        if pair in results:
            results[pair] += 1
        else:
            results[pair] = 1
return results

We will have:

import collections
result = collections.defaultdict(lambda:1)

for word in text.upper().split():
    for pair in [word[i]+word[i+1] for i in range(len(word)-1)]:
        if pair in results:
            results[pair] += 1
return results

@scotta
Copy link
Author

scotta commented Apr 14, 2013

Thanks, good spot. I've modified it to use a default dict.

@manjur12
Copy link

Scotta,
Good work, but let me know how to run this script.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment