francoiseprovencher · August 18, 2014 17:05
diff --git a/strikeamatch.py b/strikeamatch.py
 """Implementation of the "Strike a match" algorithm presented in the article  
 http://www.catalysoft.com/articles/StrikeAMatch.html by Simon White.
 
 Excerpt from the above URL: The similarity between two strings s1 and s2 is
 twice the number of character pairs that are common to both strings divided by
 the sum of the number of character pairs in the two strings. Note that the
 formula rates completely dissimilar strings with a similarity value of 0, since
 the size of the letter-pair intersection in the numerator of the fraction will
 be zero. On the other hand, if you compare a (non-empty) string to itself, then
 the similarity is 1.
 
 """
 import collections

 def _get_character_pairs(text):
    """Returns a dicttionary of adjacent character pair counts.
 
    >>> _get_character_pairs('Test is')
    defaultdict(<type 'int'>, {'IS': 1, 'TE': 1, 'ES': 1, 'ST': 1})
    >>> _get_character_pairs('Test 123')
    defaultdict(<type 'int'>, {'23': 1, '12': 1, 'TE': 1, 'ES': 1, 'ST': 1})
    >>> _get_character_pairs('Test TEST')
    defaultdict(<type 'int'>, {'TE': 2, 'ES': 2, 'ST': 2})
    >>> _get_character_pairs('ai a al a')
    defaultdict(<type 'int'>, {'AI': 1, 'AL': 1})
    >>> _get_character_pairs('12345')
    defaultdict(<type 'int'>, {'34': 1, '12': 1, '45': 1, '23': 1})
    >>> _get_character_pairs('A')
    defaultdict(<type 'int'>, {})
    >>> _get_character_pairs('A B')
    defaultdict(<type 'int'>, {})
    >>> _get_character_pairs(123)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "strikeamatch.py", line 31, in _get_character_pairs
        if not hasattr(text, "upper"): raise ValueError
    ValueError: Invalid argument

    """

    if not hasattr(text, "upper"):
        raise ValueError("Invalid argument")

    results = collections.defaultdict(int)  # default value of 0

    for word in text.upper().split():
        for pair in [word[i]+word[i+1] for i in range(len(word)-1)]:
            results[pair] += 1
    return results

 def compare_strings(string1, string2):
    """Returns a value between 0.0 and 1.0 indicating the similarity between the
    two strings. A value of 1.0 is a perfect match and 0.0 is no similarity.
 
    >>> for w in ('Sealed', 'Healthy', 'Heard', 'Herded', 'Help', 'Sold'):
    ...     compare_strings('Healed', w)
    ... 
    0.8
    0.5454545454545454
    0.4444444444444444
    0.4
    0.25
    0.0
    >>> compare_strings("Horse", "Horse box")
    0.8
    >>> compare_strings("Horse BOX", "Horse box")
    1.0
    >>> compare_strings("ABCD", "AB") == compare_strings("AB", "ABCD") 
    True

    """

    s1_pairs = _get_character_pairs(string1)
    s2_pairs = _get_character_pairs(string2)

    s1_size = sum(s1_pairs.values())
    s2_size = sum(s2_pairs.values())

    intersection_count = 0

    # determine the smallest dict to optimise the calculation of the
    # intersection.
    if s1_size < s2_size:
        smaller_dict = s1_pairs
        larger_dict = s2_pairs
    else:
        smaller_dict = s2_pairs
        larger_dict = s1_pairs

    # determine the intersection by counting the subtractions we make from both
    # dicts.
    for pair, smaller_pair_count in smaller_dict.items():
        if pair in larger_dict and larger_dict[pair] > 0:
            if smaller_pair_count < larger_dict[pair]:
                intersection_count += smaller_pair_count
            else:
                intersection_count += larger_dict[pair]

    return (2.0 * intersection_count) / (s1_size + s2_size)

 if __name__ == "__main__":
    import doctest
    doctest.testmod()
	"""Implementation of the "Strike a match" algorithm presented in the article
	http://www.catalysoft.com/articles/StrikeAMatch.html by Simon White.

	Excerpt from the above URL: The similarity between two strings s1 and s2 is
	twice the number of character pairs that are common to both strings divided by
	the sum of the number of character pairs in the two strings. Note that the
	formula rates completely dissimilar strings with a similarity value of 0, since
	the size of the letter-pair intersection in the numerator of the fraction will
	be zero. On the other hand, if you compare a (non-empty) string to itself, then
	the similarity is 1.

	"""
	import collections

	def _get_character_pairs(text):
	"""Returns a dicttionary of adjacent character pair counts.

	>>> _get_character_pairs('Test is')
	defaultdict(<type 'int'>, {'IS': 1, 'TE': 1, 'ES': 1, 'ST': 1})
	>>> _get_character_pairs('Test 123')
	defaultdict(<type 'int'>, {'23': 1, '12': 1, 'TE': 1, 'ES': 1, 'ST': 1})
	>>> _get_character_pairs('Test TEST')
	defaultdict(<type 'int'>, {'TE': 2, 'ES': 2, 'ST': 2})
	>>> _get_character_pairs('ai a al a')
	defaultdict(<type 'int'>, {'AI': 1, 'AL': 1})
	>>> _get_character_pairs('12345')
	defaultdict(<type 'int'>, {'34': 1, '12': 1, '45': 1, '23': 1})
	>>> _get_character_pairs('A')
	defaultdict(<type 'int'>, {})
	>>> _get_character_pairs('A B')
	defaultdict(<type 'int'>, {})
	>>> _get_character_pairs(123)
	Traceback (most recent call last):
	File "<stdin>", line 1, in <module>
	File "strikeamatch.py", line 31, in _get_character_pairs
	if not hasattr(text, "upper"): raise ValueError
	ValueError: Invalid argument

	"""

	if not hasattr(text, "upper"):
	raise ValueError("Invalid argument")

	results = collections.defaultdict(int) # default value of 0

	for word in text.upper().split():
	for pair in [word[i]+word[i+1] for i in range(len(word)-1)]:
	results[pair] += 1
	return results

	def compare_strings(string1, string2):
	"""Returns a value between 0.0 and 1.0 indicating the similarity between the
	two strings. A value of 1.0 is a perfect match and 0.0 is no similarity.

	>>> for w in ('Sealed', 'Healthy', 'Heard', 'Herded', 'Help', 'Sold'):
	... compare_strings('Healed', w)
	...
	0.8
	0.5454545454545454
	0.4444444444444444
	0.4
	0.25
	0.0
	>>> compare_strings("Horse", "Horse box")
	0.8
	>>> compare_strings("Horse BOX", "Horse box")
	1.0
	>>> compare_strings("ABCD", "AB") == compare_strings("AB", "ABCD")
	True

	"""

	s1_pairs = _get_character_pairs(string1)
	s2_pairs = _get_character_pairs(string2)

	s1_size = sum(s1_pairs.values())
	s2_size = sum(s2_pairs.values())

	intersection_count = 0

	# determine the smallest dict to optimise the calculation of the
	# intersection.
	if s1_size < s2_size:
	smaller_dict = s1_pairs
	larger_dict = s2_pairs
	else:
	smaller_dict = s2_pairs
	larger_dict = s1_pairs

	# determine the intersection by counting the subtractions we make from both
	# dicts.
	for pair, smaller_pair_count in smaller_dict.items():
	if pair in larger_dict and larger_dict[pair] > 0:
	if smaller_pair_count < larger_dict[pair]:
	intersection_count += smaller_pair_count
	else:
	intersection_count += larger_dict[pair]

	return (2.0 * intersection_count) / (s1_size + s2_size)

	if __name__ == "__main__":
	import doctest
	doctest.testmod()