Skip to content

Instantly share code, notes, and snippets.

@arastu
Last active October 11, 2021 07:32
Show Gist options
  • Save arastu/8f72a83091b85705b922a05d9aaadaae to your computer and use it in GitHub Desktop.
Save arastu/8f72a83091b85705b922a05d9aaadaae to your computer and use it in GitHub Desktop.
Jaccard-style algorithm for comparing two strings and returning a similarity score; a port and refactor of https://github.com/aceakash/string-similarity to Python.
def ngram_string(string, n=3, remove_space=False):
    """Count the overlapping character n-grams of *string*.

    Args:
        string: Text to split into n-grams.
        n: Length of each n-gram; must be at least 1.
        remove_space: When true, every ``' '`` character is stripped from
            *string* before the n-grams are extracted.

    Returns:
        A dict mapping each n-gram to the number of times it occurs. A
        string shorter than *n* (including the empty string) is treated as
        a single gram, so the result is ``{string: 1}``.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    # A non-positive window would produce empty or arbitrary slices below.
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    if remove_space:
        string = string.replace(' ', '')
    if len(string) < n:
        return {string: 1}
    ngrams = {}
    for start in range(len(string) - n + 1):
        ngram = string[start:start + n]
        ngrams[ngram] = ngrams.get(ngram, 0) + 1
    return ngrams
def jaccard_ngrams(first, second):
    """Score the overlap between two n-gram count mappings.

    Despite the name, the value computed is the Sørensen–Dice coefficient
    over multisets: twice the number of shared n-gram occurrences divided
    by the total number of occurrences in both mappings.

    Args:
        first: Dict mapping n-gram -> occurrence count.
        second: Dict mapping n-gram -> occurrence count.

    Returns:
        A float; 0.0 when nothing is shared and 1.0 when the two mappings
        hold identical counts. Two empty mappings are considered identical
        and score 1.0.
    """
    total = sum(first.values()) + sum(second.values())
    # Nothing on either side: avoid dividing by zero and treat the two
    # (equally empty) profiles as identical.
    if total == 0:
        return 1.0
    # An n-gram is shared as many times as it occurs in *both* mappings.
    shared = sum(
        min(count, second.get(ngram, 0)) for ngram, count in first.items()
    )
    return (2.0 * shared) / total
def compare_two_strings(first, second, remove_space=False):
    """Return the n-gram similarity score of two strings.

    Each string is converted to its n-gram counts with ``ngram_string``
    (default n-gram length, forwarding *remove_space*), and the two count
    mappings are then scored with ``jaccard_ngrams``.
    """
    profiles = [
        ngram_string(text, remove_space=remove_space)
        for text in (first, second)
    ]
    return jaccard_ngrams(*profiles)
def find_best_match(main_string, target_strings):
    """Find the candidate string most similar to *main_string*.

    Args:
        main_string: The string every candidate is compared against.
        target_strings: Iterable of candidate strings.

    Returns:
        A ``(best_match, rating)`` tuple: the candidate with the highest
        ``compare_two_strings`` score and that score. On ties the earliest
        candidate wins.

    Raises:
        ValueError: If *target_strings* is empty.
    """
    # Materialize once so any iterable works and can be indexed below.
    candidates = list(target_strings)
    if not candidates:
        raise ValueError("target_strings must contain at least one string")
    ratings = [
        compare_two_strings(main_string, candidate) for candidate in candidates
    ]
    # The position must be looked up in `ratings`: searching the list of
    # strings for a float score can never succeed.
    best_match_index = ratings.index(max(ratings))
    return candidates[best_match_index], ratings[best_match_index]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment