Created
May 28, 2016 11:15
-
-
Save fx86/b8ac66184cbfc90a087e9a73f118b0f9 to your computer and use it in GitHub Desktop.
A much more effective algorithm for string similarity found on Stack Overflow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import wikipedia | |
def get_bigrams(string):
    '''
    Takes a string and returns a list of its bigrams, i.e. every pair of
    adjacent characters, in order of appearance.

    The string is lower-cased first, so the bigrams are case-insensitive.
    A string shorter than two characters yields an empty list.
    '''
    s = string.lower()
    # range() instead of the Python 2-only xrange() so the helper runs
    # unchanged on both Python 2 and Python 3.
    return [s[i:i + 2] for i in range(len(s) - 1)]
def string_similarity(str1, str2):
    '''
    Perform bigram comparison between two strings
    and return a percentage match in decimal form.

    The score is the Dice coefficient over the two bigram lists:
    2 * (number of shared bigrams) / (total number of bigrams), which lies
    between 0.0 (nothing in common) and 1.0 (same bigrams). The comparison
    is case-insensitive because get_bigrams lower-cases its input.

    When neither string is long enough to produce a bigram, the score is
    1.0 if the strings are equal ignoring case and 0.0 otherwise.

    source: http://stackoverflow.com/a/6859596/170005
    '''
    from collections import Counter

    pairs1 = get_bigrams(str1)
    pairs2 = get_bigrams(str2)
    union = len(pairs1) + len(pairs2)

    # Both strings have fewer than two characters: there is nothing to
    # divide by, so fall back to an exact (case-insensitive) comparison.
    if union == 0:
        return 1.0 if str1.lower() == str2.lower() else 0.0

    # Multiset intersection: a bigram that occurs k times in one string can
    # match at most k occurrences in the other. Matching every repeat
    # against the same single occurrence would push the score above 1.0.
    shared = Counter(pairs1) & Counter(pairs2)
    hit_count = sum(shared.values())
    return (2.0 * hit_count) / union
def closest_wiki_result(film):
    '''
    Search Wikipedia for `film` and report the result closest to it.

    Prints every result returned by wikipedia.search(film), scores each
    one against `film` with string_similarity, and prints the result with
    the highest score (the earliest result wins ties).

    film: the query string to search for and to compare results against.

    Returns the closest-matching result, or None when the search returned
    nothing.
    '''
    # The file imports the package as `wikipedia`; no name `wiki` is bound.
    search_results = wikipedia.search(film)

    # Single-argument print(...) calls behave the same as a Python 2 print
    # statement and as the Python 3 print function.
    print("Search results : ")
    if not search_results:
        # Nothing to index into, so report it instead of raising IndexError.
        print('\nNo match found for  %s' % film)
        return None

    closest_match = 0
    max_score = 0
    for ind, result in enumerate(search_results):
        print("\t%s" % result)
        score = string_similarity(film, result)
        if score > max_score:
            max_score = score
            closest_match = ind

    best = search_results[closest_match]
    # The doubled spaces reproduce the separators the original
    # comma-separated print statement emitted between its items.
    print('\nClosest match for  %s  is : %s' % (film, best))
    return best
# The query always ends with the word ' film'; the suffix is appended
# even when the original movie name already contains it.
film = 'Jurassic World (2015) film'
closest_wiki_result(film)
### Expected output | |
# Search results : | |
# Jurassic Park | |
# Jurassic | |
# Jurassic Park (film) | |
# Jurassic Park III | |
# Jurassic World | |
# Jurassic World: Original Motion Picture Soundtrack | |
# The Lost World: Jurassic Park | |
# The Lost World: Jurassic Park (Sega game) | |
# List of Jurassic Park video games | |
# Lego Jurassic World | |
# | |
# Closest match for Jurassic World (2015) film is : Jurassic World |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
👍