Created
April 15, 2013 22:06
-
-
Save saml/5391660 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
# Matches one or more consecutive non-word characters (anything outside
# [A-Za-z0-9_] for str patterns), i.e. punctuation and whitespace runs.
NON_WORD = re.compile(r'\W+')

# The query string every candidate is scored against.
target = '55 w. 45th st.'

# Strings to compare with ``target``.
# NOTE(review): each entry looks like "<street address> | <cross streets>";
# the "|" and any trailing spaces are part of the data and are kept as-is.
candidates = [
    '11 park pl | btwn broadway & church st ',
    '217 e 86th st | at 3rd ave ',
    '2 e 33rd st | at 5th ave ',
    "23 3rd ave | btwn st mark's pl & stuyvesant st",
    '55 w 45th st | at 6th ave ',
    '2047 broadway | btwn 70th & 71st st ',
    '401 7th ave | btwn 32nd & 33rd st ',
    '1 e 43rd st | btwn 5th & madison ave',
    '122 university pl | btwn 13th & 14th st ',
]
def bag(sentence):
    """Return the set of distinct tokens found in *sentence*.

    The sentence is first passed through ``sanitize`` and then split on
    whitespace. Because the result is a set, repeated tokens collapse to a
    single entry and token order is discarded. No case folding is applied.
    """
    words = sanitize(sentence).split()
    return set(words)
def tanimoto(a, b):
    """Return the Tanimoto (Jaccard) similarity of two sets.

    The score is ``|a ∩ b| / |a ∪ b|``: 1.0 when both sets hold exactly the
    same elements, 0.0 when they share none.

    Args:
        a: A set (anything providing ``intersection`` and ``union``).
        b: A set or other iterable accepted by ``a.intersection`` /
            ``a.union``.

    Returns:
        A float in the range [0.0, 1.0]. When both inputs are empty the
        ratio is undefined; 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    combined = a.union(b)
    if not combined:
        # Both sets are empty: nothing to compare, so report no similarity
        # rather than dividing by zero.
        return 0.0
    shared = a.intersection(b)
    # float() keeps true division even where "/" on ints would truncate.
    return float(len(shared)) / len(combined)
def sanitize(s):
    """Return *s* with every run of non-word characters replaced by a space.

    Uses the module-level ``NON_WORD`` pattern, so consecutive punctuation
    and whitespace collapse into a single space. Leading or trailing
    non-word runs also become a single space; they are not stripped here.
    """
    return NON_WORD.sub(' ', s)
# Tokenize the target once; it is the same for every comparison below.
target_bag = bag(target)

# Score each candidate against the target and print one tab-separated line
# per candidate: "<score>\t<target> ~ <candidate>", in list order.
for candidate in candidates:
    score = tanimoto(target_bag, bag(candidate))
    print('%f\t%s ~ %s' % (score, target, candidate))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment