Skip to content

Instantly share code, notes, and snippets.

@saml
Created April 15, 2013 22:06
Show Gist options
  • Save saml/5391660 to your computer and use it in GitHub Desktop.
Save saml/5391660 to your computer and use it in GitHub Desktop.
import re
# Matches one or more consecutive non-word characters (punctuation, spaces, ...).
NON_WORD = re.compile(r'\W+')

# The address we are trying to find among the candidates.
target = '55 w. 45th st.'

# Known addresses to compare against the target. Trailing spaces are part of
# the data and are kept exactly as given.
candidates = [
    '11 park pl | btwn broadway & church st ',
    '217 e 86th st | at 3rd ave ',
    '2 e 33rd st | at 5th ave ',
    "23 3rd ave | btwn st mark's pl & stuyvesant st",
    '55 w 45th st | at 6th ave ',
    '2047 broadway | btwn 70th & 71st st ',
    '401 7th ave | btwn 32nd & 33rd st ',
    '1 e 43rd st | btwn 5th & madison ave',
    '122 university pl | btwn 13th & 14th st ',
]
def bag(sentence):
    """Return the set of distinct tokens in *sentence*.

    The text is first passed through ``sanitize`` and then split on
    whitespace; duplicates collapse because the result is a ``set``.
    """
    return set(sanitize(sentence).split())
def tanimoto(a, b):
    """Return the Tanimoto (Jaccard) coefficient of two sets.

    The coefficient is the size of the intersection divided by the size of
    the union: 0.0 for disjoint inputs, 1.0 for equal non-empty inputs.

    Args:
        a: A set (its ``union`` and ``intersection`` methods are used).
        b: The collection of items to compare against ``a``.

    Returns:
        A float between 0.0 and 1.0. When both inputs are empty the ratio
        would be 0/0; 0.0 is returned so that two empty token bags are
        never reported as a match.
    """
    union_size = len(a.union(b))
    if union_size == 0:
        # Both inputs are empty; avoid ZeroDivisionError.
        return 0.0
    # float() keeps true division on interpreters where int / int truncates.
    return float(len(a.intersection(b))) / union_size
def sanitize(s):
    """Return *s* with every run matched by ``NON_WORD`` replaced by one space."""
    # Splitting on the pattern and re-joining with a space is the same as
    # substituting a space for each match (the pattern has no groups).
    pieces = NON_WORD.split(s)
    return ' '.join(pieces)
# Tokenize the target once; it is the same for every comparison.
target_bag = bag(target)

# Print one "score<TAB>target ~ candidate" line per candidate.
for candidate in candidates:
    candidate_bag = bag(candidate)
    score = tanimoto(target_bag, candidate_bag)
    line = '%f\t%s ~ %s' % (score, target, candidate)
    print(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment