Skip to content

Instantly share code, notes, and snippets.

@mjbommar
Created July 1, 2014 00:59
Show Gist options
  • Save mjbommar/a699a7096c36c30eec6d to your computer and use it in GitHub Desktop.
Save mjbommar/a699a7096c36c30eec6d to your computer and use it in GitHub Desktop.
Fuzzy sentence matching in Python - Bommarito Consulting, LLC: http://bommaritollc.com/2014/06/advanced-approximate-sentence-matching-python
# Imports
import nltk.corpus
import nltk.tokenize.punkt
import nltk.stem.snowball
import string
# Get default English stopwords and extend with punctuation.
# `stopwords` stays a plain list so existing users of the name keep list
# semantics; the extra '' entry filters tokens that become empty once
# their surrounding punctuation is stripped.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

# Create the word tokenizer.
# PunktWordTokenizer is not exposed by newer NLTK releases; when it is
# missing, fall back to WordPunctTokenizer (which offers the same
# .tokenize(text) method) instead of failing with AttributeError at import.
try:
    tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
except AttributeError:
    tokenizer = nltk.tokenize.WordPunctTokenizer()
def is_ci_token_stopword_set_match(a, b, threshold=0.5):
    """Check if a and b are matches.

    Both strings are split with the module-level ``tokenizer``; every token
    is lower-cased, stripped of leading/trailing punctuation, and dropped if
    it appears in the module-level ``stopwords``. The two resulting token
    sets are compared with the Jaccard similarity
    (size of intersection / size of union).

    Args:
        a: First sentence.
        b: Second sentence.
        threshold: Minimum Jaccard similarity for the pair to count as a
            match (inclusive).

    Returns:
        True if the similarity is greater than or equal to ``threshold``,
        otherwise False. When neither input yields any token after
        filtering, the similarity is taken to be 0.0.
    """
    # Build the exclusion set once per call so each membership test is a
    # hash lookup rather than a scan of the stopword list.
    excluded = set(stopwords)

    def normalized_tokens(text):
        # Normalize each token once, then filter on the normalized form.
        cleaned = (token.lower().strip(string.punctuation)
                   for token in tokenizer.tokenize(text))
        return set(token for token in cleaned if token not in excluded)

    tokens_a = normalized_tokens(a)
    tokens_b = normalized_tokens(b)

    # Calculate Jaccard similarity
    union = tokens_a | tokens_b
    if union:
        ratio = len(tokens_a & tokens_b) / float(len(union))
    else:
        # Nothing left to compare (empty or stopword-only inputs): avoid
        # dividing by zero and report no similarity.
        ratio = 0.0
    return ratio >= threshold
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment