Created
July 1, 2014 01:24
-
-
Save mjbommar/7b5bc6d61138a085443a to your computer and use it in GitHub Desktop.
Fuzzy sentence matching in Python - Bommarito Consulting, LLC: http://bommaritollc.com/2014/06/advanced-approximate-sentence-matching-python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports
import nltk.corpus
import nltk.tokenize
import nltk.stem.snowball
import string

# Get default English stopwords and extend with punctuation so that
# punctuation-only tokens (and empty strings left over after stripping)
# are filtered out alongside ordinary stopwords.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

# Create tokenizer and stemmer.
# NOTE: PunktWordTokenizer was removed in NLTK 3.0, so the original
# nltk.tokenize.punkt.PunktWordTokenizer() raises AttributeError on any
# modern NLTK. WordPunctTokenizer is the closest drop-in replacement:
# it splits on whitespace and punctuation boundaries.
tokenizer = nltk.tokenize.WordPunctTokenizer()
stemmer = nltk.stem.snowball.SnowballStemmer('english')
def is_ci_stem_stopword_set_match(a, b, threshold=0.5):
    """Check whether sentences *a* and *b* approximately match.

    Each sentence is tokenized, lower-cased, stripped of surrounding
    punctuation, filtered of stopwords, and stemmed; the Jaccard
    similarity of the two resulting stem sets is then compared against
    *threshold*.

    Args:
        a: First sentence (str).
        b: Second sentence (str).
        threshold: Minimum Jaccard similarity in [0, 1] to count as a
            match (default 0.5).

    Returns:
        bool: True if the Jaccard similarity of the stem sets is at
        least *threshold*.
    """
    def _stem_set(sentence):
        # Normalize each token exactly once (the original computed
        # token.lower().strip(...) twice per token), drop stopwords,
        # and stem. A set suffices since Jaccard ignores multiplicity.
        stems = set()
        for token in tokenizer.tokenize(sentence):
            normalized = token.lower().strip(string.punctuation)
            if normalized not in stopwords:
                stems.add(stemmer.stem(normalized))
        return stems

    stems_a = _stem_set(a)
    stems_b = _stem_set(b)

    # Guard against ZeroDivisionError when both sentences reduce to
    # nothing (empty strings, or all stopwords/punctuation): two empty
    # stem sets are treated as a trivial match.
    union = stems_a | stems_b
    if not union:
        return True

    # Jaccard similarity: |intersection| / |union|.
    ratio = len(stems_a & stems_b) / float(len(union))
    return ratio >= threshold
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.