Fuzzy sentence matching in Python - Bommarito Consulting, LLC: http://bommaritollc.com/2014/06/fuzzy-match-sentences-in-python
# ## IPython Notebook for [Bommarito Consulting](http://bommaritollc.com/) Blog Post
# ### **Link**: [Fuzzy sentence matching in Python](http://bommaritollc.com/2014/06/fuzzy-match-sentences-in-python): http://bommaritollc.com/2014/06/fuzzy-match-sentences-in-python
# **Author**: [Michael J. Bommarito II](https://www.linkedin.com/in/bommarito/)

# Imports
import nltk.corpus
import nltk.tokenize.punkt
import nltk.stem.snowball
from nltk.corpus import wordnet
import string

# Get default English stopwords and extend with punctuation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')
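# Illustrative check (not from the original post): punctuation characters and the
# empty string are now treated as stopwords alongside common English function words.
assert 'the' in stopwords and '.' in stopwords and '' in stopwords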
def get_wordnet_pos(pos_tag):
    """Map a (token, Penn Treebank tag) pair to a (token, WordNet POS) pair,
    defaulting to noun when the tag has no WordNet equivalent."""
    if pos_tag[1].startswith('J'):
        return (pos_tag[0], wordnet.ADJ)
    elif pos_tag[1].startswith('V'):
        return (pos_tag[0], wordnet.VERB)
    elif pos_tag[1].startswith('N'):
        return (pos_tag[0], wordnet.NOUN)
    elif pos_tag[1].startswith('R'):
        return (pos_tag[0], wordnet.ADV)
    else:
        return (pos_tag[0], wordnet.NOUN)
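# Illustrative check (example token/tag pair is hypothetical, not from the original
# post): a Treebank verb tag such as 'VBG' maps to the WordNet verb constant.
assert get_wordnet_pos(('running', 'VBG')) == ('running', wordnet.VERB)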
# Create tokenizer and lemmatizer
# Note: PunktWordTokenizer was removed in later NLTK releases;
# nltk.tokenize.WordPunctTokenizer() or nltk.word_tokenize are common substitutes.
tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
def is_ci_token_stopword_lemma_match(a, b):
    """Check whether a and b match after case-folding, tokenization,
    stopword/punctuation removal, and POS-aware lemmatization."""
    pos_a = map(get_wordnet_pos, nltk.pos_tag(tokenizer.tokenize(a)))
    pos_b = map(get_wordnet_pos, nltk.pos_tag(tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos)
                for token, pos in pos_a
                if token.lower().strip(string.punctuation) not in stopwords]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos)
                for token, pos in pos_b
                if token.lower().strip(string.punctuation) not in stopwords]
    return (lemmae_a == lemmae_b)
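# Example usage (the sentence pair below is illustrative, not taken from the
# original post): case, punctuation, stopwords, and inflection are normalized
# away, so these two sentences should compare as equal.
sentence_a = "The cats are sitting on the mat."
sentence_b = "A cat sat on the mat"
print(is_ci_token_stopword_lemma_match(sentence_a, sentence_b))
# Expected: True (both reduce to the lemma sequence ['cat', 'sit', 'mat'])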