Created
July 26, 2016 12:00
-
-
Save karimkhanp/4b7626a933759d0113d54b09acef24bf to your computer and use it in GitHub Desktop.
Extracting the noun phrases using nltk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk
from nltk.corpus import stopwords

# Read the passage to analyse and echo it back.
text = input("Enter the text please ...")
print(text)

# Tokenizer pattern. Alternatives are tried left to right at each position,
# so the more specific forms come first. Dots are escaped so they match a
# literal "." rather than any character, and "-" sits last in the
# punctuation class so it is a literal hyphen instead of forming a range.
sentence_re = "|".join([
    r'(?:[A-Z](?:\.[A-Z])+\.?)',   # dotted abbreviations, e.g. U.S.A.
    r'(?:\w+(?:-\w+)*)',           # words with optional internal hyphens
    r'(?:\$?\d+(?:\.\d+)?%?)',     # currency and percentages, e.g. $12.40
    r'(?:\.\.\.)',                 # ellipsis
    r'(?:[\]\[.,;"\'?():_`-])',    # single punctuation characters
])

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Chunk grammar. Rules inside a stage are applied in order, so the longer
# "NBAR of NBAR" pattern must precede the bare NBAR rule; otherwise every
# NBAR is already wrapped in an NP before the longer pattern is tried.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        {<NBAR>}            # If pattern is not found, just a single NBAR is ok
"""
chunker = nltk.RegexpParser(grammar)

# Tokenize, POS-tag, and chunk the input text.
toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)
print(postoks)
tree = chunker.parse(postoks)

# NOTE: this rebinds the imported `stopwords` module name to the English
# stopword collection; the name is kept because acceptable_word() looks it
# up. A set gives constant-time membership tests.
stopwords = set(stopwords.words('english'))
def leaves(tree):
    """Yield the leaves of every NP (noun phrase) subtree of a chunk tree.

    `tree` must provide ``subtrees(filter=...)``; each subtree must provide
    ``label()`` and ``leaves()``. One ``leaves()`` result is yielded per
    subtree whose label is exactly ``'NP'``.
    """
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()
def normalise(word):
    """Return `word` lower-cased and lemmatized.

    Uses the module-level ``lemmatizer``.
    """
    # Stemming is deliberately not applied: per the original author's note,
    # stemmed output would no longer match the words of the source text.
    return lemmatizer.lemmatize(word.lower())
def acceptable_word(word):
    """Return True if `word` should be kept in an extracted phrase.

    A word is kept when it is 2 to 40 characters long and its lower-cased
    form is not in the module-level ``stopwords`` collection. Raise the
    upper bound to admit longer tokens.
    """
    return 2 <= len(word) <= 40 and word.lower() not in stopwords
def get_terms(tree):
    """Yield one list of normalised words per NP chunk found in `tree`.

    Words rejected by acceptable_word() are dropped, so a yielded list may
    be shorter than its chunk, or empty.
    """
    for leaf in leaves(tree):
        # Each leaf entry is a (word, tag) pair; the tag is not needed here.
        yield [normalise(word) for word, _tag in leaf if acceptable_word(word)]
# Print every extracted word on a single line, separated by spaces.
terms = get_terms(tree)
for term in terms:
    for word in term:
        print(word, end=" ")
print()  # finish the output line
Order in the grammar matters!
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
    NP:
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        {<NBAR>}  # If pattern is not found, just a single NBAR is ok
"""
works fairly well, returning
['postal', 'code', 'new', 'method'] ['delivery']
however, the acceptable_word() check deletes words. Removing the check in get_terms() fixes this issue, returning
['postal', 'code', 'of', 'new', 'method'] ['delivery']
You'd need a different pattern to be detected if you specifically want
[postal code, new method of delivery]
Hello, could you please tell me where I could learn more about this, for example how I can get "postal code" from the statement?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Order in the grammar matters!
works fairly well, returning
however, the acceptable_word() check deletes words. Removing the check in get_terms() fixes this issue, returning
You'd need a different pattern to be detected if you specifically want
[postal code, new method of delivery]