Created
March 21, 2011 12:59
-
-
Save alexbowe/879414 to your computer and use it in GitHub Desktop.
Demonstration of extracting key phrases with NLTK in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
# Sample passage that the rest of the script tokenizes, tags and chunks.
text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself."""

# Used when tokenizing words.
# All groups are non-capturing: a findall-based tokenizer returns the group
# contents (tuples) instead of the whole token when capturing groups exist.
# The hyphen sits last in the final character class so it is a literal "-"
# rather than forming the accidental range ":" to "_".
sentence_re = r'''(?x)          # set flag to allow verbose regexps
      [A-Z](?:\.[A-Z])+\.?      # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*              # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                    # ellipsis
    | [\]\[.,;"'?():_`-]        # these are separate tokens
'''
# Shared normalisation helpers used by normalise().
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Taken from Su Nam Kim Paper...
# NOTE(review): the two NP rules are listed in this order in the original;
# if the chunker applies them sequentially, the bare <NBAR> rule may consume
# every NBAR before the <NBAR><IN><NBAR> rule can match - confirm.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

# Tokenize the sample text, tag each token with its part of speech, and
# chunk the tagged tokens into a tree according to the grammar above.
toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)

# Parenthesised single-argument form prints the same on Python 2 and 3.
print(postoks)

tree = chunker.parse(postoks)

from nltk.corpus import stopwords
# Rebinds the name from the corpus reader to the English stopword collection;
# a set gives constant-time membership tests in acceptable_word().
stopwords = set(stopwords.words('english'))
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree.

    Yields the result of ``subtree.leaves()`` for every subtree of *tree*
    whose label is ``'NP'``.
    """
    def is_noun_phrase(subtree):
        # NLTK 3 exposes the label through label(); older releases stored
        # it in the .node attribute, so fall back to that when needed.
        label = getattr(subtree, 'label', None)
        name = label() if callable(label) else subtree.node
        return name == 'NP'

    for subtree in tree.subtrees(filter=is_noun_phrase):
        yield subtree.leaves()
def normalise(word):
    """Normalises a word: lowercases it, then stems and lemmatizes it.

    Uses the module-level ``stemmer`` and ``lemmatizer`` objects and returns
    the transformed word.
    """
    word = word.lower()
    # stem() is the public stemming entry point; the older stem_word()
    # helper is not available on current NLTK stemmers.
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word
def acceptable_word(word):
    """Tell whether a word passes the length and stopword checks.

    A word is accepted when it is between 2 and 40 characters long
    (inclusive) and its lowercase form is not in ``stopwords``.
    """
    length = len(word)
    if length < 2 or length > 40:
        return False
    return word.lower() not in stopwords
def get_terms(tree):
    """Yield one list of normalised words per NP chunk found in *tree*.

    Words rejected by acceptable_word() are dropped; the remaining words
    are passed through normalise() in their original order.
    """
    for tagged_words in leaves(tree):
        kept = []
        for token, _tag in tagged_words:
            if acceptable_word(token):
                kept.append(normalise(token))
        yield kept
# Print every normalised key-phrase word on a single space-separated line.
# Joining first and printing once keeps the output format of the original
# trailing-comma print statement while remaining valid on Python 2 and 3.
terms = get_terms(tree)
print(' '.join(word for term in terms for word in term))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thank you