alexbowe/879414 · Created March 21, 2011 12:59
Demonstration of extracting key phrases with NLTK in Python
# NOTE: this original version targets Python 2 and an older NLTK API
# (print statements, t.node, stemmer.stem_word); see the comments below
# for the Python 3 fixes.
import nltk

text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself."""

# Used when tokenizing words
sentence_re = r'''(?x)        # set flag to allow verbose regexps
      ([A-Z])(\.[A-Z])+\.?    # abbreviations, e.g. U.S.A.
    | \w+(-\w+)*              # words with optional internal hyphens
    | \$?\d+(\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                  # ellipsis
    | [][.,;"'?():-_`]        # these are separate tokens
'''

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Taken from Su Nam Kim Paper...
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}   # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}   # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)

print postoks

tree = chunker.parse(postoks)

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter=lambda t: t.node == 'NP'):
        yield subtree.leaves()

def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem_word(word)
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
                    and word.lower() not in stopwords)
    return accepted

def get_terms(tree):
    for leaf in leaves(tree):
        term = [normalise(w) for w, t in leaf if acceptable_word(w)]
        yield term

terms = get_terms(tree)

for term in terms:
    for word in term:
        print word,
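For readers new to chunking, here is a minimal standalone sketch (not part of the gist; the tagged tokens are hand-made, so no tagger model is needed) of how the NBAR/NP grammar groups part-of-speech-tagged tokens. RegexpParser applies the rules of a stage in order, so the sketch lists the longer NBAR-IN-NBAR rule first to give it a chance to match:

import nltk

# Hand-tagged tokens (Penn Treebank tags) standing in for pos_tag output.
tagged = [('petals', 'NNS'), ('of', 'IN'), ('flower', 'NN')]

grammar = r"""
    NBAR: {<NN.*|JJ>*<NN.*>}     # nouns and adjectives, terminated with nouns
    NP:   {<NBAR><IN><NBAR>}     # two NBARs joined by a preposition
          {<NBAR>}               # or a bare NBAR
"""
print(nltk.RegexpParser(grammar).parse(tagged))
# Prints a tree shaped roughly like:
# (S (NP (NBAR petals/NNS) of/IN (NBAR flower/NN)))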
@jamesballard Thanks! It works for me with Python 3.x.
I am getting an error when running the line below:
postoks = nltk.tag.pos_tag(toks)
URLError:
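A URLError at the pos_tag step usually means NLTK is trying, and failing, to fetch a data package it needs. Besides installing NLTK itself (pip install nltk), the script relies on a few such packages; a one-time download along these lines (resource names as used by recent NLTK releases) normally clears it:

import nltk

# One-time setup: fetch the data packages the script depends on.
nltk.download('averaged_perceptron_tagger')  # model used by nltk.tag.pos_tag
nltk.download('wordnet')                     # data for WordNetLemmatizer
nltk.download('stopwords')                   # lists for nltk.corpus.stopwords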
Working for Python 3.6.

The following regular expression seems to work in Python 3.x (from https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer), plus other fixes:

- line 44: change t.node to t.label()
- line 50: change stemmer.stem_word(word) to stemmer.stem(word)

Full working version:
import nltk

text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself."""

# Used when tokenizing words
sentence_re = r'''(?x)          # set flag to allow verbose regexps
      (?:[A-Z]\.)+              # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*              # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                    # ellipsis
    | [][.,;"'?():_`-]          # these are separate tokens; includes ], [
'''

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Taken from Su Nam Kim Paper...
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}   # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}   # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)

print(postoks)

tree = chunker.parse(postoks)

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()

def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
                    and word.lower() not in stopwords)
    return accepted

def get_terms(tree):
    for leaf in leaves(tree):
        term = [normalise(w) for w, t in leaf if acceptable_word(w)]
        yield term

terms = get_terms(tree)

for term in terms:
    for word in term:
        print(word)
    print(term)
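One usage note, not part of the original comment: the loop above prints each normalized word on its own line and then the raw term list. If you would rather see each key phrase as a single string, join the words instead (a small variation using the same get_terms generator defined above):

for term in get_terms(tree):
    if term:                   # the acceptable_word filter can leave a term empty
        print(' '.join(term))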
thank you
Thank you @Rich2020, worked for me :)