-
-
Save alexbowe/879414 to your computer and use it in GitHub Desktop.
import nltk | |
# Sample passage that the rest of the script tokenizes, tags and chunks.
text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself."""

# Used when tokenizing words.
# Every group is non-capturing so that a findall-style tokenizer returns the
# whole match instead of the group contents, and '-' sits last in the
# character class so it is a literal hyphen rather than a ':'-to-'_' range.
sentence_re = r'''(?x)          # set flag to allow verbose regexps
      [A-Z](?:\.[A-Z])+\.?      # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*              # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                    # ellipsis
    | [][.,;"'?():_`-]          # these are separate tokens
'''

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Taken from Su Nam Kim Paper...
grammar = r"""
NBAR:
{<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
NP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)

# Parenthesised form prints the same list on Python 2 and Python 3.
print(postoks)

tree = chunker.parse(postoks)

from nltk.corpus import stopwords
# A set gives constant-time membership tests in acceptable_word().
# NOTE: this rebinds the imported name `stopwords` to the English word set.
stopwords = set(stopwords.words('english'))
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree.

    Yields, for every subtree of `tree` labelled 'NP', the list returned by
    that subtree's ``leaves()`` method.
    """
    # Tree.node is no longer available in NLTK 3; label() is the accessor.
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it.

    The word is lowercased, passed through the module-level `stemmer`, and
    the stem is then passed through the module-level `lemmatizer`.
    """
    word = word.lower()
    # PorterStemmer.stem_word() is gone from current NLTK; stem() replaces it.
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word
def acceptable_word(word):
    """Tell whether `word` passes the length and stopword checks.

    A word is accepted when it is between 2 and 40 characters long
    (inclusive) and its lowercase form is not in the module-level
    `stopwords` collection.
    """
    length_ok = 2 <= len(word) <= 40
    return length_ok and word.lower() not in stopwords
def get_terms(tree):
    """Yield one list of normalised words per noun phrase found in `tree`.

    Each item produced by ``leaves(tree)`` is a sequence of (word, tag)
    pairs; words rejected by ``acceptable_word`` are dropped and the rest
    are passed through ``normalise``.
    """
    for noun_phrase in leaves(tree):
        tokens = (token for token, _tag in noun_phrase)
        yield [normalise(token) for token in tokens if acceptable_word(token)]
terms = get_terms(tree)

# Print every extracted word on a single space-separated line. The original
# used the Python 2 statement `print word,`, which is a syntax error on
# Python 3; joining first gives the same layout on both versions.
print(" ".join(word for term in terms for word in term))
@Mohan-kr @hash-include did you solve the error you were getting for this problem ??
For the error `AttributeError: 'tuple' object has no attribute 'isdigit'`, see the solution below.
You need to uninstall newer versions of nltk; the code works with the 3.0 versions.
Solution ::
In nltk 3.1, which is now the latest version, the default tagger was changed to the Perceptron tagger. After upgrading, all my nltk.regexp_tokenize calls stopped functioning correctly and all my nltk.pos_tag calls started giving the above error.
The solution that I have currently is to use the previous version, nltk 3.0.1, to keep them working. I am not sure if this is a bug in the current release of nltk.
Installation instruction for nltk 3.0.4 version in ubuntu. From your home directory or any other directory do the following steps.
$ wget https://github.com/nltk/nltk/archive/3.0.4.tar.gz
$ tar -xvzf 3.0.4.tar.gz
$ cd nltk-3.0.4
$ sudo python3.4 setup.py install
you can use the following code to install nltk 3.0.4
pip install https://pypi.python.org/packages/source/n/nltk/nltk-3.0.4.tar.gz
It will automatically uninstall your latest version.
/****************************************************************************/
pip install https://pypi.python.org/packages/source/n/nltk/nltk-3.0.4.tar.gz
Collecting https://pypi.python.org/packages/source/n/nltk/nltk-3.0.4.tar.gz
Downloading nltk-3.0.4.tar.gz (1.0MB)
100% |################################| 1.0MB 562kB/s
Building wheels for collected packages: nltk
Running setup.py bdist_wheel for nltk ... done
Stored in directory: C:\Users\1534038\AppData\Local\pip\Cache\wheels\8a\1e\1e\9f124d9995acdfd40f645da9592cd126f6fbe19b5e54b1c4b4
Successfully built nltk
Installing collected packages: nltk
Found existing installation: nltk 3.2.4
Uninstalling nltk-3.2.4:
Successfully uninstalled nltk-3.2.4
Successfully installed nltk-3.0.4
/**************************************************************************************************/
After this I am able to run the above code
Traceback (most recent call last):
File "nltk-intro.py", line 31, in <module>
toks = nltk.regexp_tokenize(text, sentence_re)
File "/home/user/Desktop/nltk-3.0.4/nltk/tokenize/regexp.py", line 203, in regexp_tokenize
return tokenizer.tokenize(text)
File "/home/user/Desktop/nltk-3.0.4/nltk/tokenize/regexp.py", line 126, in tokenize
self._check_regexp()
File "/home/user/Desktop/nltk-3.0.4/nltk/tokenize/regexp.py", line 121, in _check_regexp
self._regexp = compile_regexp_to_noncapturing(self._pattern, self._flags)
File "/home/user/Desktop/nltk-3.0.4/nltk/internals.py", line 55, in compile_regexp_to_noncapturing
return sre_compile.compile(convert_regexp_to_noncapturing_parsed(sre_parse.parse(pattern)), flags=flags)
File "/home/user/Desktop/nltk-3.0.4/nltk/internals.py", line 51, in convert_regexp_to_noncapturing_parsed
parsed_pattern.pattern.groups = 1
AttributeError: can't set attribute
Error encountered after following,
you can use the following code to install nltk 3.0.4
pip install https://pypi.python.org/packages/source/n/nltk/nltk-3.0.4.tar.gz
It will automatically uninstall your latest version.
/****************************************************************************/
pip install https://pypi.python.org/packages/source/n/nltk/nltk-3.0.4.tar.gz
Collecting https://pypi.python.org/packages/source/n/nltk/nltk-3.0.4.tar.gz
Downloading nltk-3.0.4.tar.gz (1.0MB)
100% |################################| 1.0MB 562kB/s
Building wheels for collected packages: nltk
Running setup.py bdist_wheel for nltk ... done
Stored in directory: C:\Users\1534038\AppData\Local\pip\Cache\wheels\8a\1e\1e\9f124d9995acdfd40f645da9592cd126f6fbe19b5e54b1c4b4
Successfully built nltk
Installing collected packages: nltk
Found existing installation: nltk 3.2.4
Uninstalling nltk-3.2.4:
Successfully uninstalled nltk-3.2.4
Successfully installed nltk-3.0.4
/**************************************************************************************************/
I have made the changes suggested by @anupamchoudhari and @tejasshah93.
I am getting syntax error in the regular expression @anupamchoudhari suggested. I am using python 3.6.3 version. Any help fixing is greatly appreciated as I am a newbie in python and NLTK.
sentence_re = r'(?:(?:[A-Z])(?:.[A-Z])+.?)|(?:\w+(?:-\w+)*)|(?:$?\d+(?:.\d+)?%?)|(?:...|)(?:[][.,;"'?():-_`])'
The following regular expression seems to work in Python 3.x
# Verbose-mode token pattern. Every group is non-capturing, (?:...), so a
# findall-style tokenizer returns the whole match rather than group contents.
# The hyphen is the last character of the final character class, which makes
# it a literal '-' instead of part of a range.
sentence_re = r'''(?x) # set flag to allow verbose regexps
(?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?():_`-] # these are separate tokens; includes ], [
'''
from https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
Plus other fixes -
for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
@jamesballard Thanks! it works for me with Python 3.x
I am getting an error from running the code below:
postoks = nltk.tag.pos_tag(toks)
URLError:
Working for Python 3.6.
- line 44: change `t.node` to `t.label()`
- line 50: change `stemmer.stem_word(word)` to `stemmer.stem(word)`
Full working version:
import nltk
# Sample passage that the rest of the script tokenizes, tags and chunks.
text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself."""

# Used when tokenizing words
sentence_re = r'''(?x) # set flag to allow verbose regexps
(?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?():_`-] # these are separate tokens; includes ], [
'''

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

#Taken from Su Nam Kim Paper...
grammar = r"""
NBAR:
{<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
NP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

# Tokenize, POS-tag, then chunk the tagged tokens into the NBAR/NP grammar.
toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)
print(postoks)
tree = chunker.parse(postoks)

from nltk.corpus import stopwords
# A set gives constant-time membership tests in acceptable_word(), instead of
# scanning the whole stopword list for every word.
# NOTE: this rebinds the imported name `stopwords` to the English word set.
stopwords = set(stopwords.words('english'))
def leaves(tree):
    """Generate the leaves of every 'NP' (noun phrase) subtree of a chunk tree.

    For each subtree of `tree` whose ``label()`` equals 'NP', yields the
    value of that subtree's ``leaves()`` call.
    """
    def is_noun_phrase(node):
        return node.label() == 'NP'

    for np_subtree in tree.subtrees(filter=is_noun_phrase):
        yield np_subtree.leaves()
def normalise(word):
    """Return `word` lowercased, then stemmed, then lemmatized.

    Uses the module-level `stemmer` and `lemmatizer` objects.
    """
    stemmed = stemmer.stem(word.lower())
    return lemmatizer.lemmatize(stemmed)
def acceptable_word(word):
    """Check whether `word` is acceptable by length and stopword status.

    Returns True only when the word has between 2 and 40 characters
    (inclusive) and its lowercase form is absent from the module-level
    `stopwords` collection.
    """
    # Guard clause: reject on length before touching the stopword list.
    if not 2 <= len(word) <= 40:
        return False
    return word.lower() not in stopwords
def get_terms(tree):
    """Yield, per noun phrase in `tree`, the list of its normalised words.

    Each item from ``leaves(tree)`` is unpacked as (word, tag) pairs. Words
    failing ``acceptable_word`` are skipped; the others are run through
    ``normalise`` and collected in their original order.
    """
    for leaf in leaves(tree):
        kept = []
        for token, _tag in leaf:
            if acceptable_word(token):
                kept.append(normalise(token))
        yield kept
# Walk the generator of terms: each word of a term goes on its own line,
# followed by the complete term list.
# NOTE(review): the source lost its indentation, so placing print(term) inside
# the outer loop (after the inner loop) is an assumption - confirm.
terms = get_terms(tree)
for term in terms:
    for word in term:
        print(word)
    print(term)
Working for Python 3.6.
- line 44: change `t.node` to `t.label()`
- line 50: change `stemmer.stem_word(word)` to `stemmer.stem(word)`
Full working version:
import nltk

# Sample passage that the rest of the script tokenizes, tags and chunks.
text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself."""

# Used when tokenizing words
sentence_re = r'''(?x)      # set flag to allow verbose regexps
      (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*          # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                # ellipsis
    | [][.,;"'?():_`-]      # these are separate tokens; includes ], [
'''

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# Taken from Su Nam Kim Paper...
grammar = r"""
NBAR:
{<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
NP:
{<NBAR>}
{<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)

toks = nltk.regexp_tokenize(text, sentence_re)
postoks = nltk.tag.pos_tag(toks)
print(postoks)

tree = chunker.parse(postoks)

from nltk.corpus import stopwords
# NOTE: this rebinds the imported name `stopwords` to the English word list.
stopwords = stopwords.words('english')


def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()


def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word


def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
                    and word.lower() not in stopwords)
    return accepted


def get_terms(tree):
    """Yields, per noun phrase, the list of its acceptable words, normalised."""
    for leaf in leaves(tree):
        term = [normalise(w) for w, t in leaf if acceptable_word(w)]
        yield term


terms = get_terms(tree)

# NOTE(review): the one-line source carries no indentation, so placing
# print(term) inside the outer loop is an assumption - confirm.
for term in terms:
    for word in term:
        print(word)
    print(term)
thank you
Thank you @Rich2020, worked for me :)
Why not use NBAR: {<NN*|JJ><NN>}? Why are those dots there?