Created
August 27, 2017 14:04
-
-
Save victor-iyi/725733d4e1bd8d9c4405ae571320fd53 to your computer and use it in GitHub Desktop.
Guide on using part of speech tagging with NLTK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk.corpus import state_union | |
from nltk.tokenize import PunktSentenceTokenizer | |
''' | |
POS tag list: | |
CC coordinating conjunction | |
CD cardinal digit | |
DT determiner | |
EX existential there (like: "there is" ... think of it like "there exists") | |
FW foreign word | |
IN preposition/subordinating conjunction | |
JJ adjective 'big' | |
JJR adjective, comparative 'bigger' | |
JJS adjective, superlative 'biggest' | |
LS list marker 1) | |
MD modal could, will | |
NN noun, singular 'desk' | |
NNS noun plural 'desks' | |
NNP proper noun, singular 'Harrison' | |
NNPS proper noun, plural 'Americans' | |
PDT predeterminer 'all the kids' | |
POS possessive ending parent's | |
PRP personal pronoun I, he, she | |
PRP$ possessive pronoun my, his, hers | |
RB adverb very, silently, | |
RBR adverb, comparative better | |
RBS adverb, superlative best | |
RP particle give up | |
TO to go 'to' the store. | |
UH interjection errrrrrrrm | |
VB verb, base form take | |
VBD verb, past tense took | |
VBG verb, gerund/present participle taking | |
VBN verb, past participle taken | |
VBP verb, sing. present, non-3rd person take | |
VBZ verb, 3rd person sing. present takes | |
WDT wh-determiner which | |
WP wh-pronoun who, what | |
WP$ possessive wh-pronoun whose | |
WRB wh-adverb where, when | |
''' | |
# Build a Punkt sentence tokenizer from the 2005 State of the Union text.
train_text = state_union.raw('2005-GWBush.txt')
tokenizer = PunktSentenceTokenizer(train_text)

# Split the 2006 address into sentences with that tokenizer.
sample_text = state_union.raw('2006-GWBush.txt')
sent_tokens = tokenizer.tokenize(sample_text)

# Only the first ten sentences are tagged, which keeps the output short.
for sent in sent_tokens[:10]:
    tokens = nltk.word_tokenize(sent)
    pos = nltk.pos_tag(tokens)
    # NOTE(review): assumes the blank line is emitted after every sentence
    # (i.e. the bare print() belonged to the loop body) -- confirm.
    print(pos, end='\n\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment