Skip to content

Instantly share code, notes, and snippets.

@natematias
Last active August 29, 2015 14:01
Show Gist options
  • Save natematias/75aab9f81086d8ccc82a to your computer and use it in GitHub Desktop.
Save natematias/75aab9f81086d8ccc82a to your computer and use it in GitHub Desktop.
Tagging parts of speech in NLTK
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

import nltk
# Download the Project Gutenberg plain-text edition of "Alice in Wonderland".
url = "http://www.gutenberg.org/cache/epub/11/pg11.txt"
# closing() guarantees the HTTP response is released even if read() fails,
# and works with the objects returned by both Python 2 and Python 3 urlopen.
with closing(urlopen(url)) as response:
    raw_bytes = response.read()
# read() yields bytes on Python 3, but the tokenizer needs text.
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if present.
# NOTE(review): assumes the file is served as UTF-8 -- confirm.
raw_text = raw_bytes.decode("utf-8-sig")
# Split the text into tokens.
tokens = nltk.word_tokenize(raw_text)
# Count how often each distinct token occurs.
freq_dist = nltk.FreqDist(tokens)
# Uncomment to plot the 50 most frequent tokens:
#freq_dist.plot(50, cumulative=False)
# === simple part of speech tagger ===
# Tag the original-case tokens first, then lower-case each word so that
# later counts treat "Alice" and "alice" as the same entry.
pos_tokens = [(word.lower(), tag) for word, tag in nltk.pos_tag(tokens)]
# To list the available part-of-speech tagsets:
# nltk.help.upenn_tagset()
# nltk.help.brown_tagset()
# Frequency distribution of pronouns: (word, tag) pairs tagged PRP or PRP$.
pronoun_freq_dist = nltk.FreqDist(
    [entry for entry in pos_tokens if entry[1] in ("PRP", "PRP$")]
)
# Uncomment to plot the 50 most frequent pronouns:
#pronoun_freq_dist.plot(50, cumulative=False)
# Frequency distribution of adjectives: (word, tag) pairs tagged JJ, JJR or JJS.
adj_freq_dist = nltk.FreqDist(
    [entry for entry in pos_tokens if entry[1] in ("JJ", "JJR", "JJS")]
)
# Uncomment to plot the 50 most frequent adjectives:
#adj_freq_dist.plot(50, cumulative=False)
# Frequency distribution of adverbs: (word, tag) pairs tagged RB, RBS or RBR.
adv_freq_dist = nltk.FreqDist(
    [entry for entry in pos_tokens if entry[1] in ("RB", "RBS", "RBR")]
)
# Uncomment to plot the 50 most frequent adverbs:
#adv_freq_dist.plot(50, cumulative=False)
# Frequency distribution of verbs: (word, tag) pairs carrying any of the
# six verb tags listed below.
verb_freq_dist = nltk.FreqDist(
    [
        entry
        for entry in pos_tokens
        if entry[1] in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
    ]
)
# Uncomment to plot the 50 most frequent verbs:
#verb_freq_dist.plot(50, cumulative=False)
@ananelson
Copy link

You can also write `if x[1] in ("JJ", "JJR", "JJS")`, or better yet `if x[1] in adj_tokens`.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment