natematias · August 29, 2015 14:01 · ananelson · May 19, 2014
diff --git a/Part of Speech Tagging b/Part of Speech Tagging
 import nltk
 from urllib import urlopen

 # Download the Project Gutenberg plain-text edition of Alice in Wonderland.
 url = "http://www.gutenberg.org/cache/epub/11/pg11.txt"
 response = urlopen(url)
 try:
     raw_text = response.read()
 finally:
     # Always release the HTTP connection, even if the read fails part-way.
     response.close()

 # Split the text into tokens and count how often each token occurs.
 tokens = nltk.word_tokenize(raw_text)
 freq_dist = nltk.FreqDist(tokens)
 #freq_dist.plot(50, cumulative=False)

 # === simple part of speech tagger ===
 # Tag every token, lower-casing the word (but not the tag) so that
 # capitalised and lower-case spellings of a word are counted together.
 pos_tokens = [(word.lower(), tag) for word, tag in nltk.pos_tag(tokens)]

 # to list part of speech tagsets
 # nltk.help.upenn_tagset()
 # nltk.help.brown_tagset()

 # Tag groups selected by the filters below (see nltk.help.upenn_tagset()
 # for the meaning of each tag).
 pronoun_tags = ("PRP", "PRP$")
 adjective_tags = ("JJ", "JJR", "JJS")
 adverb_tags = ("RB", "RBS", "RBR")
 verb_tags = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")

 # Each distribution below counts (lower-cased word, tag) pairs, so the
 # same word carrying two different tags is counted separately.

 # frequency distribution of pronouns (PRP, PRP$)
 pronoun_freq_dist = nltk.FreqDist(
     [pair for pair in pos_tokens if pair[1] in pronoun_tags])
 #pronoun_freq_dist.plot(50, cumulative=False)

 # frequency distribution of adjectives (JJ, JJR, JJS)
 adj_freq_dist = nltk.FreqDist(
     [pair for pair in pos_tokens if pair[1] in adjective_tags])
 #adj_freq_dist.plot(50, cumulative=False)

 # frequency distribution of adverbs (RB, RBS, RBR)
 adv_freq_dist = nltk.FreqDist(
     [pair for pair in pos_tokens if pair[1] in adverb_tags])
 #adv_freq_dist.plot(50, cumulative=False)

 # frequency distribution of verbs (VB, VBD, VBG, VBN, VBP, VBZ)
 verb_freq_dist = nltk.FreqDist(
     [pair for pair in pos_tokens if pair[1] in verb_tags])
 #verb_freq_dist.plot(50, cumulative=False)
	import nltk
	from urllib import urlopen

	# Download the Project Gutenberg plain-text edition of Alice in Wonderland.
	url = "http://www.gutenberg.org/cache/epub/11/pg11.txt"
	response = urlopen(url)
	try:
		raw_text = response.read()
	finally:
		# Always release the HTTP connection, even if the read fails part-way.
		response.close()

	# Split the text into tokens and count how often each token occurs.
	tokens = nltk.word_tokenize(raw_text)
	freq_dist = nltk.FreqDist(tokens)
	#freq_dist.plot(50, cumulative=False)

	# === simple part of speech tagger ===
	# Tag every token, lower-casing the word (but not the tag) so that
	# capitalised and lower-case spellings of a word are counted together.
	pos_tokens = [(word.lower(), tag) for word, tag in nltk.pos_tag(tokens)]

	# to list part of speech tagsets
	# nltk.help.upenn_tagset()
	# nltk.help.brown_tagset()

	# Tag groups selected by the filters below (see nltk.help.upenn_tagset()
	# for the meaning of each tag).
	pronoun_tags = ("PRP", "PRP$")
	adjective_tags = ("JJ", "JJR", "JJS")
	adverb_tags = ("RB", "RBS", "RBR")
	verb_tags = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")

	# Each distribution below counts (lower-cased word, tag) pairs, so the
	# same word carrying two different tags is counted separately.

	# frequency distribution of pronouns (PRP, PRP$)
	pronoun_freq_dist = nltk.FreqDist(
		[pair for pair in pos_tokens if pair[1] in pronoun_tags])
	#pronoun_freq_dist.plot(50, cumulative=False)

	# frequency distribution of adjectives (JJ, JJR, JJS)
	adj_freq_dist = nltk.FreqDist(
		[pair for pair in pos_tokens if pair[1] in adjective_tags])
	#adj_freq_dist.plot(50, cumulative=False)

	# frequency distribution of adverbs (RB, RBS, RBR)
	adv_freq_dist = nltk.FreqDist(
		[pair for pair in pos_tokens if pair[1] in adverb_tags])
	#adv_freq_dist.plot(50, cumulative=False)

	# frequency distribution of verbs (VB, VBD, VBG, VBN, VBP, VBZ)
	verb_freq_dist = nltk.FreqDist(
		[pair for pair in pos_tokens if pair[1] in verb_tags])
	#verb_freq_dist.plot(50, cumulative=False)