Last active
January 4, 2016 12:29
-
-
Save gerbal/8621599 to your computer and use it in GitHub Desktop.
For NLP Presentation about NLTK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate sentences from a toy context-free grammar (NLP/NLTK demo).
# NOTE(review): updated for NLTK 3 / Python 3 -- the original used
# nltk.grammar.parse_cfg and nltk.parse.generate2, both removed in NLTK 3.
from nltk.grammar import CFG
from nltk.parse.generate import generate

grammar2 = CFG.fromstring("""
S -> NP VP
NP -> Det Nom | PropN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
PropN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
""")

# The original call was generate2.generate(grammar2, None, 5): start=None,
# third positional argument 5 -- presumably a depth limit; verify against
# the generate2 signature if exact parity matters.
for sentence in generate(grammar2, depth=5):
    print(' '.join(sentence))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Naive Bayes sentiment classification on the NLTK movie_reviews corpus.
# from http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from pprint import pprint


def word_feats(words):
    """Bag-of-words feature dict: every word maps to True."""
    return {word: True for word in words}


negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# 3/4 train, 1/4 test split.  Floor division is required here: the original
# "len(...)*3/4" yields a float on Python 3, and floats are not valid
# slice indices.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
train on 1500 instances, test on 500 instances | |
('accuracy:', 0.728) | |
Most Informative Features | |
magnificent = True pos : neg = 15.0 : 1.0 | |
outstanding = True pos : neg = 13.6 : 1.0 | |
insulting = True neg : pos = 13.0 : 1.0 | |
vulnerable = True pos : neg = 12.3 : 1.0 | |
ludicrous = True neg : pos = 11.8 : 1.0 | |
avoids = True pos : neg = 11.7 : 1.0 | |
uninvolving = True neg : pos = 11.7 : 1.0 | |
astounding = True pos : neg = 10.3 : 1.0 | |
fascination = True pos : neg = 10.3 : 1.0 | |
idiotic = True neg : pos = 9.8 : 1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/ | |
import collections, itertools | |
import nltk.classify.util, nltk.metrics | |
from nltk.classify import NaiveBayesClassifier | |
from nltk.corpus import movie_reviews, stopwords | |
from nltk.collocations import BigramCollocationFinder | |
from nltk.metrics import BigramAssocMeasures | |
from nltk.probability import FreqDist, ConditionalFreqDist | |
def evaluate_classifier(featx):
    """Train and evaluate a Naive Bayes classifier on movie_reviews.

    featx: callable mapping a sequence of words to a feature dict.
    Trains on 3/4 of each class, tests on the remaining 1/4, then prints
    accuracy, per-class precision/recall, and the most informative features.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division: "*3/4" is a float on Python 3 and breaks slicing.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # Reference label -> test indices, and predicted label -> test indices,
    # for precision/recall computation.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
    print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
    print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
    print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()
def word_feats(words):
    """Bag-of-words features: map every word to True."""
    return {word: True for word in words}
print('evaluating single word features')
evaluate_classifier(word_feats)

# Count word frequencies overall and per class.  FreqDist.inc() was removed
# in NLTK 3 -- FreqDist is a Counter subclass, so use item assignment.
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in movie_reviews.words(categories=['pos']):
    word_fd[word.lower()] += 1
    label_word_fd['pos'][word.lower()] += 1

for word in movie_reviews.words(categories=['neg']):
    word_fd[word.lower()] += 1
    label_word_fd['neg'][word.lower()] += 1

# Chi-squared contingency counts per word:
# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

# Score each word by how strongly it associates with either class.
word_scores = {}
for word, freq in word_fd.items():  # iteritems() is Python 2 only
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

# Keep the 10,000 highest-scoring words.  The original tuple-unpacking
# lambda "lambda (w, s): s" is a syntax error on Python 3.
best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
bestwords = set(w for w, s in best)
def best_word_feats(words):
    """Feature dict restricted to the precomputed high-information words."""
    return {word: True for word in words if word in bestwords}
# Re-run the evaluation with only high-information word features.
print('evaluating best word features')
evaluate_classifier(best_word_feats)
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Best-word features plus the top-n bigram collocations of *words*.

    score_fn: bigram association measure used to rank collocations.
    n: number of top-ranked bigrams to include as features.
    """
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = {bigram: True for bigram in bigrams}
    d.update(best_word_feats(words))
    return d
# Final evaluation: high-information words plus chi-squared bigram features.
print('evaluating best words + bigram chi_sq word features')
evaluate_classifier(best_bigram_word_feats)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
evaluating single word features | |
accuracy: 0.728 | |
pos precision: 0.651595744681 | |
pos recall: 0.98 | |
neg precision: 0.959677419355 | |
neg recall: 0.476 | |
Most Informative Features | |
magnificent = True pos : neg = 15.0 : 1.0 | |
outstanding = True pos : neg = 13.6 : 1.0 | |
insulting = True neg : pos = 13.0 : 1.0 | |
vulnerable = True pos : neg = 12.3 : 1.0 | |
ludicrous = True neg : pos = 11.8 : 1.0 | |
avoids = True pos : neg = 11.7 : 1.0 | |
uninvolving = True neg : pos = 11.7 : 1.0 | |
astounding = True pos : neg = 10.3 : 1.0 | |
fascination = True pos : neg = 10.3 : 1.0 | |
idiotic = True neg : pos = 9.8 : 1.0 | |
evaluating best word features | |
accuracy: 0.93 | |
pos precision: 0.890909090909 | |
pos recall: 0.98 | |
neg precision: 0.977777777778 | |
neg recall: 0.88 | |
Most Informative Features | |
magnificent = True pos : neg = 15.0 : 1.0 | |
outstanding = True pos : neg = 13.6 : 1.0 | |
insulting = True neg : pos = 13.0 : 1.0 | |
vulnerable = True pos : neg = 12.3 : 1.0 | |
ludicrous = True neg : pos = 11.8 : 1.0 | |
avoids = True pos : neg = 11.7 : 1.0 | |
uninvolving = True neg : pos = 11.7 : 1.0 | |
fascination = True pos : neg = 10.3 : 1.0 | |
astounding = True pos : neg = 10.3 : 1.0 | |
idiotic = True neg : pos = 9.8 : 1.0 | |
evaluating best words + bigram chi_sq word features | |
accuracy: 0.922 | |
pos precision: 0.916996047431 | |
pos recall: 0.928 | |
neg precision: 0.927125506073 | |
neg recall: 0.916 | |
Most Informative Features | |
magnificent = True pos : neg = 15.0 : 1.0 | |
outstanding = True pos : neg = 13.6 : 1.0 | |
insulting = True neg : pos = 13.0 : 1.0 | |
vulnerable = True pos : neg = 12.3 : 1.0 | |
('matt', 'damon') = True pos : neg = 12.3 : 1.0 | |
('give', 'us') = True neg : pos = 12.3 : 1.0 | |
ludicrous = True neg : pos = 11.8 : 1.0 | |
uninvolving = True neg : pos = 11.7 : 1.0 | |
avoids = True pos : neg = 11.7 : 1.0 | |
('absolutely', 'no') = True neg : pos = 10.6 : 1.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment