Last active
January 4, 2016 12:29
-
-
Save gerbal/8621599 to your computer and use it in GitHub Desktop.
For NLP Presentation about NLTK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate sentences from a toy context-free grammar (NLP/NLTK demo).
# NOTE(review): updated for NLTK 3 / Python 3 -- the original used
# nltk.grammar.parse_cfg and nltk.parse.generate2, both removed in NLTK 3.
from nltk.grammar import CFG
from nltk.parse.generate import generate

grammar2 = CFG.fromstring("""
S -> NP VP
NP -> Det Nom | PropN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
PropN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
""")

# The original call was generate2.generate(grammar2, None, 5): start=None,
# third positional argument 5 -- presumably a depth limit; verify against
# the generate2 signature if exact parity matters.
for sentence in generate(grammar2, depth=5):
    print(' '.join(sentence))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Naive Bayes sentiment classification on the NLTK movie_reviews corpus.
# from http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from pprint import pprint


def word_feats(words):
    """Bag-of-words feature dict: every word maps to True."""
    return {word: True for word in words}


negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# 3/4 train, 1/4 test split.  Floor division is required here: the original
# "len(...)*3/4" yields a float on Python 3, and floats are not valid
# slice indices.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
train on 1500 instances, test on 500 instances | |
('accuracy:', 0.728) | |
Most Informative Features | |
magnificent = True pos : neg = 15.0 : 1.0 | |
outstanding = True pos : neg = 13.6 : 1.0 | |
insulting = True neg : pos = 13.0 : 1.0 | |
vulnerable = True pos : neg = 12.3 : 1.0 | |
ludicrous = True neg : pos = 11.8 : 1.0 | |
avoids = True pos : neg = 11.7 : 1.0 | |
uninvolving = True neg : pos = 11.7 : 1.0 | |
astounding = True pos : neg = 10.3 : 1.0 | |
fascination = True pos : neg = 10.3 : 1.0 | |
idiotic = True neg : pos = 9.8 : 1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/ | |
import collections, itertools | |
import nltk.classify.util, nltk.metrics | |
from nltk.classify import NaiveBayesClassifier | |
from nltk.corpus import movie_reviews, stopwords | |
from nltk.collocations import BigramCollocationFinder | |
from nltk.metrics import BigramAssocMeasures | |
from nltk.probability import FreqDist, ConditionalFreqDist | |
def evaluate_classifier(featx):
    """Train and evaluate a Naive Bayes classifier on movie_reviews.

    featx: callable mapping a sequence of words to a feature dict.
    Trains on 3/4 of each class, tests on the remaining 1/4, then prints
    accuracy, per-class precision/recall, and the most informative features.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division: "*3/4" is a float on Python 3 and breaks slicing.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # Reference label -> test indices, and predicted label -> test indices,
    # for precision/recall computation.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
    print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
    print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
    print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()
def word_feats(words):
    """Bag-of-words features: map every word to True."""
    return {word: True for word in words}
print('evaluating single word features')
evaluate_classifier(word_feats)

# Count word frequencies overall and per class.  FreqDist.inc() was removed
# in NLTK 3 -- FreqDist is a Counter subclass, so use item assignment.
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in movie_reviews.words(categories=['pos']):
    word_fd[word.lower()] += 1
    label_word_fd['pos'][word.lower()] += 1

for word in movie_reviews.words(categories=['neg']):
    word_fd[word.lower()] += 1
    label_word_fd['neg'][word.lower()] += 1

# Chi-squared contingency counts per word:
# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

# Score each word by how strongly it associates with either class.
word_scores = {}
for word, freq in word_fd.items():  # iteritems() is Python 2 only
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

# Keep the 10,000 highest-scoring words.  The original tuple-unpacking
# lambda "lambda (w, s): s" is a syntax error on Python 3.
best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
bestwords = set(w for w, s in best)
def best_word_feats(words):
    """Feature dict restricted to the precomputed high-information words."""
    return {word: True for word in words if word in bestwords}
# Re-run the evaluation with only high-information word features.
print('evaluating best word features')
evaluate_classifier(best_word_feats)
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Best-word features plus the top-n bigram collocations of *words*.

    score_fn: bigram association measure used to rank collocations.
    n: number of top-ranked bigrams to include as features.
    """
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = {bigram: True for bigram in bigrams}
    d.update(best_word_feats(words))
    return d
# Final evaluation: high-information words plus chi-squared bigram features.
print('evaluating best words + bigram chi_sq word features')
evaluate_classifier(best_bigram_word_feats)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
evaluating single word features | |
accuracy: 0.728 | |
pos precision: 0.651595744681 | |
pos recall: 0.98 | |
neg precision: 0.959677419355 | |
neg recall: 0.476 | |
Most Informative Features | |
magnificent = True pos : neg = 15.0 : 1.0 | |
outstanding = True pos : neg = 13.6 : 1.0 | |
insulting = True neg : pos = 13.0 : 1.0 | |
vulnerable = True pos : neg = 12.3 : 1.0 | |
ludicrous = True neg : pos = 11.8 : 1.0 | |
avoids = True pos : neg = 11.7 : 1.0 | |
uninvolving = True neg : pos = 11.7 : 1.0 | |
astounding = True pos : neg = 10.3 : 1.0 | |
fascination = True pos : neg = 10.3 : 1.0 | |
idiotic = True neg : pos = 9.8 : 1.0 | |
evaluating best word features | |
accuracy: 0.93 | |
pos precision: 0.890909090909 | |
pos recall: 0.98 | |
neg precision: 0.977777777778 | |
neg recall: 0.88 | |
Most Informative Features | |
magnificent = True pos : neg = 15.0 : 1.0 | |
outstanding = True pos : neg = 13.6 : 1.0 | |
insulting = True neg : pos = 13.0 : 1.0 | |
vulnerable = True pos : neg = 12.3 : 1.0 | |
ludicrous = True neg : pos = 11.8 : 1.0 | |
avoids = True pos : neg = 11.7 : 1.0 | |
uninvolving = True neg : pos = 11.7 : 1.0 | |
fascination = True pos : neg = 10.3 : 1.0 | |
astounding = True pos : neg = 10.3 : 1.0 | |
idiotic = True neg : pos = 9.8 : 1.0 | |
evaluating best words + bigram chi_sq word features | |
accuracy: 0.922 | |
pos precision: 0.916996047431 | |
pos recall: 0.928 | |
neg precision: 0.927125506073 | |
neg recall: 0.916 | |
Most Informative Features | |
magnificent = True pos : neg = 15.0 : 1.0 | |
outstanding = True pos : neg = 13.6 : 1.0 | |
insulting = True neg : pos = 13.0 : 1.0 | |
vulnerable = True pos : neg = 12.3 : 1.0 | |
('matt', 'damon') = True pos : neg = 12.3 : 1.0 | |
('give', 'us') = True neg : pos = 12.3 : 1.0 | |
ludicrous = True neg : pos = 11.8 : 1.0 | |
uninvolving = True neg : pos = 11.7 : 1.0 | |
avoids = True pos : neg = 11.7 : 1.0 | |
('absolutely', 'no') = True neg : pos = 10.6 : 1.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment