Skip to content

Instantly share code, notes, and snippets.

@balachandrapai
Created March 11, 2018 18:21
Show Gist options
  • Save balachandrapai/dbb16464d502572347cd131af7e27969 to your computer and use it in GitHub Desktop.
Save balachandrapai/dbb16464d502572347cd131af7e27969 to your computer and use it in GitHub Desktop.
Extracting feature sets, Classification using NaiveBayes, Pickle basics
##This example does binary text classification: every movie review is labelled
##either positive or negative (the same idea as spam / not spam filtering).
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
##Build one (word_list, category) tuple per review in the corpus
documents = []
for label in movie_reviews.categories():
    documents.extend(
        (list(movie_reviews.words(fileid)), label)
        for fileid in movie_reviews.fileids(label)
    )
##Shuffle in place so the reviews are no longer grouped by category
random.shuffle(documents)
##print(documents[1])
##Count how often each lower-cased word occurs in the whole corpus
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
##print(all_words.most_common(15))
##print(all_words["stupid"])
##Use the 3,000 most frequent words as features. most_common() sorts by count;
##slicing list(all_words.keys()) does not, because the keys are not ordered by
##frequency, so it would pick words that are not the most common ones.
##NOTE: a classifier that was pickled with a different word list has to be
##retrained, otherwise its features will not line up with these.
word_features = [word for word, _count in all_words.most_common(3000)]
##Build the feature dict for one document: every word in word_features becomes
##a key whose value is True when the word occurs in the document, else False
def find_features(document):
    """Map each entry of word_features to whether it occurs in *document*.

    document: an iterable of hashable tokens (e.g. the words of one review).
    Returns a dict with one key per entry of word_features and bool values.
    """
    present = set(document)  # set gives constant-time membership tests
    return {feature: (feature in present) for feature in word_features}
##A word marked True is simply present in the given review (the example below uses a negative review)
##print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
##Turn every document into a (feature_dict, category) pair: the feature dict
##holds the word-presence booleans, the category is the review's label
featuresets = [(find_features(words), label) for (words, label) in documents]
##The first 1900 pairs are used for training, everything after them for testing
training_set, testing_set = featuresets[:1900], featuresets[1900:]
##Reuse the pickled classifier when naivebayes.pickle exists; otherwise train a
##new Naive Bayes classifier on training_set and pickle it for later runs.
##NOTE: unpickling can execute arbitrary code, so only load a pickle file that
##you created yourself.
##NOTE(review): documents are shuffled before the split, so unless the shuffle
##is seeded a classifier loaded from disk has probably been trained on some of
##the reviews now in testing_set, which makes the accuracy below optimistic.
##Delete naivebayes.pickle to retrain and get a fair figure.
try:
    with open("naivebayes.pickle", "rb") as classifier_f:
        classifier = pickle.load(classifier_f)
except FileNotFoundError:
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    ##Saving the classifier as a pickle object
    with open("naivebayes.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)

print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment