Created
March 11, 2018 18:21
-
-
Save balachandrapai/dbb16464d502572347cd131af7e27969 to your computer and use it in GitHub Desktop.
Extracting feature sets, Classification using NaiveBayes, Pickle basics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##Text classification is treated here as a binary task,
##i.e. each text gets one of two labels (spam / not spam, or positive / negative review)
import nltk | |
import random | |
from nltk.corpus import movie_reviews | |
import pickle | |
## Build one (word_list, label) tuple per review in the corpus
documents = []
for label in movie_reviews.categories():
    for file_id in movie_reviews.fileids(label):
        review_words = list(movie_reviews.words(file_id))
        documents.append((review_words, label))

## Shuffle in place so the later train/test slices mix both categories
random.shuffle(documents)
##print(documents[1]) | |
##Count how often every lower-cased token occurs across the whole corpus
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

##Keep the 3,000 most frequent words as the feature vocabulary.
##most_common() orders by count; slicing list(all_words.keys()) does not sort
##by frequency, so it would not give the "most common" words.
##NOTE: a classifier pickled with a differently built vocabulary should be retrained.
word_features = [word for word, _count in all_words.most_common(3000)]
##For every feature word, record its presence in the document as True or False
def find_features(document, vocabulary=None):
    """Map each vocabulary word to whether it occurs in *document*.

    Args:
        document: Iterable of word tokens making up one document.
        vocabulary: Iterable of words to use as feature names. When omitted,
            the module-level ``word_features`` list is used.

    Returns:
        A dict with one key per vocabulary word; the value is ``True`` if the
        word appears in *document* and ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # Build the set once so every membership test below is a hash lookup.
    present = set(document)
    return {word: (word in present) for word in vocabulary}
##A word whose value is True occurs in the given review (the example below is a negative review)
##print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) | |
##Turn every document into (feature dict, label): the dict holds the
##presence booleans, the label is the review's category
featuresets = [
    (find_features(review_words), label)
    for review_words, label in documents
]

# the first 1900 examples train the classifier; the remainder is held out for testing
training_set, testing_set = featuresets[:1900], featuresets[1900:]
##Reuse a previously saved classifier when one exists; otherwise train a
##Naive Bayes classifier on training_set and save it for the next run.
##NOTE: unpickling can execute arbitrary code -- only load pickle files you
##created yourself or otherwise trust.
try:
    # 'with' closes the file even if pickle.load raises
    with open("naivebayes.pickle", "rb") as classifier_f:
        classifier = pickle.load(classifier_f)
except FileNotFoundError:
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    with open("naivebayes.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)

##NOTE(review): documents are reshuffled on every run, so a classifier loaded
##from disk was trained on a different split and may already have seen some of
##testing_set; the reported accuracy can be optimistic in that case.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment