Skip to content

Instantly share code, notes, and snippets.

View language-engineering's full-sized avatar

language-engineering

View GitHub Profile
# NOTE(review): gist fragment -- format_data is defined later in this file,
# and training_data / k are defined in another snippet; this block is not
# runnable standalone.
from corpus_readers import AmazonReviewCorpusReader
arcr = AmazonReviewCorpusReader()
#Format some dvd data ready for the NB classifier
dvd_pos_formatted = format_data(arcr.positive().category("dvd").documents(),"pos")
dvd_neg_formatted = format_data(arcr.negative().category("dvd").documents(),"neg")
dvd_formatted = dvd_pos_formatted + dvd_neg_formatted
#Format some book data the same way (combining the two lists is left to the caller)
book_pos_formatted = format_data(arcr.positive().category("book").documents(),"pos")
book_neg_formatted = format_data(arcr.negative().category("book").documents(),"neg")
arcr = AmazonReviewCorpusReader()
#Get some extra dvd data
# Reviews rated near 5 stars are taken as positive and near 1 star as
# negative; the float comparisons (> 4.9, < 1.1) presumably avoid exact
# equality tests on float ratings -- TODO confirm rating() scale.
extra_dvd_positive = [r for r in arcr.unlabeled(["dvd"]).documents() if r.rating() > 4.9 ]
extra_dvd_negative = [r for r in arcr.unlabeled(["dvd"]).documents() if r.rating() < 1.1 ]
from random import sample # sample is a function in Python's built-in random module
# Down-sample the training set to k reviews (k is defined elsewhere).
training_data_subset = sample(training_data, k) # Selects a random sample of k reviews
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
def format_data(reviews, label, feature_extraction_fn=None):
    """Turn reviews into (feature_dict, label) pairs for an NLTK classifier.

    Args:
        reviews: iterable of review objects exposing a ``words()`` method.
        label: class label (e.g. "pos" or "neg") attached to every review.
        feature_extraction_fn: optional callable taking a review object and
            returning an iterable of features; when None, the review's own
            words are used as features.

    Returns:
        A list of ``(features, label)`` tuples where ``features`` maps each
        extracted feature to True (bag-of-words presence features, as
        expected by nltk.classify.NaiveBayesClassifier).
    """
    if feature_extraction_fn is None:
        # Default feature extractor: the review's tokens themselves.
        extract = lambda review: review.words()
    else:
        extract = feature_extraction_fn
    # Dict comprehension replaces the noisier dict([(f, True) for f in ...]).
    return [({feature: True for feature in extract(review)}, label)
            for review in reviews]
#After you've split the data up as shown earlier, you can use the split data like this:
from corpus_readers import AmazonReviewCorpusReader
def format_data(corpus_reader, label, feature_extraction_fn=None):
    """Turn a corpus reader's reviews into (feature_dict, label) pairs.

    Variant of format_data that pulls reviews from a corpus reader rather
    than taking an iterable of reviews directly.

    Args:
        corpus_reader: object exposing ``reviews()``; each review exposes
            ``words()`` and ``raw()``.
        label: class label (e.g. "pos" or "neg") attached to every review.
        feature_extraction_fn: optional callable taking the review's raw
            text and returning an iterable of features; when None, the
            review's words are used as features.

    Returns:
        A list of ``(features, label)`` tuples where ``features`` maps each
        extracted feature to True (presence features for NLTK classifiers).
    """
    reviews = corpus_reader.reviews()
    if feature_extraction_fn is None:
        # Default: bag-of-words presence features from the review tokens.
        return [({feature: True for feature in review.words()}, label)
                for review in reviews]
    # Custom extractor operates on the review's raw text, not the object.
    return [({feature: True for feature in feature_extraction_fn(review.raw())}, label)
            for review in reviews]
from random import sample
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
def split_data(data, ratio=0.7):
    """Randomly partition *data* into training and testing portions.

    Args:
        data: iterable of samples; materialized to a list internally.
        ratio: fraction of samples assigned to training (default 0.7).

    Returns:
        A ``(training, testing)`` tuple of lists; together they cover every
        sample exactly once.
    """
    data = list(data)
    n = len(data)  # number of samples present
    # range() instead of the Python-2-only xrange(); works on 2 and 3.
    train_indices = sample(range(n), int(n * ratio))  # randomly chosen training indices
    test_indices = list(set(range(n)) - set(train_indices))  # the remaining indices
    # BUG FIX: the original computed both index sets but never returned
    # anything, so callers always got None.
    return ([data[i] for i in train_indices],
            [data[i] for i in test_indices])
# Point NLTK at a local Java runtime so NLTK components that shell out to
# Java can be launched.
import os
import nltk
# NOTE(review): hard-coded Windows path to a 32-bit JRE 6 -- adjust per machine.
nltk.internals.config_java(os.path.join('C:\\','Program Files (x86)','Java','jre6','bin','java.exe'))
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

#Train on a list of (feature_dict, label) reviews.
# NOTE(review): formatted_training_data / formatted_testing_data are built
# elsewhere with format_data(); this fragment is not standalone.
nb_classifier = NaiveBayesClassifier.train(formatted_training_data)

#Test on another list of reviews
# print() function form replaces the Python-2-only statement form; the
# %-formatted string produces the same "Accuracy: <value>" output on
# Python 2 and also runs on Python 3.
print("Accuracy: %s" % accuracy(nb_classifier, formatted_testing_data))

#Print the features that the NB classifier found to be most important in making classifications
# Plot classifier accuracy results as a bar chart.
import matplotlib.pyplot as plt
import numpy as np
#put your results here!
# Accuracy scores (one per experimental condition, in order).
results = [67, 54, 44, 33]
ind = np.arange(len(results))  # x position for each bar
width = 0.4  # bar width
p1 = plt.bar(ind, results, width, color="#1AADA4") #plot a bar graph
plt.ylabel('Accuracy') #y axis label
from sussex_nltk.stats import evaluate_wordlist_classifier
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
#Create a new classifier with your words lists
# NOTE(review): SimpleClassifier and the two word lists are defined in
# another snippet; this fragment is not standalone.
book_classifier = SimpleClassifier(positive_book_words_list, negative_book_words_list)
#Evaluate classifier
#The function requires three arguments:
# 1. Word list based classifier
# 2. A list (or generator) of positive AmazonReview objects