This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from corpus_readers import AmazonReviewCorpusReader | |
arcr = AmazonReviewCorpusReader()

# Format the positive and negative DVD reviews ready for the NB classifier
dvd_pos_formatted = format_data(
    arcr.positive().category("dvd").documents(), "pos")
dvd_neg_formatted = format_data(
    arcr.negative().category("dvd").documents(), "neg")
dvd_formatted = dvd_pos_formatted + dvd_neg_formatted

# Format the positive and negative book reviews in the same way
book_pos_formatted = format_data(
    arcr.positive().category("book").documents(), "pos")
book_neg_formatted = format_data(
    arcr.negative().category("book").documents(), "neg")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
arcr = AmazonReviewCorpusReader()

# Get some extra DVD data from the unlabeled reviews: ratings above 4.9 are
# collected as positive examples, ratings below 1.1 as negative examples.
extra_dvd_positive = [
    r
    for r in arcr.unlabeled(["dvd"]).documents()
    if r.rating() > 4.9
]
extra_dvd_negative = [
    r
    for r in arcr.unlabeled(["dvd"]).documents()
    if r.rating() < 1.1
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from random import sample # sample is a function in Python's built-in random module | |
| training_data_subset = sample(training_data, k) # Selects a random sample of k reviews |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sussex_nltk.corpus_readers import AmazonReviewCorpusReader | |
def format_data(reviews, label, feature_extraction_fn=None):
    """Convert reviews into labelled feature dictionaries.

    Args:
        reviews: Iterable of review objects. When no feature extraction
            function is given, each review must provide a ``words()`` method.
        label: Label attached to every review (e.g. "pos" or "neg").
        feature_extraction_fn: Optional callable that takes a review and
            returns an iterable of features. When omitted, the words of the
            review are used as features.

    Returns:
        A list of ``(features, label)`` tuples, one per review, where
        ``features`` maps every extracted feature to ``True``.
    """
    if feature_extraction_fn is None:
        # No extractor supplied: simply use the words of the review.
        def feature_extraction_fn(review):
            return review.words()

    return [
        ({feature: True for feature in feature_extraction_fn(review)}, label)
        for review in reviews
    ]
| #After you've split the data up as shown earlier, you can use the split data like this: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from corpus_readers import AmazonReviewCorpusReader | |
def format_data(corpus_reader, label, feature_extraction_fn=None):
    """Convert the reviews of a corpus reader into labelled feature dictionaries.

    Args:
        corpus_reader: Object whose ``reviews()`` method yields review
            objects.
        label: Label attached to every review (e.g. "pos" or "neg").
        feature_extraction_fn: Optional callable that takes the value of
            ``review.raw()`` and returns an iterable of features. When
            omitted, ``review.words()`` supplies the features.

    Returns:
        A list of ``(features, label)`` tuples, one per review, where
        ``features`` maps every extracted feature to ``True``.
    """
    def extract(review):
        # Without an extractor, simply use the words of the review; otherwise
        # hand the extractor the review's raw form.
        if feature_extraction_fn is None:
            return review.words()
        return feature_extraction_fn(review.raw())

    return [
        ({feature: True for feature in extract(review)}, label)
        for review in corpus_reader.reviews()
    ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from random import sample | |
| from sussex_nltk.corpus_readers import AmazonReviewCorpusReader | |
| def split_data(data, ratio=0.7): | |
| data = list(data) | |
| n = len(data) #Find out the number of samples present | |
| train_indices = sample(xrange(n), int(n * ratio)) #Randomly select training indices | |
| test_indices = list(set(xrange(n)) - set(train_indices)) #Use every index not chosen for training as a testing index | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import nltk | |
| nltk.internals.config_java(os.path.join('C:\\','Program Files (x86)','Java','jre6','bin','java.exe')) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from nltk.classify import NaiveBayesClassifier | |
| from nltk.classify.util import accuracy | |
# Train on a list of reviews
nb_classifier = NaiveBayesClassifier.train(formatted_training_data)

# Test on another list of reviews. The message is formatted into a single
# string so the line prints the same text whether ``print`` is the Python 2
# statement or the Python 3 function.
print("Accuracy: %s" % (accuracy(nb_classifier, formatted_testing_data),))
| #Print the features that the NB classifier found to be most important in making classifications |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import matplotlib.pyplot as plt | |
| import numpy as np | |
# Put your results here!
results = [67, 54, 44, 33]

width = 0.4
ind = np.arange(len(results))  # one position per result

# Plot a bar graph of the results
p1 = plt.bar(ind, results, width, color="#1AADA4")
plt.ylabel('Accuracy')  # y axis label
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sussex_nltk.stats import evaluate_wordlist_classifier | |
| from sussex_nltk.corpus_readers import AmazonReviewCorpusReader | |
| #Create a new classifier with your words lists | |
| book_classifier = SimpleClassifier(positive_book_words_list, negative_book_words_list) | |
| #Evaluate classifier | |
| #The function requires three arguments: | |
| # 1. Word-list-based classifier | |
| # 2. A list (or generator) of positive AmazonReview objects |