Skip to content

Instantly share code, notes, and snippets.

View language-engineering's full-sized avatar

language-engineering

View GitHub Profile
# NOTE(review): gist fragment -- format_data is defined later in this file,
# and training_data / k are defined in another snippet; this block is not
# runnable standalone.
from corpus_readers import AmazonReviewCorpusReader
arcr = AmazonReviewCorpusReader()
#Format some dvd data ready for the NB classifier
dvd_pos_formatted = format_data(arcr.positive().category("dvd").documents(),"pos")
dvd_neg_formatted = format_data(arcr.negative().category("dvd").documents(),"neg")
dvd_formatted = dvd_pos_formatted + dvd_neg_formatted
#Format some book data the same way (combining the two lists is left to the caller)
book_pos_formatted = format_data(arcr.positive().category("book").documents(),"pos")
book_neg_formatted = format_data(arcr.negative().category("book").documents(),"neg")
arcr = AmazonReviewCorpusReader()
#Get some extra dvd data
# Reviews rated near 5 stars are taken as positive and near 1 star as
# negative; the float comparisons (> 4.9, < 1.1) presumably avoid exact
# equality tests on float ratings -- TODO confirm rating() scale.
extra_dvd_positive = [r for r in arcr.unlabeled(["dvd"]).documents() if r.rating() > 4.9 ]
extra_dvd_negative = [r for r in arcr.unlabeled(["dvd"]).documents() if r.rating() < 1.1 ]
from random import sample # sample is a function in Python's built-in random module
# Down-sample the training set to k reviews (k is defined elsewhere).
training_data_subset = sample(training_data, k) # Selects a random sample of k reviews
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
def format_data(reviews, label, feature_extraction_fn=None):
    """Turn reviews into (feature_dict, label) pairs for an NLTK classifier.

    Args:
        reviews: iterable of review objects exposing a ``words()`` method.
        label: class label (e.g. "pos" or "neg") attached to every review.
        feature_extraction_fn: optional callable taking a review object and
            returning an iterable of features; when None, the review's own
            words are used as features.

    Returns:
        A list of ``(features, label)`` tuples where ``features`` maps each
        extracted feature to True (bag-of-words presence features, as
        expected by nltk.classify.NaiveBayesClassifier).
    """
    if feature_extraction_fn is None:
        # Default feature extractor: the review's tokens themselves.
        extract = lambda review: review.words()
    else:
        extract = feature_extraction_fn
    # Dict comprehension replaces the noisier dict([(f, True) for f in ...]).
    return [({feature: True for feature in extract(review)}, label)
            for review in reviews]
#After you've split the data up as shown earlier, you can use the split data like this:
from corpus_readers import AmazonReviewCorpusReader
def format_data(corpus_reader, label, feature_extraction_fn=None):
    """Turn a corpus reader's reviews into (feature_dict, label) pairs.

    Variant of format_data that pulls reviews from a corpus reader rather
    than taking an iterable of reviews directly.

    Args:
        corpus_reader: object exposing ``reviews()``; each review exposes
            ``words()`` and ``raw()``.
        label: class label (e.g. "pos" or "neg") attached to every review.
        feature_extraction_fn: optional callable taking the review's raw
            text and returning an iterable of features; when None, the
            review's words are used as features.

    Returns:
        A list of ``(features, label)`` tuples where ``features`` maps each
        extracted feature to True (presence features for NLTK classifiers).
    """
    reviews = corpus_reader.reviews()
    if feature_extraction_fn is None:
        # Default: bag-of-words presence features from the review tokens.
        return [({feature: True for feature in review.words()}, label)
                for review in reviews]
    # Custom extractor operates on the review's raw text, not the object.
    return [({feature: True for feature in feature_extraction_fn(review.raw())}, label)
            for review in reviews]
from random import sample
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
def split_data(data, ratio=0.7):
    """Randomly partition *data* into training and testing portions.

    Args:
        data: iterable of samples; materialized to a list internally.
        ratio: fraction of samples assigned to training (default 0.7).

    Returns:
        A ``(training, testing)`` tuple of lists; together they cover every
        sample exactly once.
    """
    data = list(data)
    n = len(data)  # number of samples present
    # range() instead of the Python-2-only xrange(); works on 2 and 3.
    train_indices = sample(range(n), int(n * ratio))  # randomly chosen training indices
    test_indices = list(set(range(n)) - set(train_indices))  # the remaining indices
    # BUG FIX: the original computed both index sets but never returned
    # anything, so callers always got None.
    return ([data[i] for i in train_indices],
            [data[i] for i in test_indices])
# Point NLTK at a local Java runtime so NLTK components that shell out to
# Java can be launched.
import os
import nltk
# NOTE(review): hard-coded Windows path to a 32-bit JRE 6 -- adjust per machine.
nltk.internals.config_java(os.path.join('C:\\','Program Files (x86)','Java','jre6','bin','java.exe'))
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

#Train on a list of (feature_dict, label) reviews.
# NOTE(review): formatted_training_data / formatted_testing_data are built
# elsewhere with format_data(); this fragment is not standalone.
nb_classifier = NaiveBayesClassifier.train(formatted_training_data)

#Test on another list of reviews
# print() function form replaces the Python-2-only statement form; the
# %-formatted string produces the same "Accuracy: <value>" output on
# Python 2 and also runs on Python 3.
print("Accuracy: %s" % accuracy(nb_classifier, formatted_testing_data))

#Print the features that the NB classifier found to be most important in making classifications
# Plot classifier accuracy results as a bar chart.
import matplotlib.pyplot as plt
import numpy as np
#put your results here!
# Accuracy scores (one per experimental condition, in order).
results = [67, 54, 44, 33]
ind = np.arange(len(results))  # x position for each bar
width = 0.4  # bar width
p1 = plt.bar(ind, results, width, color="#1AADA4") #plot a bar graph
plt.ylabel('Accuracy') #y axis label
from sussex_nltk.stats import evaluate_wordlist_classifier
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
#Create a new classifier with your words lists
# NOTE(review): SimpleClassifier and the two word lists are defined in
# another snippet; this fragment is not standalone.
book_classifier = SimpleClassifier(positive_book_words_list, negative_book_words_list)
#Evaluate classifier
#The function requires three arguments:
# 1. Word list based classifier
# 2. A list (or generator) of positive AmazonReview objects