language-engineering · October 11, 2015 17:08
diff --git a/gistfile1.py b/gistfile1.py
 from random import sample
 from sussex_nltk.corpus_readers import AmazonReviewCorpusReader

 
 def split_data(data, ratio=0.7):
     """Randomly partition ``data`` into a training part and a testing part.

     Args:
         data: Any iterable; it is copied into a list first, so one-shot
             iterators and generators are accepted.
         ratio: Fraction of the items, between 0 and 1 inclusive, placed in
             the training part. ``int(n * ratio)`` items are drawn, so the
             training size is rounded down.

     Returns:
         A ``(training_data, testing_data)`` tuple of lists. Training items
         appear in the order they were sampled; testing items keep their
         original relative order.

     Raises:
         ValueError: If ``ratio`` lies outside the range [0, 1].
     """
     if not 0 <= ratio <= 1:
         raise ValueError("ratio must be between 0 and 1, got %r" % (ratio,))

     data = list(data)
     n = len(data)  # Number of samples present

     # ``range`` exists on both Python 2 and Python 3, whereas ``xrange`` is
     # Python 2 only and raises NameError on Python 3.
     train_indices = sample(range(n), int(n * ratio))

     # A set gives constant-time membership tests while collecting the
     # items that were not drawn for training.
     train_index_set = set(train_indices)

     training_data = [data[i] for i in train_indices]
     testing_data = [
         item for i, item in enumerate(data) if i not in train_index_set
     ]

     return (training_data, testing_data)
 
 # Build an Amazon review corpus reader restricted to the "book" category.
 book_reader = AmazonReviewCorpusReader().category("book")

 # Split the positive and the negative reviews separately, so that each
 # polarity is divided with the same train/test ratio.
 pos_training_data, pos_testing_data = split_data(
     book_reader.positive().documents()
 )
 neg_training_data, neg_testing_data = split_data(
     book_reader.negative().documents()
 )

 # Concatenate the per-polarity lists into one overall training set and one
 # overall testing set (positive items first, then negative items).
 training_data = pos_training_data + neg_training_data
 testing_data = pos_testing_data + neg_testing_data
	from random import sample
	from sussex_nltk.corpus_readers import AmazonReviewCorpusReader


	def split_data(data, ratio=0.7):
		"""Randomly partition ``data`` into a training part and a testing part.

		Args:
			data: Any iterable; it is copied into a list first, so one-shot
				iterators and generators are accepted.
			ratio: Fraction of the items, between 0 and 1 inclusive, placed
				in the training part. ``int(n * ratio)`` items are drawn, so
				the training size is rounded down.

		Returns:
			A ``(training_data, testing_data)`` tuple of lists. Training
			items appear in the order they were sampled; testing items keep
			their original relative order.

		Raises:
			ValueError: If ``ratio`` lies outside the range [0, 1].
		"""
		if not 0 <= ratio <= 1:
			raise ValueError("ratio must be between 0 and 1, got %r" % (ratio,))

		data = list(data)
		n = len(data)  # Number of samples present

		# ``range`` exists on both Python 2 and Python 3, whereas ``xrange``
		# is Python 2 only and raises NameError on Python 3.
		train_indices = sample(range(n), int(n * ratio))

		# A set gives constant-time membership tests while collecting the
		# items that were not drawn for training.
		train_index_set = set(train_indices)

		training_data = [data[i] for i in train_indices]
		testing_data = [
			item for i, item in enumerate(data) if i not in train_index_set
		]

		return (training_data, testing_data)

	# Build an Amazon review corpus reader restricted to the "book" category.
	book_reader = AmazonReviewCorpusReader().category("book")

	# Split the positive and the negative reviews separately, so that each
	# polarity is divided with the same train/test ratio.
	pos_training_data, pos_testing_data = split_data(
		book_reader.positive().documents()
	)
	neg_training_data, neg_testing_data = split_data(
		book_reader.negative().documents()
	)

	# Concatenate the per-polarity lists into one overall training set and
	# one overall testing set (positive items first, then negative items).
	training_data = pos_training_data + neg_training_data
	testing_data = pos_testing_data + neg_testing_data
No results found