Skip to content

Instantly share code, notes, and snippets.

@language-engineering
Last active October 11, 2015 17:08
Show Gist options
  • Select an option

  • Save language-engineering/3891802 to your computer and use it in GitHub Desktop.

Select an option

Save language-engineering/3891802 to your computer and use it in GitHub Desktop.
from random import sample
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
def split_data(data, ratio=0.7):
    """Randomly split *data* into a training set and a testing set.

    Args:
        data: Any iterable of items. It is copied into a list first, so
            generators and other one-shot iterables are accepted and the
            caller's object is never modified.
        ratio: Fraction of the items to place in the training set. The
            training set receives ``int(len(data) * ratio)`` items (rounded
            down); the testing set receives all remaining items.

    Returns:
        A ``(training_data, testing_data)`` tuple of lists. Training items
        appear in randomly sampled order; testing items keep their original
        relative order.

    Raises:
        ValueError: Raised by ``random.sample`` when the requested training
            size is negative or larger than the number of items (i.e. when
            ``ratio`` is outside the range 0..1 for non-empty data).
    """
    data = list(data)
    n = len(data)  # Number of samples present

    # range() works on both Python 2 and Python 3, unlike xrange().
    train_indices = sample(range(n), int(n * ratio))  # Randomly select training indices
    train_index_set = set(train_indices)  # Constant-time membership tests below

    training_data = [data[i] for i in train_indices]
    # The test set is simply everything that was not sampled for training.
    testing_data = [item for i, item in enumerate(data) if i not in train_index_set]
    return (training_data, testing_data)
# Create an Amazon corpus reader restricted to the "book" review category.
book_reader = AmazonReviewCorpusReader().category("book")
# Split positive and negative reviews separately so that each class is divided
# at the same ratio (split_data's default of 0.7). Each resulting data set is a
# list (presumably of Amazon review objects -- confirm against the reader).
pos_training_data, pos_testing_data = split_data(book_reader.positive().documents())  # split_data copies the documents into a list before sampling
neg_training_data, neg_testing_data = split_data(book_reader.negative().documents())
# Combine the per-class splits into single training and testing lists
# (positive documents first, followed by negative documents).
training_data = pos_training_data + neg_training_data
testing_data = pos_testing_data + neg_testing_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment