Skip to content

Instantly share code, notes, and snippets.

@language-engineering
Created October 31, 2012 11:53
Show Gist options
  • Save language-engineering/3986653 to your computer and use it in GitHub Desktop.
Save language-engineering/3986653 to your computer and use it in GitHub Desktop.
from random import sample
def split_data(data, ratio=0.7):
    """Randomly split ``data`` into a training set and a testing set.

    Args:
        data: Any iterable of samples. It is copied into a list first, so
            one-shot iterables such as generators are accepted.
        ratio: Fraction of the samples to place in the training set. The
            training set holds ``int(len(data) * ratio)`` samples (truncated).

    Returns:
        A ``(training_data, testing_data)`` tuple of lists. Every sample
        lands in exactly one of the two lists. Training samples are in the
        random order they were drawn; testing samples keep their original
        relative order.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``ratio`` gives a
            training size that is negative or exceeds the number of samples.
    """
    data = list(data)
    n = len(data)

    # range() rather than the Python-2-only xrange(): valid on both 2 and 3.
    train_indices = sample(range(n), int(n * ratio))

    # Whatever was not drawn for training becomes test data. Walking the
    # indices in order (instead of iterating a set difference) gives the test
    # split a well-defined order; the set keeps each membership check O(1).
    chosen = set(train_indices)
    test_indices = [i for i in range(n) if i not in chosen]

    training_data = [data[i] for i in train_indices]
    testing_data = [data[i] for i in test_indices]
    return (training_data, testing_data)
def format_data(reviews, label, feature_extraction_fn=None):
    """Convert reviews into ``(feature_dict, label)`` pairs.

    Args:
        reviews: Iterable of review objects.
        label: Label attached unchanged to every review.
        feature_extraction_fn: Optional callable that takes one review and
            returns an iterable of hashable features. When ``None``, the
            features are whatever ``review.words()`` returns, so each review
            must then provide a ``words()`` method.

    Returns:
        A list with one ``(features, label)`` tuple per review, where
        ``features`` is a dict mapping each distinct feature to ``True``.
    """
    if feature_extraction_fn is None:
        # Default: use the review's own words as its features.
        def extract(review):
            return review.words()
    else:
        extract = feature_extraction_fn

    # dict.fromkeys collapses repeated features into one key mapped to True.
    return [(dict.fromkeys(extract(review), True), label) for review in reviews]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment