Created
October 31, 2012 11:53
-
-
Save language-engineering/3986653 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from random import sample
def split_data(data, ratio=0.7):
    """Randomly partition ``data`` into a training part and a testing part.

    Args:
        data: Any iterable of samples; it is materialised into a list first.
        ratio: Fraction of the samples placed in the training part. The
            training size is ``int(len(data) * ratio)``, i.e. truncated
            towards zero.

    Returns:
        A ``(training_data, testing_data)`` tuple of lists. Training samples
        appear in the random order picked by ``random.sample``; testing
        samples are all remaining ones, kept in their original order.

    Raises:
        ValueError: If the computed training size is negative or larger
            than the number of samples.
    """
    data = list(data)
    n = len(data)
    n_train = int(n * ratio)
    if not 0 <= n_train <= n:
        # random.sample would raise ValueError here as well; fail with a
        # message that points at the actual cause instead.
        raise ValueError(
            "ratio must give a training size between 0 and %d, got %d"
            % (n, n_train)
        )
    # range() instead of xrange(): xrange only exists on Python 2, while
    # range() is accepted by random.sample on both Python 2 and 3.
    train_indices = sample(range(n), n_train)
    chosen = set(train_indices)  # O(1) membership test for the filter below
    training_data = [data[i] for i in train_indices]
    # Everything not drawn for training becomes test data, in input order.
    testing_data = [item for i, item in enumerate(data) if i not in chosen]
    return (training_data, testing_data)
def format_data(reviews, label, feature_extraction_fn=None):
    """Turn reviews into ``(feature_dict, label)`` pairs.

    Args:
        reviews: Iterable of review objects.
        label: Value attached unchanged to every review.
        feature_extraction_fn: Optional callable that takes one review and
            returns an iterable of hashable features. When ``None``, the
            features are whatever ``review.words()`` returns.

    Returns:
        A list with one ``(features, label)`` tuple per review, where
        ``features`` is a dict mapping each distinct feature to ``True``.
    """
    extract = feature_extraction_fn
    if extract is None:
        # Default: use the review's own words as its features.
        def extract(review):
            return review.words()

    # dict.fromkeys collapses repeated features into a single True entry,
    # exactly like building the dict from (feature, True) pairs.
    return [(dict.fromkeys(extract(review), True), label) for review in reviews]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment