language-engineering · October 31, 2012 11:53
diff --git a/gistfile1.py b/gistfile1.py
 from random import sample

 def split_data(data, ratio=0.7):
     """Randomly partition ``data`` into a training list and a testing list.

     ``data`` may be any iterable; it is copied into a list first so that
     it can be indexed. ``int(len(data) * ratio)`` items are drawn at random
     (without replacement) for training; every remaining item goes to the
     testing list, kept in its original relative order.

     Returns a ``(training_data, testing_data)`` tuple of two lists.

     ``random.sample`` raises ``ValueError`` when the requested training
     size is negative or larger than the number of items.
     """
     data = list(data)  #Materialise so that iterators/generators can be indexed
     n = len(data)      #Number of samples present

     #``range`` rather than the Python 2-only ``xrange`` so this runs on Python 2 and 3
     train_indices = sample(range(n), int(n * ratio))
     chosen = set(train_indices)  #O(1) membership checks while filtering below

     training_data = [data[i] for i in train_indices]
     #Everything not picked for training; enumerate keeps the original order
     #(set difference gave no ordering guarantee)
     testing_data = [item for i, item in enumerate(data) if i not in chosen]

     return (training_data, testing_data)

 def format_data(reviews, label, feature_extraction_fn=None):
     """Pair a feature dictionary for every review with ``label``.

     For each review a dict mapping every extracted feature to ``True`` is
     built and returned together with ``label`` as a ``(features, label)``
     tuple; the result is a list with one tuple per review.

     When ``feature_extraction_fn`` is None the features are whatever
     ``review.words()`` yields, so each review must provide a ``words()``
     method in that case. Otherwise ``feature_extraction_fn(review)`` must
     return an iterable of hashable features.
     """
     formatted = []
     for review in reviews:
         if feature_extraction_fn is None:
             #No extractor supplied: fall back to the review's own words
             features = review.words()
         else:
             features = feature_extraction_fn(review)
         formatted.append((dict.fromkeys(features, True), label))
     return formatted
	from random import sample

	def split_data(data, ratio=0.7):
		"""Split ``data`` at random into training and testing lists.

		``data`` is first copied into a list. ``int(len(data) * ratio)``
		items are sampled without replacement for training; the remaining
		items form the testing list in their original relative order.

		Returns a ``(training_data, testing_data)`` tuple of two lists.

		``random.sample`` raises ``ValueError`` when the requested training
		size is negative or exceeds the number of items.
		"""
		data = list(data)
		n = len(data) #Number of samples present

		#``range`` works on Python 2 and 3, unlike ``xrange``
		train_indices = sample(range(n), int(n * ratio)) #Randomly select training indices
		picked = set(train_indices)

		training_data = [data[i] for i in train_indices] #Use training indices to select data
		#Remaining items, kept in input order
		testing_data = [item for i, item in enumerate(data) if i not in picked]

		return (training_data, testing_data)

	def format_data(reviews, label, feature_extraction_fn=None):
		"""Build ``(features, label)`` tuples, one per review.

		``features`` is a dict mapping each extracted feature to ``True``.
		If ``feature_extraction_fn`` is None, the features are the items
		returned by ``review.words()``; otherwise they are the items
		returned by ``feature_extraction_fn(review)``.

		Returns a list of ``(dict, label)`` tuples in the order of ``reviews``.
		"""
		if feature_extraction_fn is None:
			#No extractor supplied: use simply the words of the review as features
			def feature_extraction_fn(review):
				return review.words()

		return [(dict.fromkeys(feature_extraction_fn(review), True), label) for review in reviews]