language-engineering · October 11, 2015 18:58
diff --git a/gistfile1.py b/gistfile1.py
 from sussex_nltk.corpus_readers import AmazonReviewCorpusReader

 def format_data(reviews, label, feature_extraction_fn=None):
     """Convert reviews into labelled feature dictionaries.

     Every review becomes a ``(features, label)`` tuple, where ``features``
     maps each extracted feature to ``True``.

     Args:
         reviews: Iterable of review objects.
         label: Label paired with every review in the result (e.g. "pos").
         feature_extraction_fn: Optional callable that takes a review and
             returns an iterable of hashable features. When omitted, the
             features are whatever ``review.words()`` returns.

     Returns:
         A list with one ``(dict, label)`` tuple per review, in input order.
     """
     if feature_extraction_fn is None:
         # No extractor supplied: use simply the words of the review.
         def extract(review):
             return review.words()
     else:
         extract = feature_extraction_fn

     return [
         ({feature: True for feature in extract(review)}, label)
         for review in reviews
     ]

 # With the data split as shown earlier, build the labelled training set:
 # positive and negative reviews are formatted separately...
 formatted_pos_training = format_data(pos_training_data, label="pos")
 formatted_neg_training = format_data(neg_training_data, label="neg")
 # ...and then joined into one list, positive examples first.
 formatted_training_data = formatted_pos_training + formatted_neg_training

 # The testing set is assembled in exactly the same way.
 formatted_pos_testing = format_data(pos_testing_data, label="pos")
 formatted_neg_testing = format_data(neg_testing_data, label="neg")
 # Joined into one list, positive examples first.
 formatted_testing_data = formatted_pos_testing + formatted_neg_testing
	from sussex_nltk.corpus_readers import AmazonReviewCorpusReader

	def format_data(reviews, label, feature_extraction_fn=None):
	    """Convert reviews into labelled feature dictionaries.

	    Every review becomes a ``(features, label)`` tuple, where ``features``
	    maps each extracted feature to ``True``.

	    Args:
	        reviews: Iterable of review objects.
	        label: Label paired with every review in the result (e.g. "pos").
	        feature_extraction_fn: Optional callable that takes a review and
	            returns an iterable of hashable features. When omitted, the
	            features are whatever ``review.words()`` returns.

	    Returns:
	        A list with one ``(dict, label)`` tuple per review, in input order.
	    """
	    if feature_extraction_fn is None:
	        # No extractor supplied: use simply the words of the review.
	        def extract(review):
	            return review.words()
	    else:
	        extract = feature_extraction_fn

	    return [
	        ({feature: True for feature in extract(review)}, label)
	        for review in reviews
	    ]

	# With the data split as shown earlier, build the labelled training set:
	# positive and negative reviews are formatted separately...
	formatted_pos_training = format_data(pos_training_data, label="pos")
	formatted_neg_training = format_data(neg_training_data, label="neg")
	# ...and then joined into one list, positive examples first.
	formatted_training_data = formatted_pos_training + formatted_neg_training

	# The testing set is assembled in exactly the same way.
	formatted_pos_testing = format_data(pos_testing_data, label="pos")
	formatted_neg_testing = format_data(neg_testing_data, label="neg")
	# Joined into one list, positive examples first.
	formatted_testing_data = formatted_pos_testing + formatted_neg_testing
No results found