Skip to content

Instantly share code, notes, and snippets.

@language-engineering
Last active October 11, 2015 18:58
Show Gist options
  • Select an option

  • Save language-engineering/3905043 to your computer and use it in GitHub Desktop.

Select an option

Save language-engineering/3905043 to your computer and use it in GitHub Desktop.
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
def format_data(reviews, label, feature_extraction_fn=None):
    """Convert reviews into ``(feature_dict, label)`` pairs.

    Every feature extracted from a review becomes a key mapped to ``True``
    in that review's feature dict; duplicate features collapse into one key.

    Args:
        reviews: Iterable of review objects. When no extraction function is
            supplied, each review must provide a ``words()`` method that
            returns an iterable of hashable features.
        label: Label attached to every review (e.g. ``"pos"`` or ``"neg"``).
        feature_extraction_fn: Optional callable that takes a review and
            returns an iterable of hashable features. Defaults to ``None``.

    Returns:
        A list with one ``(dict, label)`` tuple per review, in input order.
    """
    if feature_extraction_fn is None:
        # If a feature extraction function is not provided, simply use the
        # words of the review as features.
        return [
            ({feature: True for feature in review.words()}, label)
            for review in reviews
        ]
    return [
        ({feature: True for feature in feature_extraction_fn(review)}, label)
        for review in reviews
    ]
# Once the data has been split as shown earlier, the splits can be formatted
# for use as follows.

# Training data: format each class separately, then join the two lists.
formatted_pos_training = format_data(reviews=pos_training_data, label="pos")
formatted_neg_training = format_data(reviews=neg_training_data, label="neg")
formatted_training_data = formatted_pos_training + formatted_neg_training

# Testing data: same procedure as for the training data.
formatted_pos_testing = format_data(reviews=pos_testing_data, label="pos")
formatted_neg_testing = format_data(reviews=neg_testing_data, label="neg")
formatted_testing_data = formatted_pos_testing + formatted_neg_testing
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment