Created
October 15, 2012 10:30
-
-
Save language-engineering/3891874 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from corpus_readers import AmazonReviewCorpusReader | |
def format_data(corpus_reader, label, feature_extraction_fn=None):
    """Turn every review of a corpus reader into a labelled feature dict.

    Each review becomes a ``(features, label)`` pair in which ``features``
    maps every extracted feature to ``True``.

    Args:
        corpus_reader: Object whose ``reviews()`` method returns the reviews
            to format.
        label: Value attached unchanged to every formatted review.
        feature_extraction_fn: Optional callable that receives
            ``review.raw()`` and returns an iterable of hashable features.
            When ``None``, the items returned by ``review.words()`` are used
            as the features.

    Returns:
        A list with one ``(dict, label)`` tuple per review, in the order the
        reviews are produced by ``corpus_reader.reviews()``.
    """
    if feature_extraction_fn is None:
        # No extractor supplied: fall back to the review's own words.
        def extract(review):
            return review.words()
    else:
        def extract(review):
            return feature_extraction_fn(review.raw())

    # dict.fromkeys collapses repeated features and marks each one as present.
    return [
        (dict.fromkeys(extract(review), True), label)
        for review in corpus_reader.reviews()
    ]
# Usage:

# A corpus reader pointing at dvd reviews
dvd_reader = AmazonReviewCorpusReader().category("dvd")

# Formatted positive and negative data. No feature_extraction_fn is passed,
# so format_data uses the words of each review as its features.
pos_data = format_data(dvd_reader.positive(), "pos")
neg_data = format_data(dvd_reader.negative(), "neg")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment