language-engineering · October 23, 2012 15:15
diff --git a/gistfile1.py b/gistfile1.py
 #Your function may start out like this, equivalent to what's been used so far
 #It takes a review, and just returns all the words in that review
 def feature_extractor(amazon_review):
    """Return all the words of a review, unchanged, as its features.

    amazon_review is expected to offer a ``words()`` method (AmazonReview
    objects do); whatever that method returns is handed back as-is.
    """
    all_words = amazon_review.words()
    return all_words

 # Below follows example functionality that you should include in your feature extractor

 # Example 1: get lowercase versions of all the words.
 # print(...) with a single argument behaves the same on Python 2 and 3,
 # whereas the statement form `print [...]` is a SyntaxError on Python 3.
 tokens = ['You', 'know', 'NOTHING,', 'Jon', 'Snow']
 print([token.lower() for token in tokens])

 # Example 2: replace all number tokens with "NUM".
 # str.isdigit() is only true for tokens made up entirely of digits.
 numbers = ['in', 'the', 'year', '120', 'of', 'the', 'fourth', 'age', ',', 'after', '120', 'years', 'as', 'king', ',' , 'aragorn', 'died', 'at', 'the', 'age', 'of', '210']
 print(["NUM" if token.isdigit() else token for token in numbers])

 # Example 3: filter out non-alphabetic words and stopwords.
 from nltk.corpus import stopwords
 # Build the stopword set once instead of re-reading the list for every token.
 english_stopwords = set(stopwords.words('english'))
 # 'NOTHING,' is dropped because the trailing comma makes isalpha() false.
 # NOTE(review): the stopword test is case-sensitive, so a capitalised token
 # such as 'You' is only removed if the list contains that exact spelling;
 # lowercase the tokens first if that is not what you want.
 print([w for w in tokens if w.isalpha() and w not in english_stopwords])
	#Your function may start out like this, equivalent to what's been used so far
	#It takes a review, and just returns all the words in that review
	def feature_extractor(amazon_review):
	    """Return all the words of a review, unchanged, as its features.

	    amazon_review is expected to offer a ``words()`` method (AmazonReview
	    objects do); whatever that method returns is handed back as-is.
	    """
	    # The body must be indented under the def; at the same level as the
	    # def the function has no body and the file does not parse.
	    return amazon_review.words()

	# Below follows example functionality that you should include in your feature extractor

	# Example 1: get lowercase versions of all the words.
	# print(...) with a single argument behaves the same on Python 2 and 3,
	# whereas the statement form `print [...]` is a SyntaxError on Python 3.
	tokens = ['You', 'know', 'NOTHING,', 'Jon', 'Snow']
	print([token.lower() for token in tokens])

	# Example 2: replace all number tokens with "NUM".
	# str.isdigit() is only true for tokens made up entirely of digits.
	numbers = ['in', 'the', 'year', '120', 'of', 'the', 'fourth', 'age', ',', 'after', '120', 'years', 'as', 'king', ',' , 'aragorn', 'died', 'at', 'the', 'age', 'of', '210']
	print(["NUM" if token.isdigit() else token for token in numbers])

	# Example 3: filter out non-alphabetic words and stopwords.
	from nltk.corpus import stopwords
	# Build the stopword set once instead of re-reading the list for every token.
	english_stopwords = set(stopwords.words('english'))
	# 'NOTHING,' is dropped because the trailing comma makes isalpha() false.
	# NOTE(review): the stopword test is case-sensitive, so a capitalised token
	# such as 'You' is only removed if the list contains that exact spelling;
	# lowercase the tokens first if that is not what you want.
	print([w for w in tokens if w.isalpha() and w not in english_stopwords])
No results found