Last active
December 23, 2016 20:17
-
-
Save shengch02/6edfa765276c3731be29b7c0f83c61af to your computer and use it in GitHub Desktop.
(Python) Use SFrames to do some feature engineering Train a logistic regression model to predict the sentiment of product reviews. Inspect the weights (coefficients) of a trained logistic regression model. Make a prediction (both class and probability) of sentiment for a new product review. Given the logistic regression weights, predictors and g…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#the dataset consists of baby product reviews on Amazon.com
#link for data: https://d18ky98rnyall9.cloudfront.net/_35bdebdff61378878ea2247780005e52_amazon_baby.gl.zip?Expires=1482278400&Signature=blPJv6YQNFgcZh~dULuDECzZlA6eGL1x9lzQKzHknqVHSdudmfjq0XPaokFjv-~Qy8nGADiBBdx4ar0BWgeboW1eTkYHOZzoUIMBfSPQGqA4Q9H8X8vwFyr9R-TC0LE4h4CsTRFH56BtbqpKtjKeJKxVv5E5LfZZiyhZEr6We5M_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A
# NOTE(review): the signed CloudFront URL above carries an Expires=1482278400
# timestamp (Dec 2016) and is almost certainly dead — the .gl archive must be
# obtained from the course materials and unpacked next to this script.
import sframe
# Load the review table (columns used below: 'name', 'review', 'rating').
products = sframe.SFrame('amazon_baby.gl/')
#clean the original data: remove punctuation, fill in N/A, remove neutral sentiment,
# perform a train/test split, produce word count matrix
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    Parameters
    ----------
    text : str or unicode
        Raw review text.

    Returns
    -------
    Same type as *text*, with all characters in ``string.punctuation``
    deleted; an empty input yields an empty string.

    The original implementation used the Python-2-only
    ``str.translate(None, string.punctuation)`` form, which raises
    TypeError when *text* is a unicode string (common for scraped review
    data) and does not exist in Python 3. Filtering characters directly
    is equivalent for plain str input and works for both types.
    """
    import string
    return ''.join(ch for ch in text if ch not in string.punctuation)
# Clean: strip punctuation, replace missing reviews with the empty string,
# and drop neutral (3-star) rows so every remaining row has a clear label.
products['review_clean']=products['review'].apply(remove_punctuation)
products = products.fillna('review','')
products = products[products['rating'] != 3]
# Binary label: +1 for 4/5-star reviews, -1 for 1/2-star reviews.
products['sentiment']=products['rating'].apply(lambda rating : +1
    if rating > 3 else -1)
# Deterministic 80/20 split (seed fixed so the quiz numbers reproduce).
train_data, test_data = products.random_split(0.8, seed=1)
from sklearn.feature_extraction.text import CountVectorizer
# token_pattern r'\b\w+\b' keeps single-character tokens, which the default
# sklearn pattern would drop.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Fit the vocabulary on training data only; reuse it for the test matrix.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
print train_matrix
#scikit learn, LogisticRegression(), training
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])
# coef_ has shape (1, n_features); one weight per vocabulary word.
print(len(sentiment_model.coef_[0])) # number of coefficients/weights, 121712
# NOTE(review): this counts non-negative weights (>= 0), although the
# trailing comment says "positive" — exact zeros are included. Confirm
# whether >= 0 is intended before changing it (the 86785 figure was
# produced with >= 0).
print(sum(x >=0 for x in sentiment_model.coef_[0])) # number of positive weights, 86785
#making predictions with logistic regression
# Inspect three sample rows first, then score the whole test set.
sample_test_data = test_data[10:13]
print sample_test_data
print sample_test_data[0]['review']
print sample_test_data[1]['review']
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
# decision_function returns the raw margin w.x + b for each row.
scores = sentiment_model.decision_function(sample_test_matrix)
print scores # positive scores correspond to positive class prediction, 10 gets the most positive review
score_pos = sentiment_model.decision_function(test_matrix) # score of positive
# predict_proba column 0 is P(sentiment=-1), column 1 is P(sentiment=+1).
prob = sentiment_model.predict_proba(test_matrix) # probability of negative & positive prediction
pred = sentiment_model.predict(test_matrix) # prediction
# Attach per-row probabilities and the predicted label back onto the test
# SFrame so rows can be sorted by model confidence.
test_data.add_column(sframe.SArray(prob[:,0]), name='negprob')
test_data.add_column(sframe.SArray(prob[:,1]), name='posprob')
test_data.add_column(sframe.SArray(pred), name='predt')
print test_data.sort('negprob', ascending=False).head(20)['name'] # top 20 negative prediction
print test_data.sort('posprob', ascending=False).head(20)['name'] # top 20 positive prediction
# Accuracy = fraction of test rows where predicted label matches truth;
# the +0.0 forces float division under Python 2.
print (sum(test_data['sentiment']==test_data['predt'])+0.0)/len(test_data) # prediction accuracy 0.932 for test data
#train a simpler logistic regression model using only a subet of words
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
    'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return']
# Fixing vocabulary= restricts features to exactly these 20 words, in this
# order, so coef_ column i corresponds to significant_words[i].
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(
    train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(
    test_data['review_clean'])
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])
# Word-to-weight table for inspection.
simple_model_coef_table = sframe.SFrame({'word':significant_words,
    'coefficient':simple_model.coef_.flatten()}) # 10 positive coefficients
print('check sign of coefficients') | |
for i in range(len(significant_words)): | |
idx = vectorizer.vocabulary_.keys().index(significant_words[i]) | |
print sentiment_model.coef_[0, idx]*simple_model.coef_[0, i] # check the sign of coefficients corresponding to the same feature | |
#compare the accurancy of sentiment_model vs. simple_model
predt_train_data_sentiment = sentiment_model.predict(train_matrix)
train_data.add_column(sframe.SArray(predt_train_data_sentiment), name='predt_sentiment')
predt_train_data_simple = simple_model.predict(train_matrix_word_subset)
train_data.add_column(sframe.SArray(predt_train_data_simple), name='predt_simple')
# Training accuracy of each model (+0.0 forces float division in Python 2).
print('sentiment accuracy for train data')
print (sum(train_data['sentiment']==train_data['predt_sentiment'])+0.0)/len(train_data) # 0.9685
print('simple accuracy for train data')
print (sum(train_data['sentiment']==train_data['predt_simple'])+0.0)/len(train_data) # 0.8668
# Test accuracy of the 20-word model.
predt_test_data_simple = simple_model.predict(test_matrix_word_subset)
test_data.add_column(sframe.SArray(predt_test_data_simple), name='predt_simple')
print('simple accuracy for test data')
print (sum(test_data['sentiment']==test_data['predt_simple'])+0.0)/len(test_data) # 0.86936
#majority class classifier
from collections import Counter
# Baseline: always predict the most frequent training label (+1 here);
# any useful model must beat this accuracy.
majority_predt = Counter(train_data['sentiment']).most_common(1)[0][0]
print('accuracy of majority class classifier')
print (sum(test_data['sentiment']==majority_predt)+0.0)/len(test_data) # 0.84278
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment