Last active
December 23, 2016 20:17
-
-
Save shengch02/6edfa765276c3731be29b7c0f83c61af to your computer and use it in GitHub Desktop.
(Python) Use SFrames to do some feature engineering Train a logistic regression model to predict the sentiment of product reviews. Inspect the weights (coefficients) of a trained logistic regression model. Make a prediction (both class and probability) of sentiment for a new product review. Given the logistic regression weights, predictors and g…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#the dataset consists of baby product reviews on Amazon.com
#link for data: https://d18ky98rnyall9.cloudfront.net/_35bdebdff61378878ea2247780005e52_amazon_baby.gl.zip?Expires=1482278400&Signature=blPJv6YQNFgcZh~dULuDECzZlA6eGL1x9lzQKzHknqVHSdudmfjq0XPaokFjv-~Qy8nGADiBBdx4ar0BWgeboW1eTkYHOZzoUIMBfSPQGqA4Q9H8X8vwFyr9R-TC0LE4h4CsTRFH56BtbqpKtjKeJKxVv5E5LfZZiyhZEr6We5M_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A
# NOTE(review): the signed CloudFront URL above carries an Expires=1482278400
# timestamp (Dec 2016) and is almost certainly dead — the .gl archive must be
# obtained from the course materials and unpacked next to this script.
import sframe
# Load the review table (columns used below: 'name', 'review', 'rating').
products = sframe.SFrame('amazon_baby.gl/')
#clean the original data: remove punctuation, fill in N/A, remove neutral sentiment,
# perform a train/test split, produce word count matrix
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    Parameters
    ----------
    text : str or unicode
        Raw review text.

    Returns
    -------
    Same type as *text*, with all characters in ``string.punctuation``
    deleted; an empty input yields an empty string.

    The original implementation used the Python-2-only
    ``str.translate(None, string.punctuation)`` form, which raises
    TypeError when *text* is a unicode string (common for scraped review
    data) and does not exist in Python 3. Filtering characters directly
    is equivalent for plain str input and works for both types.
    """
    import string
    return ''.join(ch for ch in text if ch not in string.punctuation)
# Clean: strip punctuation, replace missing reviews with the empty string,
# and drop neutral (3-star) rows so every remaining row has a clear label.
products['review_clean']=products['review'].apply(remove_punctuation)
products = products.fillna('review','')
products = products[products['rating'] != 3]
# Binary label: +1 for 4/5-star reviews, -1 for 1/2-star reviews.
products['sentiment']=products['rating'].apply(lambda rating : +1
    if rating > 3 else -1)
# Deterministic 80/20 split (seed fixed so the quiz numbers reproduce).
train_data, test_data = products.random_split(0.8, seed=1)
from sklearn.feature_extraction.text import CountVectorizer
# token_pattern r'\b\w+\b' keeps single-character tokens, which the default
# sklearn pattern would drop.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Fit the vocabulary on training data only; reuse it for the test matrix.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
print train_matrix
#scikit learn, LogisticRegression(), training
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])
# coef_ has shape (1, n_features); one weight per vocabulary word.
print(len(sentiment_model.coef_[0])) # number of coefficients/weights, 121712
# NOTE(review): this counts non-negative weights (>= 0), although the
# trailing comment says "positive" — exact zeros are included. Confirm
# whether >= 0 is intended before changing it (the 86785 figure was
# produced with >= 0).
print(sum(x >=0 for x in sentiment_model.coef_[0])) # number of positive weights, 86785
#making predictions with logistic regression
# Inspect three sample rows first, then score the whole test set.
sample_test_data = test_data[10:13]
print sample_test_data
print sample_test_data[0]['review']
print sample_test_data[1]['review']
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
# decision_function returns the raw margin w.x + b for each row.
scores = sentiment_model.decision_function(sample_test_matrix)
print scores # positive scores correspond to positive class prediction, 10 gets the most positive review
score_pos = sentiment_model.decision_function(test_matrix) # score of positive
# predict_proba column 0 is P(sentiment=-1), column 1 is P(sentiment=+1).
prob = sentiment_model.predict_proba(test_matrix) # probability of negative & positive prediction
pred = sentiment_model.predict(test_matrix) # prediction
# Attach per-row probabilities and the predicted label back onto the test
# SFrame so rows can be sorted by model confidence.
test_data.add_column(sframe.SArray(prob[:,0]), name='negprob')
test_data.add_column(sframe.SArray(prob[:,1]), name='posprob')
test_data.add_column(sframe.SArray(pred), name='predt')
print test_data.sort('negprob', ascending=False).head(20)['name'] # top 20 negative prediction
print test_data.sort('posprob', ascending=False).head(20)['name'] # top 20 positive prediction
# Accuracy = fraction of test rows where predicted label matches truth;
# the +0.0 forces float division under Python 2.
print (sum(test_data['sentiment']==test_data['predt'])+0.0)/len(test_data) # prediction accuracy 0.932 for test data
#train a simpler logistic regression model using only a subet of words
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
    'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return']
# Fixing vocabulary= restricts features to exactly these 20 words, in this
# order, so coef_ column i corresponds to significant_words[i].
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(
    train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(
    test_data['review_clean'])
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])
# Word-to-weight table for inspection.
simple_model_coef_table = sframe.SFrame({'word':significant_words,
    'coefficient':simple_model.coef_.flatten()}) # 10 positive coefficients
print('check sign of coefficients') | |
for i in range(len(significant_words)): | |
idx = vectorizer.vocabulary_.keys().index(significant_words[i]) | |
print sentiment_model.coef_[0, idx]*simple_model.coef_[0, i] # check the sign of coefficients corresponding to the same feature | |
#compare the accurancy of sentiment_model vs. simple_model
predt_train_data_sentiment = sentiment_model.predict(train_matrix)
train_data.add_column(sframe.SArray(predt_train_data_sentiment), name='predt_sentiment')
predt_train_data_simple = simple_model.predict(train_matrix_word_subset)
train_data.add_column(sframe.SArray(predt_train_data_simple), name='predt_simple')
# Training accuracy of each model (+0.0 forces float division in Python 2).
print('sentiment accuracy for train data')
print (sum(train_data['sentiment']==train_data['predt_sentiment'])+0.0)/len(train_data) # 0.9685
print('simple accuracy for train data')
print (sum(train_data['sentiment']==train_data['predt_simple'])+0.0)/len(train_data) # 0.8668
# Test accuracy of the 20-word model.
predt_test_data_simple = simple_model.predict(test_matrix_word_subset)
test_data.add_column(sframe.SArray(predt_test_data_simple), name='predt_simple')
print('simple accuracy for test data')
print (sum(test_data['sentiment']==test_data['predt_simple'])+0.0)/len(test_data) # 0.86936
#majority class classifier
from collections import Counter
# Baseline: always predict the most frequent training label (+1 here);
# any useful model must beat this accuracy.
majority_predt = Counter(train_data['sentiment']).most_common(1)[0][0]
print('accuracy of majority class classifier')
print (sum(test_data['sentiment']==majority_predt)+0.0)/len(test_data) # 0.84278
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment