# (Python) Use SFrames to do some feature engineering.
# Train a logistic regression model to predict the sentiment of product reviews.
# Inspect the weights (coefficients) of a trained logistic regression model.
# Make a prediction (both class and probability) of sentiment for a new product review.
# Given the logistic regression weights, predictors and g…
#the dataset consists of baby product reviews on Amazon.com
#link for data: https://d18ky98rnyall9.cloudfront.net/_35bdebdff61378878ea2247780005e52_amazon_baby.gl.zip?Expires=1482278400&Signature=blPJv6YQNFgcZh~dULuDECzZlA6eGL1x9lzQKzHknqVHSdudmfjq0XPaokFjv-~Qy8nGADiBBdx4ar0BWgeboW1eTkYHOZzoUIMBfSPQGqA4Q9H8X8vwFyr9R-TC0LE4h4CsTRFH56BtbqpKtjKeJKxVv5E5LfZZiyhZEr6We5M_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A
import sframe
products = sframe.SFrame('amazon_baby.gl/')
#clean the original data: remove punctuation, fill in N/A, remove neutral sentiment,
# perform a train/test split, produce word count matrix
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)  # Python 2 str.translate
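# note: str.translate(None, table) only works in Python 2; a minimal Python 3
# equivalent (a sketch, with a hypothetical helper name) would be:
def remove_punctuation_py3(text):
    import string
    return text.translate(str.maketrans('', '', string.punctuation))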
products['review_clean'] = products['review'].apply(remove_punctuation)
products = products.fillna('review','')
products = products[products['rating'] != 3]
products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating > 3 else -1)
train_data, test_data = products.random_split(0.8, seed=1)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
print(train_matrix)
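# train_matrix is a scipy.sparse word-count matrix with one row per training
# review and one column per vocabulary word; a quick shape check (sketch):
print(train_matrix.shape)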
#scikit learn, LogisticRegression(), training
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])
print(len(sentiment_model.coef_[0])) # number of coefficients/weights, 121712
print(sum(x >=0 for x in sentiment_model.coef_[0])) # number of positive weights, 86785
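# sanity check (a sketch, not part of the original gist): map coefficients back
# to words to see the most extreme ones; assumes the older scikit-learn API
# get_feature_names() (newer releases use get_feature_names_out())
import numpy as np
words = np.array(vectorizer.get_feature_names())  # column order matches coef_
order = np.argsort(sentiment_model.coef_[0])
print(words[order[-10:]])  # 10 most positively weighted words
print(words[order[:10]])   # 10 most negatively weighted words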
#making predictions with logistic regression
sample_test_data = test_data[10:13]
print(sample_test_data)
print(sample_test_data[0]['review'])
print(sample_test_data[1]['review'])
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)  # a positive score predicts the positive class; the review at index 10 gets the most positive score
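# the probability of the positive class is the sigmoid of the score,
# P(y=+1|x) = 1 / (1 + exp(-score)); a quick numerical check (sketch):
import numpy as np
print(1.0 / (1.0 + np.exp(-scores)))
print(sentiment_model.predict_proba(sample_test_matrix)[:, 1])  # should match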
score_pos = sentiment_model.decision_function(test_matrix)  # scores for the positive class
prob = sentiment_model.predict_proba(test_matrix)  # columns: P(negative), P(positive)
pred = sentiment_model.predict(test_matrix)  # predicted class labels (+1/-1)
test_data.add_column(sframe.SArray(prob[:,0]), name='negprob')
test_data.add_column(sframe.SArray(prob[:,1]), name='posprob')
test_data.add_column(sframe.SArray(pred), name='predt')
print(test_data.sort('negprob', ascending=False).head(20)['name'])  # 20 most confidently negative predictions
print(test_data.sort('posprob', ascending=False).head(20)['name'])  # 20 most confidently positive predictions
print(float(sum(test_data['sentiment'] == test_data['predt'])) / len(test_data))  # test accuracy, 0.932
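# equivalent accuracy computation with scikit-learn's accuracy_score (sketch):
from sklearn.metrics import accuracy_score
print(accuracy_score(list(test_data['sentiment']), pred))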
#train a simpler logistic regression model using only a subset of words
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
                     'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed',
                     'work', 'product', 'money', 'would', 'return']
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(
    train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(
    test_data['review_clean'])
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])
simple_model_coef_table = sframe.SFrame({'word': significant_words,
                                         'coefficient': simple_model.coef_.flatten()})  # 10 of the 20 coefficients are positive
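# rank the subset words by learned weight for a quick inspection (sketch):
print(simple_model_coef_table.sort('coefficient', ascending=False))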
print('check sign of coefficients')
# vectorizer.vocabulary_ maps each word directly to its column index in the
# count matrix; .keys().index() would return an arbitrary position, not the column
for i, word in enumerate(significant_words):
    idx = vectorizer.vocabulary_[word]
    print(sentiment_model.coef_[0, idx] * simple_model.coef_[0, i])  # a positive product means both models agree on the word's sign
#compare the accuracy of sentiment_model vs. simple_model
predt_train_data_sentiment = sentiment_model.predict(train_matrix)
train_data.add_column(sframe.SArray(predt_train_data_sentiment), name='predt_sentiment')
predt_train_data_simple = simple_model.predict(train_matrix_word_subset)
train_data.add_column(sframe.SArray(predt_train_data_simple), name='predt_simple')
print('sentiment accuracy for train data')
print(float(sum(train_data['sentiment'] == train_data['predt_sentiment'])) / len(train_data))  # 0.9685
print('simple accuracy for train data')
print(float(sum(train_data['sentiment'] == train_data['predt_simple'])) / len(train_data))  # 0.8668
predt_test_data_simple = simple_model.predict(test_matrix_word_subset)
test_data.add_column(sframe.SArray(predt_test_data_simple), name='predt_simple')
print('simple accuracy for test data')
print(float(sum(test_data['sentiment'] == test_data['predt_simple'])) / len(test_data))  # 0.86936
#majority class classifier
from collections import Counter
majority_predt = Counter(train_data['sentiment']).most_common(1)[0][0]
print('accuracy of majority class classifier')
print(float(sum(test_data['sentiment'] == majority_predt)) / len(test_data))  # 0.84278
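# the majority baseline mirrors the class balance: when the majority class is
# +1, its test accuracy equals the fraction of positive test labels (sketch):
print(float(sum(test_data['sentiment'] == 1)) / len(test_data))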