(Python) Extract features from Amazon product reviews. Convert a dataframe into a NumPy array. Write a function to compute the derivative of the log-likelihood function with an L2 penalty with respect to a single coefficient. Implement gradient ascent with an L2 penalty. Empirically explore how the L2 penalty can ameliorate overfitting.
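For reference, a sketch of the math the code below follows (under the usual formulation, with the intercept \(w_0\) excluded from the penalty): the L2-penalized log-likelihood being maximized and its per-coefficient derivative are

\[
\ell\ell(\mathbf{w}) = \sum_{i=1}^{N} \Big[ \big(\mathbf{1}[y_i = +1] - 1\big)\, \mathbf{w}^{\top} h(\mathbf{x}_i) - \ln\!\big(1 + e^{-\mathbf{w}^{\top} h(\mathbf{x}_i)}\big) \Big] \;-\; \lambda \sum_{j \ge 1} w_j^{2},
\]
\[
\frac{\partial \ell\ell}{\partial w_j} = \sum_{i=1}^{N} h_j(\mathbf{x}_i)\,\big(\mathbf{1}[y_i = +1] - P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w})\big) \;-\; 2\lambda w_j \quad (\text{the } -2\lambda w_j \text{ term is dropped for } j = 0),
\]

where \(h(\mathbf{x}_i)\) is the feature vector (constant plus word counts) and gradient ascent updates each coefficient by \(w_j \leftarrow w_j + \eta\, \partial \ell\ell / \partial w_j\).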
#Logistic Regression with L2 regularization
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
print sum(products['sentiment']==1)  #num of positive-sentiment reviews: 26579
print sum(products['sentiment']==-1) #num of negative-sentiment reviews: 26493
#load the 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
    important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values in the review column
products['review'] = products['review'].fillna('')
#remove punctuation
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurrences of each important word in each review
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s: s.split().count(word))
#train/validation split
train_data, validation_data = products.random_split(0.8, seed=2)
train_data = sframe.SFrame.to_dataframe(train_data)
validation_data = sframe.SFrame.to_dataframe(validation_data)
#convert the dataframe into a 2D NumPy feature matrix and a 1D label array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()
    return (feature_matrix, label_array)
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')
#194 features in total (including the intercept)
#estimate the conditional probability P(y=+1|x,w) via the logistic link function
def predict_probability(feature_matrix, coefficients):
    #dot product of features and coefficients
    score = np.dot(feature_matrix, coefficients)
    #compute probability using the link function
    predictions = 1.0/(1.0+np.exp(-score))
    return predictions
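#Illustrative sanity check (not part of the original gist): with all-zero
#coefficients the link function should return probability 0.5 for every row.
_example_matrix = np.array([[1., 2., 3.], [1., -1., 0.]])
assert np.allclose(predict_probability(_example_matrix, np.zeros(3)), 0.5)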
#compute the derivative of the log likelihood with L2 penalty with respect to a single coefficient
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    derivative = np.dot(errors, feature)
    #the L2 term -2*lambda*w_j applies to every coefficient except the intercept
    if not feature_is_constant:
        derivative = derivative - 2.0*l2_penalty*coefficient
    return derivative
#compute the log-likelihood with the L2 penalty (intercept excluded from the penalty)
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1.0+np.exp(-scores))) - \
         l2_penalty*np.sum(coefficients[1:]**2)
    return lp
#take gradient ascent steps on the penalized log-likelihood
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in xrange(max_iter):
        #P(y=+1|x,w) for every data point under the current coefficients
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment==+1)
        errors = indicator - predictions
        for j in xrange(len(coefficients)):
            is_intercept = (j==0)
            derivative = feature_derivative_with_L2(errors, feature_matrix[:,j],
                                                    coefficients[j], l2_penalty, is_intercept)
            #ascent step: move each coefficient in the direction of its derivative
            coefficients[j] = coefficients[j] + step_size*derivative
        #report progress on a logarithmically spaced schedule of iterations
        if itr<=15 or (itr<=100 and itr%10==0) or (itr<=1000 and itr%100==0) \
           or (itr<=10000 and itr%1000==0) or itr%10000==0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients
initial_coefficients = np.zeros(194)
step_size = 5e-6
max_iter = 501
#create a table holding the learned word coefficients for each L2 penalty
table = pd.DataFrame({'word': important_words})
for l2_penalty in [0, 4, 10, 1e2, 1e3, 1e5]:
    coefficients = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients,
                                               step_size, l2_penalty, max_iter)
    #drop the intercept so the table has one row per word
    table[str(l2_penalty)] = list(coefficients[1:])
#5 most positive and 5 most negative words under the no-penalty (l2_penalty=0) model
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, table['0'])]
words_by_coefficient_desc = sorted(word_coefficient_tuples, key=lambda x: x[1], reverse=True)
words_by_coefficient_asc = sorted(word_coefficient_tuples, key=lambda x: x[1], reverse=False)
positive_words = [x[0] for x in words_by_coefficient_desc[0:5]]
negative_words = [x[0] for x in words_by_coefficient_asc[0:5]]
#observe the effect of increasing the L2 penalty on the 10 words selected above
import matplotlib.pyplot as plt
#%matplotlib inline (when running inside a Jupyter/IPython notebook)
plt.rcParams['figure.figsize'] = 10, 6
def make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list):
    cmap_positive = plt.get_cmap('Reds')
    cmap_negative = plt.get_cmap('Blues')
    xx = l2_penalty_list
    plt.plot(xx, [0.]*len(xx), '--', lw=1, color='k')
    table_positive_words = table[table['word'].isin(positive_words)]
    table_negative_words = table[table['word'].isin(negative_words)]
    del table_positive_words['word']
    del table_negative_words['word']
    for i in xrange(len(positive_words)):
        color = cmap_positive(0.8*((i+1)/(len(positive_words)*1.2)+0.15))
        plt.plot(xx, table_positive_words[i:i+1].as_matrix().flatten(),
                 '-', label=positive_words[i], linewidth=4.0, color=color)
    for i in xrange(len(negative_words)):
        color = cmap_negative(0.8*((i+1)/(len(negative_words)*1.2)+0.15))
        plt.plot(xx, table_negative_words[i:i+1].as_matrix().flatten(),
                 '-', label=negative_words[i], linewidth=4.0, color=color)
    plt.legend(loc='best', ncol=3, prop={'size':16}, columnspacing=0.5)
    plt.axis([1, 1e5, -1, 2])
    plt.title('Coefficient path')
    plt.xlabel(r'L2 penalty ($\lambda$)')
    plt.ylabel('Coefficient value')
    plt.xscale('log')
    plt.rcParams.update({'font.size':18})
    plt.show()
make_coefficient_plot(table, positive_words, negative_words, [0, 4, 10, 1e2, 1e3, 1e5])
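#Illustrative extension (not in the original gist): to see how the L2 penalty
#ameliorates overfitting, compare training and validation accuracy for each
#penalty, classifying a review as +1 when its score is positive and -1 otherwise.
#The helper compute_accuracy below is introduced here for this sketch only.
def compute_accuracy(feature_matrix, sentiment, coefficients):
    scores = np.dot(feature_matrix, coefficients)
    predictions = np.where(scores > 0, +1, -1)
    return np.mean(predictions == sentiment)
for l2_penalty in [0, 4, 10, 1e2, 1e3, 1e5]:
    coefficients_full = logistic_regression_with_L2(feature_matrix_train, sentiment_train,
                                                    initial_coefficients, step_size,
                                                    l2_penalty, max_iter)
    print 'L2 penalty = %g: train accuracy = %.4f, validation accuracy = %.4f' % \
        (l2_penalty,
         compute_accuracy(feature_matrix_train, sentiment_train, coefficients_full),
         compute_accuracy(feature_matrix_valid, sentiment_valid, coefficients_full))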