(Python) Extract features from Amazon product reviews. Convert a dataframe into a NumPy array. Write a function to compute the derivative of the log-likelihood function with an L2 penalty with respect to a single coefficient. Implement gradient ascent with an L2 penalty. Empirically explore how the L2 penalty can ameliorate overfitting.
#Logistic Regression with L2 regularization
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
print sum(products['sentiment']==1) #num of positive sentiment 26579
print sum(products['sentiment']==-1) #num of negative sentiment 26493
#load the 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
    important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values
products['review'] = products['review'].fillna('')
#remove punctuation
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurrence of each important word in every review
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s: s.split().count(word))
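#quick spot-check of the counting step (illustrative; assumes 'perfect' is one of the 193 words)
print sum(products['perfect']>0) #number of reviews containing the word 'perfect'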
#train/validation split
train_data, validation_data = products.random_split(0.8, seed=2)
train_data = train_data.to_dataframe()
validation_data = validation_data.to_dataframe()
#convert the dataframe to a multi-dimensional array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant']+features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()
    return (feature_matrix, label_array)
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')
#194 features (including the intercept)
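#sanity-check the shapes: one row per review, 194 columns (193 words + constant)
print feature_matrix_train.shape, sentiment_train.shape
print feature_matrix_valid.shape, sentiment_valid.shape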
#estimate the conditional probability P(y=+1|x,w) with the sigmoid link function
def predict_probability(feature_matrix, coefficients):
    #dot product of features and coefficients
    score = np.dot(feature_matrix, coefficients)
    #compute the probability using the sigmoid link function
    predictions = 1.0/(1.0+np.exp(-score))
    return predictions
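#toy sanity check (illustrative numbers, not from the dataset): with features
#[[1., 2.], [1., -1.]] and coefficients [1., -1.] the scores are [-1., 2.],
#so the printed probabilities should be roughly [0.269, 0.881]
print predict_probability(np.array([[1., 2.], [1., -1.]]), np.array([1., -1.]))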
#derivative of the log likelihood with L2 with respect to a single coefficient:
#sum_i h_j(x_i)*(1[y_i=+1]-P(y_i=+1|x_i,w)) - 2*l2_penalty*coefficient
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    derivative = np.dot(errors, feature)
    #the intercept term is not regularized
    if not feature_is_constant:
        derivative = derivative-2.0*l2_penalty*coefficient
    return derivative
#compute the log likelihood with the L2 penalty (the intercept is excluded from the penalty)
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores-np.log(1.0+np.exp(-scores))) - \
         l2_penalty*np.sum(coefficients[1:]**2)
    return lp
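#optional numerical check (not part of the original assignment): verify the analytic
#derivative against a centered finite difference of the penalized log likelihood;
#the helper name and epsilon below are illustrative choices
def gradient_check(feature_matrix, sentiment, coefficients, l2_penalty, j, eps=1e-6):
    errors = (sentiment==+1)-predict_probability(feature_matrix, coefficients)
    analytic = feature_derivative_with_L2(errors, feature_matrix[:,j],
                                          coefficients[j], l2_penalty, j==0)
    coef_hi = np.array(coefficients, dtype=float)
    coef_lo = np.array(coefficients, dtype=float)
    coef_hi[j] += eps
    coef_lo[j] -= eps
    numeric = (compute_log_likelihood_with_L2(feature_matrix, sentiment, coef_hi, l2_penalty)-
               compute_log_likelihood_with_L2(feature_matrix, sentiment, coef_lo, l2_penalty))/(2.0*eps)
    return analytic, numeric
#e.g. print gradient_check(feature_matrix_train, sentiment_train, np.zeros(194), 10.0, 5)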
#take gradient steps to maximize the penalized log likelihood
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in xrange(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment==+1)
        errors = indicator-predictions
        for j in xrange(len(coefficients)):
            is_intercept = (j==0)
            derivative = feature_derivative_with_L2(errors, feature_matrix[:,j],
                                                    coefficients[j], l2_penalty, is_intercept)
            #gradient ascent: step in the direction of increasing log likelihood
            coefficients[j] = coefficients[j]+step_size*derivative
        #report progress on a logarithmic schedule
        if itr<=15 or (itr<=100 and itr%10==0) or (itr<=1000 and itr%100==0)\
           or (itr<=10000 and itr%1000==0) or itr%10000==0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            print 'iteration %*d: log likelihood of observed labels = %.8f' %\
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients
initial_coefficients = np.zeros(194)
step_size = 5e-6
max_iter = 501
#create a table for features and learned coefficients
table = pd.DataFrame({'word':important_words})
for l2_penalty in [0, 4, 10, 1e2, 1e3, 1e5]:
    coefficients = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients,
                                               step_size, l2_penalty, max_iter)
    table[str(l2_penalty)] = list(coefficients[1:]) #drop the intercept
#5 most positive & 5 most negative words (from the unregularized run, l2_penalty=0)
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, table['0'])]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x: x[1], reverse=True)
words_by_coefficient_desc = [x[0] for x in word_coefficient_tuples]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x: x[1], reverse=False)
words_by_coefficient_asc = [x[0] for x in word_coefficient_tuples]
positive_words = words_by_coefficient_desc[0:5]
negative_words = words_by_coefficient_asc[0:5]
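#inspect the selected words (ordering comes from the sorted coefficients above)
print 'most positive words:', positive_words
print 'most negative words:', negative_words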
#observe the effect of increasing the L2 penalty on the 10 selected words
import matplotlib.pyplot as plt
#%matplotlib inline (uncomment when running in a Jupyter notebook)
plt.rcParams['figure.figsize'] = 10, 6
def make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list):
    cmap_positive = plt.get_cmap('Reds')
    cmap_negative = plt.get_cmap('Blues')
    xx = l2_penalty_list
    plt.plot(xx, [0.]*len(xx), '--', lw=1, color='k')
    table_positive_words = table[table['word'].isin(positive_words)]
    table_negative_words = table[table['word'].isin(negative_words)]
    del table_positive_words['word']
    del table_negative_words['word']
    for i in xrange(len(positive_words)):
        color = cmap_positive(0.8*((i+1)/(len(positive_words)*1.2)+0.15))
        plt.plot(xx, table_positive_words[i:i+1].as_matrix().flatten(),
                 '-', label=positive_words[i], linewidth=4.0, color=color)
    for i in xrange(len(negative_words)):
        color = cmap_negative(0.8*((i+1)/(len(negative_words)*1.2)+0.15))
        plt.plot(xx, table_negative_words[i:i+1].as_matrix().flatten(),
                 '-', label=negative_words[i], linewidth=4.0, color=color)
    plt.legend(loc='best', ncol=3, prop={'size':16}, columnspacing=0.5)
    plt.axis([1, 1e5, -1, 2])
    plt.title('Coefficient path')
    plt.xlabel(r'L2 penalty ($\lambda$)')
    plt.ylabel('Coefficient value')
    plt.xscale('log')
    plt.rcParams.update({'font.size':18})
    plt.show()
make_coefficient_plot(table, positive_words, negative_words, [0, 4, 10, 1e2, 1e3, 1e5])
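#empirically explore how the L2 penalty ameliorates overfitting (a minimal sketch of
#the step described at the top; get_classification_accuracy is a helper introduced
#here, and retraining per penalty simply repeats the runs above for clarity)
def get_classification_accuracy(feature_matrix, sentiment, coefficients):
    scores = np.dot(feature_matrix, coefficients)
    predictions = np.where(scores>0, +1, -1)
    return np.sum(predictions==sentiment)*1.0/len(sentiment)
for l2_penalty in [0, 4, 10, 1e2, 1e3, 1e5]:
    coefficients = logistic_regression_with_L2(feature_matrix_train, sentiment_train,
                                               initial_coefficients, step_size, l2_penalty, max_iter)
    print 'L2 penalty = %g' % l2_penalty
    print '  train accuracy      :', get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients)
    print '  validation accuracy :', get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients)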