#(Python) Extract features from Amazon product reviews. Convert an SFrame into
#a NumPy array. Implement the link function for logistic regression. Write a
#function to compute the derivative of the log likelihood function with respect
#to a single coefficient. Implement gradient ascent. Given a set of
#coefficients, predict sentiments. Compute classification accuracy.
#implement logistic regression from scratch
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
products = sframe.SFrame.to_dataframe(products)
print sum(products['sentiment']==1) #number of positive-sentiment reviews: 26579
print sum(products['sentiment']==-1) #number of negative-sentiment reviews: 26493
#load the 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
    important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values
products['review'] = products['review'].fillna('')
#remove punctuation
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurrences of the important words
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))
#compute the number of product reviews that contain the word 'perfect'
print sum(products['perfect']>0) #2955
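#illustrative aside (hypothetical strings, not part of the dataset):
#str.split().count(word) counts whole-word occurrences only, e.g.
print 'perfect day a perfect gift'.split().count('perfect') #2
print 'perfectly'.split().count('perfect') #0, substrings do not match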
#convert the data frame to a multi-dimensional array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant']+features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()
    return (feature_matrix, label_array)
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
#194 features (including the intercept)
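#sanity check on shapes (derived from the counts above): 26579 positive +
#26493 negative reviews = 53072 rows, 1 intercept + 193 words = 194 columns
print feature_matrix.shape #(53072, 194)
print sentiment.shape #(53072,)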
#estimate the conditional probability P(y=+1|x,w) with the link function
def predict_probability(feature_matrix, coefficients):
    #dot product of features and coefficients
    score = np.dot(feature_matrix, coefficients)
    #compute the probability using the sigmoid link function 1/(1+exp(-score))
    predictions = 1.0/(1.0+np.exp(-score))
    return predictions
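#quick sanity check (illustrative, with a hypothetical toy matrix): zero
#coefficients give a score of 0, so every probability is 1/(1+exp(0)) = 0.5
toy_features = np.array([[1., 2., 3.], [1., -1., -1.]])
print predict_probability(toy_features, np.zeros(3)) #[0.5 0.5]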
#compute the derivative of the log likelihood with respect to a single coefficient
def feature_derivative(errors, feature):
    derivative = np.dot(errors, feature)
    return derivative
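#illustrative check with made-up numbers: the derivative w.r.t. coefficient j
#is sum_i (indicator_i - prediction_i)*feature_ij, i.e. errors dot feature
toy_errors = np.array([0.5, -0.5])
toy_feature = np.array([2., 3.])
print feature_derivative(toy_errors, toy_feature) #0.5*2 + (-0.5)*3 = -0.5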
#compute the log-likelihood over the whole dataset
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores-np.log(1.0+np.exp(-scores)))
    return lp
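#where the formula above comes from (derivation, for reference): with s = w'x,
# log P(y=+1|x,w) = -log(1+exp(-s))
# log P(y=-1|x,w) = -s - log(1+exp(-s))
#so indicator*logP(+1) + (1-indicator)*logP(-1) collapses, per example, to
# (indicator-1)*s - log(1+exp(-s))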
#take gradient steps
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in xrange(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment==+1)
        error = indicator-predictions
        for j in xrange(len(coefficients)):
            derivative = feature_derivative(error, feature_matrix[:,j])
            coefficients[j] = coefficients[j]+step_size*derivative
        if itr<=15 or (itr<=100 and itr%10==0) or (itr<=1000 and itr%100==0)\
                or (itr<=10000 and itr%1000==0) or itr%10000==0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print 'iteration %*d: log likelihood of observed labels = %.8f' %\
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients
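#a minimal vectorized sketch (an alternative to the loop above, not the
#assignment's version): the inner loop over j is one matrix-vector product
def logistic_regression_vectorized(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in xrange(max_iter):
        #errors for all examples at once
        error = (sentiment==+1)-predict_probability(feature_matrix, coefficients)
        #single gradient step: feature_matrix.T dot error replaces the j-loop
        coefficients = coefficients+step_size*np.dot(feature_matrix.T, error)
    return coefficients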
initial_coefficients = np.zeros(194)
step_size = 1e-7
max_iter = 301
#the log likelihood increases over the iterations
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)
#predict sentiments from the scores
sentiment_predict = np.dot(feature_matrix, coefficients)
print sum(sentiment_predict>0) #25126 positive predictions
predict = np.sign(sentiment_predict)
#measure accuracy on the training data
print (sum(sentiment==predict)+0.0)/len(sentiment) #0.7519
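#note (an aside): np.sign maps a score of exactly 0 to 0 rather than -1;
#real-valued scores make exact ties unlikely, but a strict +/-1 rule would be
#predict = np.where(sentiment_predict>0, 1, -1)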
#find the most positive words (drop the intercept, then sort by coefficient)
coefficients = list(coefficients[1:])
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)
print word_coefficient_tuples[0:10]
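#by symmetry (an aside), the most negative words sit at the tail of the same
#sorted list
print word_coefficient_tuples[-10:]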