Skip to content

Instantly share code, notes, and snippets.

@shengch02
Last active December 23, 2016 20:15
Show Gist options
  • Save shengch02/2523d150c977400b8528d60c9299bc2c to your computer and use it in GitHub Desktop.
Save shengch02/2523d150c977400b8528d60c9299bc2c to your computer and use it in GitHub Desktop.
(Python) Extract features from Amazon product reviews. Convert an SFrame into a NumPy array. Implement the link function for logistic regression. Write a function to compute the derivative of the log likelihood function with respect to a single coefficient. Implement gradient ascent. Given a set of coefficients, predict sentiments. Compute classification accuracy and identify the most positive words.
#implement logistic regression from scratch
import math
import pandas as pd
import numpy as np
#the dataset consists a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
products = sframe.SFrame.to_dataframe(products)
print sum(products['sentiment']==1) #num of positive sentiment 26579
print sum(products['sentiment']==-1) #num of negative sentiment 26493
#load 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values
products['review'] = products['review'].fillna('')
#remove punctuations
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character removed.

    The original used ``text.translate(None, string.punctuation)``, which
    only works for Python 2 byte strings and raises TypeError for unicode
    strings (and on Python 3).  Filtering characters directly behaves the
    same for byte strings and also works for unicode text.
    """
    import string
    return ''.join(ch for ch in text if ch not in string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurance of the words
for word in important_words:
products[word] = products['review_clean'].apply(lambda s : s.split().count(word))
#compute the number of produc reviews that contain the word 'perfect'
print sum(products['perfect']>0) #2955
#convert the data frame to a multi-dimensional array
def get_numpy_data(dataframe, features, label):
    """Convert selected columns of *dataframe* into NumPy arrays.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data.  NOTE: a 'constant' column of ones (the intercept
        term) is added to it in place, matching the original behavior.
    features : list of str
        Feature column names; 'constant' is prepended to this list.
    label : str
        Name of the label column.

    Returns
    -------
    (feature_matrix, label_array) : tuple of numpy.ndarray
        The feature matrix (with leading intercept column) and labels.
    """
    dataframe['constant'] = 1  # intercept column
    features = ['constant'] + features
    # ``.values`` works on both old and modern pandas; ``as_matrix()``
    # was deprecated in 0.23 and removed in pandas 1.0.
    feature_matrix = dataframe[features].values
    label_array = dataframe[label].values
    return (feature_matrix, label_array)
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
#194 features(including intercept)
#estimate conditional probability with link function
def predict_probability(feature_matrix, coefficients):
    """Estimate P(y = +1 | x, w) with the logistic link function.

    Parameters
    ----------
    feature_matrix : numpy.ndarray, shape (n, d)
    coefficients : numpy.ndarray, shape (d,)

    Returns
    -------
    numpy.ndarray, shape (n,)
        1 / (1 + exp(-score)) for each row's score x·w.
    """
    scores = np.dot(feature_matrix, coefficients)
    return 1.0 / (1.0 + np.exp(-scores))
#compute derivative of log likelihood with respect to a single coefficient
def feature_derivative(errors, feature):
    """Derivative of the log likelihood with respect to one coefficient:
    the dot product of the per-row errors with that feature's column."""
    return np.dot(errors, feature)
#compute log-likelihood
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the labels: sum_i (1[y_i=+1]-1)*s_i - log(1+exp(-s_i)).

    Fixes versus the original:
    * ``indicator - 1`` on a boolean array raises TypeError on modern
      NumPy (boolean subtract is unsupported), so cast to int first.
    * ``np.log(1.0 + np.exp(-scores))`` overflows for large negative
      scores; ``np.logaddexp(0.0, -scores)`` computes the same quantity
      stably.
    """
    indicator = (sentiment == +1).astype(int)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator - 1) * scores - np.logaddexp(0.0, -scores))
    return lp
#take gradient steps
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic regression coefficients by batch gradient ascent.

    Parameters
    ----------
    feature_matrix : numpy.ndarray, shape (n, d)
    sentiment : numpy.ndarray, shape (n,)
        Labels in {+1, -1}.
    initial_coefficients : array-like, shape (d,)
    step_size : float
        Gradient-ascent learning rate.
    max_iter : int
        Number of gradient steps.

    Returns
    -------
    numpy.ndarray, shape (d,)
        The fitted coefficients.

    Improvements over the original: ``range`` and ``print(...)`` work on
    both Python 2 and 3 (the original's ``xrange`` and print statement
    are Python-2-only); the boolean indicator is cast to int before
    arithmetic for modern-NumPy safety; and the per-coefficient Python
    loop is replaced by a single matrix product, which computes the same
    per-column dot products at C speed.
    """
    coefficients = np.array(initial_coefficients)
    for itr in range(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment == +1).astype(int)
        errors = indicator - predictions
        # Column j of X.T @ errors equals feature_derivative(errors, X[:, j]);
        # the whole-vector update is mathematically identical to the original
        # coefficient-by-coefficient loop (errors is fixed within an iteration).
        coefficients = coefficients + step_size * np.dot(feature_matrix.T, errors)
        # Log progress on a roughly logarithmic schedule.
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
                or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients
initial_coefficients = np.zeros(194)
step_size = 1e-7
max_iter = 301
#log likelihood increase
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)
#predict sentiments
sentiment_predict = np.dot(feature_matrix, coefficients)
sum(sentiment_predict>0) #25126 positive
predict = np.sign(sentiment_predict)
#measure accuracy
print (sum(sentiment==predict)+0.0)/len(sentiment) # 0.7519
#find the most positive words
coefficients = list(coefficients[1:])
word_coefficient_tuples =[(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)
print word_coefficient_tuples[0:10]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment