(Python) Extract features from Amazon product reviews. Convert a dataframe into a NumPy array. Write a function to compute the derivative of the log-likelihood function with an L2 penalty with respect to a single coefficient. Implement gradient ascent with an L2 penalty. Empirically explore how the L2 penalty can ameliorate overfitting.
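For reference, a sketch of the math the code below follows (under the usual formulation, with the intercept \(w_0\) excluded from the penalty): the L2-penalized log-likelihood being maximized and its per-coefficient derivative are

\[
\ell\ell(\mathbf{w}) = \sum_{i=1}^{N} \Big[ \big(\mathbf{1}[y_i = +1] - 1\big)\, \mathbf{w}^{\top} h(\mathbf{x}_i) - \ln\!\big(1 + e^{-\mathbf{w}^{\top} h(\mathbf{x}_i)}\big) \Big] \;-\; \lambda \sum_{j \ge 1} w_j^{2},
\]
\[
\frac{\partial \ell\ell}{\partial w_j} = \sum_{i=1}^{N} h_j(\mathbf{x}_i)\,\big(\mathbf{1}[y_i = +1] - P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w})\big) \;-\; 2\lambda w_j \quad (\text{the } -2\lambda w_j \text{ term is dropped for } j = 0),
\]

where \(h(\mathbf{x}_i)\) is the feature vector (constant plus word counts) and gradient ascent updates each coefficient by \(w_j \leftarrow w_j + \eta\, \partial \ell\ell / \partial w_j\).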
#Logistic Regression with L2 regularization
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
print sum(products['sentiment']==1)  #num of positive-sentiment reviews: 26579
print sum(products['sentiment']==-1) #num of negative-sentiment reviews: 26493
#load the 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
    important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values in the review column
products['review'] = products['review'].fillna('')
#remove punctuation
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurrences of each important word in each review
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s: s.split().count(word))
#train/validation split
train_data, validation_data = products.random_split(0.8, seed=2)
train_data = sframe.SFrame.to_dataframe(train_data)
validation_data = sframe.SFrame.to_dataframe(validation_data)
#convert the dataframe into a 2D NumPy feature matrix and a 1D label array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()
    return (feature_matrix, label_array)
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')
#194 features in total (including the intercept)
#estimate the conditional probability P(y=+1|x,w) via the logistic link function
def predict_probability(feature_matrix, coefficients):
    #dot product of features and coefficients
    score = np.dot(feature_matrix, coefficients)
    #compute probability using the link function
    predictions = 1.0/(1.0+np.exp(-score))
    return predictions
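#Illustrative sanity check (not part of the original gist): with all-zero
#coefficients the link function should return probability 0.5 for every row.
_example_matrix = np.array([[1., 2., 3.], [1., -1., 0.]])
assert np.allclose(predict_probability(_example_matrix, np.zeros(3)), 0.5)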
#compute the derivative of the log likelihood with L2 penalty with respect to a single coefficient
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    derivative = np.dot(errors, feature)
    #the L2 term -2*lambda*w_j applies to every coefficient except the intercept
    if not feature_is_constant:
        derivative = derivative - 2.0*l2_penalty*coefficient
    return derivative
#compute the log-likelihood with the L2 penalty (intercept excluded from the penalty)
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1.0+np.exp(-scores))) - \
         l2_penalty*np.sum(coefficients[1:]**2)
    return lp
#take gradient ascent steps on the penalized log-likelihood
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in xrange(max_iter):
        #P(y=+1|x,w) for every data point under the current coefficients
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment==+1)
        errors = indicator - predictions
        for j in xrange(len(coefficients)):
            is_intercept = (j==0)
            derivative = feature_derivative_with_L2(errors, feature_matrix[:,j],
                                                    coefficients[j], l2_penalty, is_intercept)
            #ascent step: move each coefficient in the direction of its derivative
            coefficients[j] = coefficients[j] + step_size*derivative
        #report progress on a logarithmically spaced schedule of iterations
        if itr<=15 or (itr<=100 and itr%10==0) or (itr<=1000 and itr%100==0) \
           or (itr<=10000 and itr%1000==0) or itr%10000==0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients
initial_coefficients = np.zeros(194)
step_size = 5e-6
max_iter = 501
#create a table holding the learned word coefficients for each L2 penalty
table = pd.DataFrame({'word': important_words})
for l2_penalty in [0, 4, 10, 1e2, 1e3, 1e5]:
    coefficients = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients,
                                               step_size, l2_penalty, max_iter)
    #drop the intercept so the table has one row per word
    table[str(l2_penalty)] = list(coefficients[1:])
#5 most positive and 5 most negative words under the no-penalty (l2_penalty=0) model
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, table['0'])]
words_by_coefficient_desc = sorted(word_coefficient_tuples, key=lambda x: x[1], reverse=True)
words_by_coefficient_asc = sorted(word_coefficient_tuples, key=lambda x: x[1], reverse=False)
positive_words = [x[0] for x in words_by_coefficient_desc[0:5]]
negative_words = [x[0] for x in words_by_coefficient_asc[0:5]]
#observe the effect of increasing the L2 penalty on the 10 words selected above
import matplotlib.pyplot as plt
#%matplotlib inline (when running inside a Jupyter/IPython notebook)
plt.rcParams['figure.figsize'] = 10, 6
def make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list):
    cmap_positive = plt.get_cmap('Reds')
    cmap_negative = plt.get_cmap('Blues')
    xx = l2_penalty_list
    plt.plot(xx, [0.]*len(xx), '--', lw=1, color='k')
    table_positive_words = table[table['word'].isin(positive_words)]
    table_negative_words = table[table['word'].isin(negative_words)]
    del table_positive_words['word']
    del table_negative_words['word']
    for i in xrange(len(positive_words)):
        color = cmap_positive(0.8*((i+1)/(len(positive_words)*1.2)+0.15))
        plt.plot(xx, table_positive_words[i:i+1].as_matrix().flatten(),
                 '-', label=positive_words[i], linewidth=4.0, color=color)
    for i in xrange(len(negative_words)):
        color = cmap_negative(0.8*((i+1)/(len(negative_words)*1.2)+0.15))
        plt.plot(xx, table_negative_words[i:i+1].as_matrix().flatten(),
                 '-', label=negative_words[i], linewidth=4.0, color=color)
    plt.legend(loc='best', ncol=3, prop={'size':16}, columnspacing=0.5)
    plt.axis([1, 1e5, -1, 2])
    plt.title('Coefficient path')
    plt.xlabel(r'L2 penalty ($\lambda$)')
    plt.ylabel('Coefficient value')
    plt.xscale('log')
    plt.rcParams.update({'font.size':18})
    plt.show()
make_coefficient_plot(table, positive_words, negative_words, [0, 4, 10, 1e2, 1e3, 1e5])
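#Illustrative extension (not in the original gist): to see how the L2 penalty
#ameliorates overfitting, compare training and validation accuracy for each
#penalty, classifying a review as +1 when its score is positive and -1 otherwise.
#The helper compute_accuracy below is introduced here for this sketch only.
def compute_accuracy(feature_matrix, sentiment, coefficients):
    scores = np.dot(feature_matrix, coefficients)
    predictions = np.where(scores > 0, +1, -1)
    return np.mean(predictions == sentiment)
for l2_penalty in [0, 4, 10, 1e2, 1e3, 1e5]:
    coefficients_full = logistic_regression_with_L2(feature_matrix_train, sentiment_train,
                                                    initial_coefficients, step_size,
                                                    l2_penalty, max_iter)
    print 'L2 penalty = %g: train accuracy = %.4f, validation accuracy = %.4f' % \
        (l2_penalty,
         compute_accuracy(feature_matrix_train, sentiment_train, coefficients_full),
         compute_accuracy(feature_matrix_valid, sentiment_valid, coefficients_full))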