#(Python) Extract features from Amazon product reviews. Convert an SFrame into
#a NumPy array. Implement the link function for logistic regression. Write a
#function to compute the derivative of the log likelihood function with respect
#to a single coefficient. Implement gradient ascent. Given a set of
#coefficients, predict sentiments. Compute classification accuracy.
#implement logistic regression from scratch
import math
import pandas as pd
import numpy as np
#the dataset consists of a subset of baby product reviews on Amazon.com
import sframe
products = sframe.SFrame('amazon_baby_subset.gl/')
products = sframe.SFrame.to_dataframe(products)
print sum(products['sentiment']==1) #number of positive-sentiment reviews: 26579
print sum(products['sentiment']==-1) #number of negative-sentiment reviews: 26493
#load the 193 most frequent words from a JSON file
import json
with open('important_words.json') as json_data:
    important_words = json.load(json_data)
important_words = [str(x) for x in important_words]
#fill n/a values
products['review'] = products['review'].fillna('')
#remove punctuation
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
#count the occurrences of the important words
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))
#compute the number of product reviews that contain the word 'perfect'
print sum(products['perfect']>0) #2955
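#illustrative aside (hypothetical strings, not part of the dataset):
#str.split().count(word) counts whole-word occurrences only, e.g.
print 'perfect day a perfect gift'.split().count('perfect') #2
print 'perfectly'.split().count('perfect') #0, substrings do not match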
#convert the data frame to a multi-dimensional array
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant']+features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()
    return (feature_matrix, label_array)
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
#194 features (including the intercept)
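#sanity check on shapes (derived from the counts above): 26579 positive +
#26493 negative reviews = 53072 rows, 1 intercept + 193 words = 194 columns
print feature_matrix.shape #(53072, 194)
print sentiment.shape #(53072,)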
#estimate the conditional probability P(y=+1|x,w) with the link function
def predict_probability(feature_matrix, coefficients):
    #dot product of features and coefficients
    score = np.dot(feature_matrix, coefficients)
    #compute the probability using the sigmoid link function 1/(1+exp(-score))
    predictions = 1.0/(1.0+np.exp(-score))
    return predictions
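#quick sanity check (illustrative, with a hypothetical toy matrix): zero
#coefficients give a score of 0, so every probability is 1/(1+exp(0)) = 0.5
toy_features = np.array([[1., 2., 3.], [1., -1., -1.]])
print predict_probability(toy_features, np.zeros(3)) #[0.5 0.5]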
#compute the derivative of the log likelihood with respect to a single coefficient
def feature_derivative(errors, feature):
    derivative = np.dot(errors, feature)
    return derivative
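#illustrative check with made-up numbers: the derivative w.r.t. coefficient j
#is sum_i (indicator_i - prediction_i)*feature_ij, i.e. errors dot feature
toy_errors = np.array([0.5, -0.5])
toy_feature = np.array([2., 3.])
print feature_derivative(toy_errors, toy_feature) #0.5*2 + (-0.5)*3 = -0.5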
#compute the log-likelihood over the whole dataset
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores-np.log(1.0+np.exp(-scores)))
    return lp
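#where the formula above comes from (derivation, for reference): with s = w'x,
# log P(y=+1|x,w) = -log(1+exp(-s))
# log P(y=-1|x,w) = -s - log(1+exp(-s))
#so indicator*logP(+1) + (1-indicator)*logP(-1) collapses, per example, to
# (indicator-1)*s - log(1+exp(-s))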
#take gradient steps
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in xrange(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment==+1)
        error = indicator-predictions
        for j in xrange(len(coefficients)):
            derivative = feature_derivative(error, feature_matrix[:,j])
            coefficients[j] = coefficients[j]+step_size*derivative
        if itr<=15 or (itr<=100 and itr%10==0) or (itr<=1000 and itr%100==0)\
                or (itr<=10000 and itr%1000==0) or itr%10000==0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print 'iteration %*d: log likelihood of observed labels = %.8f' %\
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients
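#a minimal vectorized sketch (an alternative to the loop above, not the
#assignment's version): the inner loop over j is one matrix-vector product
def logistic_regression_vectorized(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in xrange(max_iter):
        #errors for all examples at once
        error = (sentiment==+1)-predict_probability(feature_matrix, coefficients)
        #single gradient step: feature_matrix.T dot error replaces the j-loop
        coefficients = coefficients+step_size*np.dot(feature_matrix.T, error)
    return coefficients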
initial_coefficients = np.zeros(194)
step_size = 1e-7
max_iter = 301
#the log likelihood increases over the iterations
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)
#predict sentiments from the scores
sentiment_predict = np.dot(feature_matrix, coefficients)
print sum(sentiment_predict>0) #25126 positive predictions
predict = np.sign(sentiment_predict)
#measure accuracy on the training data
print (sum(sentiment==predict)+0.0)/len(sentiment) #0.7519
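#note (an aside): np.sign maps a score of exactly 0 to 0 rather than -1;
#real-valued scores make exact ties unlikely, but a strict +/-1 rule would be
#predict = np.where(sentiment_predict>0, 1, -1)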
#find the most positive words (drop the intercept, then sort by coefficient)
coefficients = list(coefficients[1:])
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)
print word_coefficient_tuples[0:10]
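#by symmetry (an aside), the most negative words sit at the tail of the same
#sorted list
print word_coefficient_tuples[-10:]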