Experiment on logistic regression
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from sklearn.datasets import load_iris
import numpy as np
import math
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.optimize import fmin_bfgs

#==============================================================================
# Circular Data
data_circular = np.array([
    [0.051267,0.69956,1],
    [-0.092742,0.68494,1],
    [-0.21371,0.69225,1],
    [-0.375,0.50219,1],
    [-0.51325,0.46564,1],
    [-0.52477,0.2098,1],
    [-0.39804,0.034357,1],
    [-0.30588,-0.19225,1],
    [0.016705,-0.40424,1],
    [0.13191,-0.51389,1],
    [0.38537,-0.56506,1],
    [0.52938,-0.5212,1],
    [0.63882,-0.24342,1],
    [0.73675,-0.18494,1],
    [0.54666,0.48757,1],
    [0.322,0.5826,1],
    [0.16647,0.53874,1],
    [-0.046659,0.81652,1],
    [-0.17339,0.69956,1],
    [-0.47869,0.63377,1],
    [-0.60541,0.59722,1],
    [-0.62846,0.33406,1],
    [-0.59389,0.005117,1],
    [-0.42108,-0.27266,1],
    [-0.11578,-0.39693,1],
    [0.20104,-0.60161,1],
    [0.46601,-0.53582,1],
    [0.67339,-0.53582,1],
    [-0.13882,0.54605,1],
    [-0.29435,0.77997,1],
    [-0.26555,0.96272,1],
    [-0.16187,0.8019,1],
    [-0.17339,0.64839,1],
    [-0.28283,0.47295,1],
    [-0.36348,0.31213,1],
    [-0.30012,0.027047,1],
    [-0.23675,-0.21418,1],
    [-0.06394,-0.18494,1],
    [0.062788,-0.16301,1],
    [0.22984,-0.41155,1],
    [0.2932,-0.2288,1],
    [0.48329,-0.18494,1],
    [0.64459,-0.14108,1],
    [0.46025,0.012427,1],
    [0.6273,0.15863,1],
    [0.57546,0.26827,1],
    [0.72523,0.44371,1],
    [0.22408,0.52412,1],
    [0.44297,0.67032,1],
    [0.322,0.69225,1],
    [0.13767,0.57529,1],
    [-0.0063364,0.39985,1],
    [-0.092742,0.55336,1],
    [-0.20795,0.35599,1],
    [-0.20795,0.17325,1],
    [-0.43836,0.21711,1],
    [-0.21947,-0.016813,1],
    [-0.13882,-0.27266,1],
    [0.18376,0.93348,0],
    [0.22408,0.77997,0],
    [0.29896,0.61915,0],
    [0.50634,0.75804,0],
    [0.61578,0.7288,0],
    [0.60426,0.59722,0],
    [0.76555,0.50219,0],
    [0.92684,0.3633,0],
    [0.82316,0.27558,0],
    [0.96141,0.085526,0],
    [0.93836,0.012427,0],
    [0.86348,-0.082602,0],
    [0.89804,-0.20687,0],
    [0.85196,-0.36769,0],
    [0.82892,-0.5212,0],
    [0.79435,-0.55775,0],
    [0.59274,-0.7405,0],
    [0.51786,-0.5943,0],
    [0.46601,-0.41886,0],
    [0.35081,-0.57968,0],
    [0.28744,-0.76974,0],
    [0.085829,-0.75512,0],
    [0.14919,-0.57968,0],
    [-0.13306,-0.4481,0],
    [-0.40956,-0.41155,0],
    [-0.39228,-0.25804,0],
    [-0.74366,-0.25804,0],
    [-0.69758,0.041667,0],
    [-0.75518,0.2902,0],
    [-0.69758,0.68494,0],
    [-0.4038,0.70687,0],
    [-0.38076,0.91886,0],
    [-0.50749,0.90424,0],
    [-0.54781,0.70687,0],
    [0.10311,0.77997,0],
    [0.057028,0.91886,0],
    [-0.10426,0.99196,0],
    [-0.081221,1.1089,0],
    [0.28744,1.087,0],
    [0.39689,0.82383,0],
    [0.63882,0.88962,0],
    [0.82316,0.66301,0],
    [0.67339,0.64108,0],
    [1.0709,0.10015,0],
    [-0.046659,-0.57968,0],
    [-0.23675,-0.63816,0],
    [-0.15035,-0.36769,0],
    [-0.49021,-0.3019,0],
    [-0.46717,-0.13377,0],
    [-0.28859,-0.060673,0],
    [-0.61118,-0.067982,0],
    [-0.66302,-0.21418,0],
    [-0.59965,-0.41886,0],
    [-0.72638,-0.082602,0],
    [-0.83007,0.31213,0],
    [-0.72062,0.53874,0],
    [-0.59389,0.49488,0],
    [-0.48445,0.99927,0],
    [-0.0063364,0.99927,0],
    [0.63265,-0.030612,0]])
#==============================================================================
# helpers to plot decision boundary
# http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#sphx-glr-auto-examples-svm-plot-iris-py
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    return np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

def plot_contours(predict, xx, yy, **params):
    Z = predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax = params.pop('ax', plt)  # pop so 'ax' is not forwarded to contourf
    return ax.contourf(xx, yy, Z, **params)
#==============================================================================
# Implementation of Logistic Regression
def sigmoid(x):
    return 1/(1+np.exp(-x))

def h(X, theta):
    """the hypothesis function
    :X: a matrix, each row is a sample with columns of features
    :theta: an array representing the parameters of the regression: array([theta_0, theta_1, ...])
    :returns: array([h_0, h_1, ...]), the hypothesis for each sample
    """
    product = np.matmul(X, theta)
    return sigmoid(product)

def gen_predict(theta):
    """return a predict function that thresholds h(X, theta) at 0.5"""
    def predict(X):
        return np.where(h(X, theta) >= 0.5, 1, 0)
    return predict
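# Note: sigmoid(z) >= 0.5 exactly when z >= 0, so the boundary of gen_predict
# is the set of points where np.matmul(X, theta) == 0 -- a line/plane in
# feature space, which becomes a curve in the original space once polynomial
# features are added further below.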
def gen_cost(data, labels, h, **params):
    """
    :data: a matrix array([[col, col], ...])
    :labels: an array that contains the label of each sample
    :returns: a cost function that takes theta and returns (current_cost, array([gradient]))
    """
    m = len(data)   # number of samples
    x_trans = data.transpose()
    y = labels
    def cost(theta):
        """:returns: (current_cost, gradient)"""
        hx = h(data, theta)
        J = sum(- y*np.log(hx) - (1-y)*np.log(1-hx)) / m
        grad = np.matmul(x_trans, (hx-y)) / m
        return (J, grad)
    return cost
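# The (unregularized) cross-entropy cost computed above is
#   J(theta) = -1/m * sum_i [ y_i*log(h(x_i)) + (1-y_i)*log(1-h(x_i)) ]
# and its gradient with respect to theta_j is
#   dJ/dtheta_j = 1/m * sum_i (h(x_i) - y_i) * x_ij
# which is exactly the matrix expression np.matmul(x_trans, hx - y) / m.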
def gen_cost_regularized(data, labels, h, l=1, **params):
    """same as gen_cost, but with an L2 penalty of strength l added to the cost"""
    m = len(data)   # number of samples
    x_trans = data.transpose()
    y = labels
    def cost(theta):
        """:returns: (current_cost, gradient)"""
        hx = h(data, theta)
        J = sum(- y*np.log(hx) - (1-y)*np.log(1-hx)) / m + l/2/m * sum(np.square(theta))
        grad = np.matmul(x_trans, (hx-y)) / m + l/m*theta
        return (J, grad)
    return cost
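# Regularized cost:
#   J(theta) = cross-entropy + lambda/(2m) * sum_j theta_j^2
#   dJ/dtheta_j = unregularized gradient + lambda/m * theta_j
# Every component of theta is penalized here; there is no separate,
# unpenalized intercept because the feature mapping used below has no
# constant column.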
def gradDescent(X, Y, gen_cost=gen_cost, iterations=1000, a=0.1, tol=1e-3, record_cost=False):
    """train theta with batch gradient descent
    :returns: a vector representing theta (plus the recorded costs if record_cost is True)
    """
    cost = gen_cost(X, Y, h)
    rows, cols = X.shape
    theta = np.zeros(cols)
    # now train
    costs = []
    for i in range(iterations):
        cur_cost, grad = cost(theta)
        print(f"current_cost: {cur_cost}")
        if record_cost:
            costs.append([i, cur_cost])
        if cur_cost < tol:
            print(f'ends with cost: {cur_cost}')
            break
        theta = theta - a * grad
    if record_cost:
        return (theta, np.array(costs))
    return theta
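# gradDescent performs the plain update theta <- theta - a * grad with a fixed
# learning rate a. The early exit only fires when the cost drops below tol,
# which non-separable data may never reach, so the loop usually runs for the
# full number of iterations.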
# train logistic regression for iris
iris = load_iris()
X = iris.data[iris.target != 2]   # drop the 3rd category
Y = iris.target[iris.target != 2]
# split into a training set and a test set
ids = np.random.permutation(X.shape[0])
train_ids, test_ids = ids[:70], ids[70:]
X_train, Y_train = X[train_ids, :], Y[train_ids]
X_test, Y_test = X[test_ids, :], Y[test_ids]
# train
rows, cols = X_train.shape
# bias = np.ones(rows)
# X_train = np.insert(X_train, 0, bias, axis=1)
theta = gradDescent(X_train, Y_train, iterations=100)
predict = gen_predict(theta)
predicted = predict(X_test)
print(f"accuracy: {np.count_nonzero(predicted==Y_test)/len(Y_test)*100}%")
# plot 2D
dims = (1, 2)
X0, X1 = X[:, dims[0]], X[:, dims[1]]
xx, yy = make_meshgrid(X0, X1)
# plot the data, keeping only the theta components of the two plotted dimensions
predict = gen_predict(theta[list(dims)])
plot_contours(predict, xx, yy, cmap=plt.cm.Blues, alpha=0.8)
plt.scatter(X0, X1, c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
#==============================================================================
# plot decision boundary of Logistic Regression in 3D
# train for only 2 features
X = iris.data[iris.target != 2][:, [1, 2]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
theta = gradDescent(X, Y, iterations=100)
predict = lambda X: h(X, theta)   # raw sigmoid output, not thresholded
xx, yy = make_meshgrid(X[:, 0], X[:, 1], h=0.01)
Z = predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_0[:, 0], X_0[:, 1], c='r')           # class 0 at z=0
ax.scatter(X_1[:, 0], X_1[:, 1], zs=1, c='g')     # class 1 at z=1
ax.plot_wireframe(xx, yy, Z, alpha=0.8)           # the sigmoid surface h(x)
ax.contourf(xx, yy, Z, 1, zdir='z', offset=0)     # its projection onto z=0
plt.show()
#==============================================================================
# circular data: logistic regression with non-linear features
X = data_circular[:,[0,1]]
Y = data_circular[:,2]
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
# add non-linear features
def add_non_linear_features(X0, X1):
    ret = []
    for rank in range(1, 7):
        for rank_x0 in range(rank+1):
            ret.append((X0**rank_x0) * (X1**(rank-rank_x0)))
    return np.column_stack(ret)
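# add_non_linear_features maps (x0, x1) to all monomials x0^i * x1^(r-i) for
# r = 1..6, i.e. 2+3+4+5+6+7 = 27 polynomial features (no constant term), so
# the learned decision boundary can be a curve in the original 2D space.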
X_mapping = add_non_linear_features(X[:,0], X[:,1])
# use fmin_bfgs to train, gradDescent is too slow for this
cost = gen_cost(X_mapping, Y, h)
cost_for_bfgs = lambda theta: cost(theta)[0]        # BFGS wants the cost ...
gradient_for_bfgs = lambda theta: cost(theta)[1]    # ... and the gradient separately
initial_theta = np.zeros(X_mapping.shape[1])
theta = fmin_bfgs(cost_for_bfgs, initial_theta, fprime=gradient_for_bfgs, gtol=1e-6)
# theta = gradDescent(X_mapping, Y, iterations=1000000, a=10)
# plot decision boundary in 2D
predict = gen_predict(theta)
xx, yy = make_meshgrid(X[:,0], X[:,1], h=.01)
Z = predict(add_non_linear_features(xx.ravel(), yy.ravel()))
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
plt.show()
#==============================================================================
# show that the squared-error cost does not work well for logistic regression
samples = [(-5, 1), (-20, 0), (-2, 1)]
def cost(theta):
    def sigmoid(theta, x):
        return 1/(1 + math.e**(- theta*x))
    diffs = [(sigmoid(theta, x) - y) for x, y in samples]
    return sum(diff * diff for diff in diffs)/len(samples)/2
X = np.arange(-1, 1, 0.01)
Y = np.array([cost(theta) for theta in X])
plt.plot(X, Y)
plt.show()
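# For contrast (an addition to the original experiment, same samples and same
# range of theta): the cross-entropy/log loss plotted below is convex in
# theta with no flat plateaus, which is why logistic regression uses it
# instead of the squared error above.
def log_cost(theta):
    def sigmoid(theta, x):
        return 1/(1 + math.e**(- theta*x))
    return sum(-y*math.log(sigmoid(theta, x)) - (1-y)*math.log(1-sigmoid(theta, x))
               for x, y in samples)/len(samples)
Y_log = np.array([log_cost(theta) for theta in X])
plt.plot(X, Y_log)
plt.show()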
#==============================================================================
# Visually grasp the impact of the cost function
X = iris.data[iris.target != 2][:, [1, 2]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
cost = gen_cost(X, Y, h)
theta_good = gradDescent(X, Y, iterations=100)   # more iterations -> lower cost
predict_good = lambda X: h(X, theta_good)
theta_bad = gradDescent(X, Y, iterations=25)     # fewer iterations -> higher cost
predict_bad = lambda X: h(X, theta_bad)
xx, yy = make_meshgrid(X[:, 0], X[:, 1], h=0.01)
Z_good = predict_good(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
Z_bad = predict_bad(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
# plot 3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_0[:, 0], X_0[:, 1], c='r')
ax.scatter(X_1[:, 0], X_1[:, 1], zs=1, c='g')
ax.plot_wireframe(xx, yy, Z_good, alpha=0.8)
ax.plot_wireframe(xx, yy, Z_bad, alpha=0.8, color='green')
plt.show()
# plot 2D
predict_good = gen_predict(theta_good)
predict_bad = gen_predict(theta_bad)
fig = plt.figure()
# ax1 = fig.add_subplot(121)
CS = plot_contours(predict_good, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'cost = 0.08'}, inline=True, fontsize=10)
CS = plot_contours(predict_bad, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'cost = 0.25'}, inline=True, fontsize=10)
plt.scatter(X_0[:, 0], X_0[:, 1], c='r', alpha=0.8)
plt.scatter(X_1[:, 0], X_1[:, 1], c='g', alpha=0.8)
plt.savefig('cost-function-2D.svg')
plt.show()
#==============================================================================
# Regularization experiment
X = data_circular[:,[0,1]]
Y = data_circular[:,2]
# add non-linear features (same mapping as above)
def add_non_linear_features(X0, X1):
    ret = []
    for rank in range(1, 7):
        for rank_x0 in range(rank+1):
            ret.append((X0**rank_x0) * (X1**(rank-rank_x0)))
    return np.column_stack(ret)
X_mapping = add_non_linear_features(X[:,0], X[:,1])
# use fmin_bfgs to train, gradDescent is too slow for this
def train(gen_cost, **params):
    initial_theta = np.zeros(X_mapping.shape[1])
    cost = gen_cost(X_mapping, Y, h, **params)
    cost_for_bfgs = lambda theta: cost(theta)[0]
    gradient_for_bfgs = lambda theta: cost(theta)[1]
    return fmin_bfgs(cost_for_bfgs, initial_theta, fprime=gradient_for_bfgs)
    # return gradDescent(X_mapping, Y, gen_cost_regularized, **params)
theta_not_regularized = train(gen_cost)
theta_regularized = train(gen_cost_regularized, l=0.005)
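# Optional sanity check (an addition to the original experiment): the L2
# penalty should shrink the magnitude of the learned parameters.
print(f"||theta|| without regularization: {np.linalg.norm(theta_not_regularized):.3f}")
print(f"||theta|| with regularization:    {np.linalg.norm(theta_regularized):.3f}")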
# plot decision boundaries in 2D for several values of lambda
def plot_circular(theta, plt=plt):
    """plt may be the pyplot module or an Axes object"""
    predict = gen_predict(theta)
    xx, yy = make_meshgrid(X[:,0], X[:,1], h=.01)
    Z = predict(add_non_linear_features(xx.ravel(), yy.ravel()))
    Z = Z.reshape(xx.shape)
    plt.scatter(X[:,0], X[:,1], c=np.where(Y==0, 'red', 'blue'), alpha=0.8)
    plt.contourf(xx, yy, Z, alpha=0.6)

fig = plt.figure()
ax1 = fig.add_subplot(221)
l0 = train(gen_cost_regularized, l=0)
plot_circular(l0, plt=ax1)
ax1.set_title(r'$\lambda = 0$')
ax2 = fig.add_subplot(222)
l1 = train(gen_cost_regularized, l=0.001)
plot_circular(l1, plt=ax2)
ax2.set_title(r'$\lambda = 0.001$')
ax3 = fig.add_subplot(223)
l2 = train(gen_cost_regularized, l=0.01)
plot_circular(l2, plt=ax3)
ax3.set_title(r'$\lambda = 0.01$')
ax4 = fig.add_subplot(224)
l3 = train(gen_cost_regularized, l=0.1)
plot_circular(l3, plt=ax4)
ax4.set_title(r'$\lambda = 0.1$')
plt.show()
#==============================================================================
# Scaling
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
X_mapping_scaled = scaler.fit_transform(X_mapping)
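# StandardScaler standardizes each of the 27 polynomial features to zero mean
# and unit variance. With comparable feature scales, gradient descent with a
# single learning rate makes balanced progress in every direction, which is
# what the cost-vs-iterations comparison below is meant to show.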
theta_normal, costs_normal = gradDescent(X_mapping, Y, gen_cost, record_cost=True)
theta_scaled, costs_scaled = gradDescent(X_mapping_scaled, Y, gen_cost, record_cost=True)
not_scaled, = plt.plot(costs_normal[:,0], costs_normal[:,1], label='Not Scaled')
scaled, = plt.plot(costs_scaled[:,0], costs_scaled[:,1], label='Scaled')
plt.legend([not_scaled, scaled], ['Not scaled', 'Scaled'])
plt.xlabel('Number of Iterations')
plt.ylabel(r'$J(\theta)$')
plt.savefig('learning-rate-scaled.svg')
plt.show()
#==============================================================================
# With or without Bias
X = iris.data[iris.target != 2][:, [2, 3]]
Y = iris.target[iris.target != 2]
X_0, X_1 = X[Y==0], X[Y==1]
def add_bias(X):
    bias = np.ones(X.shape[0])
    return np.insert(X, 0, bias, axis=1)
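# Without a bias column the decision boundary theta . x = 0 must pass through
# the origin; prepending a column of ones gives the model an intercept
# (theta_0), so the boundary can sit anywhere. The dashed lines drawn at the
# end mark the axes through the origin to make the difference visible.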
theta_no_bias = gradDescent(X, Y, iterations=1000)
predict_no_bias = lambda X: np.where(h(X, theta_no_bias)>=0.5, 1, 0)
X_with_bias = add_bias(X)
theta_with_bias = gradDescent(X_with_bias, Y, iterations=1000)
predict_with_bias = lambda X: np.where(h(add_bias(X), theta_with_bias)>=0.5, 1, 0)
x_min, x_max = -1, X[:,0].max() + 1   # extend to x = -1 so the origin is visible
y_min, y_max = X[:,1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
fig = plt.figure()
CS = plot_contours(predict_no_bias, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'No Bias'}, inline=True, fontsize=16)
CS = plot_contours(predict_with_bias, xx, yy, alpha=0.5)
plt.clabel(CS, CS.levels[:2], fmt={CS.levels[1]: 'Bias'}, inline=True, fontsize=16)
plt.scatter(X_0[:, 0], X_0[:, 1], c='r', alpha=0.8)
plt.scatter(X_1[:, 0], X_1[:, 1], c='g', alpha=0.8)
plt.plot(np.linspace(-1, 0), np.zeros(len(np.linspace(-1, 0))), 'c--')        # x axis up to the origin
plt.plot(np.zeros(len(np.linspace(y_min, 0))), np.linspace(y_min, 0), 'c--')  # y axis up to the origin
plt.show()