| """l-bfgs-b L1-Logistic Regression solver""" | |
| # Author: Vlad Niculae <[email protected]> | |
| # Suggested by Mathieu Blondel | |
| from __future__ import division, print_function | |
| import numpy as np | |
| from scipy.optimize import fmin_l_bfgs_b | |
| from sklearn.base import BaseEstimator, ClassifierMixin | |
| from sklearn.utils.extmath import safe_sparse_dot, log_logistic | |
| from sklearn.utils.fixes import expit | |
| def _l1_logistic_loss_grad(w_extended, X, y, alpha): | |
| _, n_features = X.shape | |
| w = w_extended[:n_features] - w_extended[n_features:] | |
| yz = y * safe_sparse_dot(X, w) | |
| # Logistic loss is the negative of the log of the logistic function. | |
| out = -np.sum(log_logistic(yz)) | |
| # out += .5 * alpha * np.dot(w, w) # L2 | |
| out += alpha * w_extended.sum() # L1, w_extended is non-negative | |
| z = expit(yz) | |
| z0 = (z - 1) * y | |
| grad = safe_sparse_dot(X.T, z0) | |
| grad = np.concatenate([grad, -grad]) | |
| # grad += alpha * w # L2 | |
| grad += alpha # L1 | |
| return out, grad | |
| class LbfgsL1Logistic(BaseEstimator, ClassifierMixin): | |
| def __init__(self, tol=1e-3, alpha=1.0): | |
| """Logistic Regression Lasso solved by L-BFGS-B | |
| Solves the same objective as sklearn.linear_model.LogisticRegression | |
| Parameters | |
| ---------- | |
| alpha: float, default: 1.0 | |
| The amount of regularization to use. | |
| tol: float, default: 1e-3 | |
| Convergence tolerance for L-BFGS-B. | |
| """ | |
| self.tol = tol | |
| self.alpha = alpha | |
| def fit(self, X, y): | |
| n_samples, n_features = X.shape | |
| coef0 = np.zeros(2 * n_features) | |
| w, f, d = fmin_l_bfgs_b(_l1_logistic_loss_grad, x0=coef0, fprime=None, | |
| pgtol=self.tol, | |
| bounds=[(0, None)] * n_features * 2, | |
| args=(X, y, self.alpha)) | |
| self.coef_ = w[:n_features] - w[n_features:] | |
| return self | |
| def predict(self, X): | |
| return np.sign(safe_sparse_dot(X, self.coef_)) | |
| if __name__ == '__main__': | |
| from scipy.spatial.distance import jaccard | |
| from sklearn.linear_model import LogisticRegression | |
| from time import time | |
| # Generate data with known sparsity pattern | |
| n_samples, n_features, n_relevant = 100, 80, 20 | |
| X = np.random.randn(n_samples, n_features) | |
| X /= np.linalg.norm(X, axis=1)[:, np.newaxis] | |
| true_coef = np.zeros(n_features) | |
| nonzero_idx = np.random.randint(n_features, size=n_relevant) | |
| true_coef[nonzero_idx] = np.random.randn(n_relevant) | |
| y = np.dot(X, true_coef) + np.random.randn(n_samples) * 0.01 | |
| # classification, note: y must be {-1, +1} | |
| y = np.sign(y) | |
| C = 1.0 | |
| # Run this solver | |
| t0 = time() | |
| lasso_1 = LbfgsL1Logistic(alpha=1. / C, tol=1e-8).fit(X, y) | |
| t0 = time() - t0 | |
| print("l-bfgs-b: time = {:.4f}s acc = {:.8f} ||w - w_true|| = {:.6f} " | |
| "Jacc. sparsity = {:.2f}".format(t0, lasso_1.score(X, y), | |
| np.linalg.norm(lasso_1.coef_ - true_coef), | |
| jaccard(true_coef > 0, lasso_1.coef_ > 0))) | |
| # run liblinear | |
| t0 = time() | |
| lasso_2 = LogisticRegression(penalty='l1', C=C, tol=1e-8, | |
| fit_intercept=False).fit( X, y) | |
| t0 = time() - t0 | |
| print("liblinear: time = {:.4f}s acc = {:.8f} ||w - w_true|| = {:.6f} " | |
| "Jacc. sparsity = {:.2f}".format(t0, lasso_2.score(X, y), | |
| np.linalg.norm(lasso_2.coef_ - true_coef), | |
| jaccard(true_coef > 0, lasso_2.coef_ > 0))) |
| """l-bfgs-b Lasso solver""" | |
| # Author: Vlad Niculae <[email protected]> | |
| # Suggested by Mathieu Blondel | |
| from __future__ import division, print_function | |
| import numpy as np | |
| from scipy.optimize import fmin_l_bfgs_b | |
| from sklearn.base import BaseEstimator, RegressorMixin | |
| from sklearn.utils.extmath import safe_sparse_dot | |
| class LbfgsLasso(BaseEstimator, RegressorMixin): | |
| def __init__(self, tol=1e-3, alpha=1.0): | |
| """Lasso solved by L-BFGS-B | |
| Solves the same objective as sklearn.linear_model.Lasso | |
| Parameters | |
| ---------- | |
| alpha: float, default: 1.0 | |
| The amount of regularization to use. | |
| tol: float, default: 1e-3 | |
| Convergence tolerance for L-BFGS-B. | |
| """ | |
| self.tol = tol | |
| self.alpha = alpha | |
| def fit(self, X, y): | |
| n_samples, n_features = X.shape | |
| def f(w_extended, *args): | |
| # w_extended is [w_pos -w_neg] | |
| # assert (w_extended >= 0).all() | |
| # abs(w).sum() = w_extended.sum() | |
| w = w_extended[:n_features] - w_extended[n_features:] | |
| return (np.sum((safe_sparse_dot(X, w) - y) ** 2) + | |
| (2 * self.alpha * n_samples) * w_extended.sum()) | |
| def fprime(w_extended, *args): | |
| # assert (w_extended >= 0).all() | |
| # dloss_extended = [X' -X']' * ([X -X]' * [w_pos -w_neg] - y) | |
| # = [X' -X']' * (X * w - y) | |
| # = [dloss -dloss] | |
| w = w_extended[:n_features] - w_extended[n_features:] | |
| dloss = 2 * safe_sparse_dot(X.T, safe_sparse_dot(X, w) - y) | |
| grad_extended = np.concatenate([dloss, -dloss]) | |
| grad_extended += 2 * self.alpha * n_samples # to match skl lasso | |
| return grad_extended | |
| coef0 = np.zeros(2 * n_features) | |
| w, f, d = fmin_l_bfgs_b(f, x0=coef0, fprime=fprime, pgtol=self.tol, | |
| bounds=[(0, None)] * n_features * 2) | |
| self.coef_ = w[:n_features] - w[n_features:] | |
| return self | |
| def predict(self, X): | |
| return safe_sparse_dot(X, self.coef_) | |
| if __name__ == '__main__': | |
| from scipy.spatial.distance import jaccard | |
| from sklearn.linear_model import Lasso | |
| from time import time | |
| # Generate data with known sparsity pattern | |
| X = np.random.randn(100, 80) | |
| X /= np.linalg.norm(X, axis=1)[:, np.newaxis] | |
| true_coef = np.zeros(80) | |
| nonzero_idx = np.random.randint(80, size=20) | |
| true_coef[nonzero_idx] = np.random.randn(20) | |
| y = np.dot(X, true_coef) + np.random.randn(100) * 0.01 | |
| # Run this solver | |
| t0 = time() | |
| lasso_1 = LbfgsLasso(alpha=0.001, tol=1e-8).fit(X, y) | |
| t0 = time() - t0 | |
| print("LBFGSB: time = {:.4f}s R2 = {:.8f} ||w - w_true|| = {:.6f} " | |
| "Jacc. sparsity = {:.2f}".format(t0, lasso_1.score(X, y), | |
| np.linalg.norm(lasso_1.coef_ - true_coef), | |
| jaccard(true_coef > 0, lasso_1.coef_ > 0))) | |
| # Run coordinate descent Lasso from scikit-learn | |
| t0 = time() | |
| lasso_2 = Lasso(alpha=0.001, fit_intercept=False, tol=1e-8).fit(X, y) | |
| t0 = time() - t0 | |
| print("CD: time = {:.4f}s R2 = {:.8f} ||w - w_true|| = {:.6f} " | |
| "Jacc. sparsity = {:.2f}".format(t0, lasso_2.score(X, y), | |
| np.linalg.norm(lasso_2.coef_ - true_coef), | |
| jaccard(true_coef > 0, lasso_2.coef_ > 0))) |
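
Both files rely on the same variable-splitting trick: write w = w_pos - w_neg with both halves constrained non-negative, so the L1 norm becomes the linear term w_extended.sum() that appears in the objectives above. A minimal numeric illustration of that identity (my own sketch, not from the gist):

# Variable-splitting identity behind both solvers (illustrative sketch only).
import numpy as np

w = np.array([1.5, -0.3, 0.0, 2.0])
w_pos = np.maximum(w, 0)       # positive part
w_neg = np.maximum(-w, 0)      # negative part
w_extended = np.concatenate([w_pos, w_neg])

assert np.allclose(w_extended[:4] - w_extended[4:], w)   # recovers w
assert np.isclose(w_extended.sum(), np.abs(w).sum())     # ||w||_1 as a linear term
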
Hey @MLnick I'm also wondering that. Do you know the answer already?
I believe the loss function in LbfgsL1Logistic is incorrect. I didn't look at the other class. I can't speak for the gradient aspect of the calculation.
Loss for logistic regression includes terms for both the positive and negative class:
h = safe_sparse_dot(X,w)
(y * np.log(h + eps)) + ((1-y) * np.log(1-h + eps))
... or something similar.
Also, the user may or may not wish to include a bias term (perhaps this could be achieved by mean-centering y prior to fitting, or including bias in the calculation of h per this comment)
Oops sorry I realize I missed some very old questions on here!
@MLnick @jpvcaetano: I think you're correct: you could include an L2 penalty, leading to elastic net. But please don't: there are much better implementations; this is just a proof of concept.
@kyle-pena-nlp I think you missed that in this code y is in {-1, +1} rather than {0, 1} (see the y = np.sign(y) line in the demo). So we have:
P(Y = positive class) = sigmoid(h)
P(Y = negative class) = sigmoid(-h)
so log P(Y = y) = log_sigmoid(y * h). A quick numeric check of this equivalence is sketched right after this comment.
I agree that a bias is useful in practice, but also you wouldn't want to use this in practice, right? (if you do -- why?)
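
A minimal sketch of that equivalence, using scipy's expit as the sigmoid:

# Sketch: with y01 in {0, 1} and y = 2 * y01 - 1 in {-1, +1}, the two-term
# cross-entropy and the log-sigmoid(y * h) formulation give the same loss.
import numpy as np
from scipy.special import expit

rng = np.random.RandomState(0)
h = rng.randn(10)                # decision values, i.e. X @ w
y01 = rng.randint(2, size=10)    # labels in {0, 1}
y = 2 * y01 - 1                  # the same labels in {-1, +1}

p = expit(h)
loss_01 = -(y01 * np.log(p) + (1 - y01) * np.log(1 - p))  # two-term cross-entropy
loss_pm = -np.log(expit(y * h))                            # what the gist computes
assert np.allclose(loss_01, loss_pm)
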
vene, you are correct. I missed that detail.
All the best!
@vene this is cool - just wanted to double check: one could do elastic net by uncommenting the L2 lines (https://gist.github.com/vene/fab06038a00309569cdd#file-lbfgs_l1logistic-py-L24) and setting the relative elastic-net weights via the regularization param?
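
For reference, a hedged sketch of what that change might look like: re-enable the commented-out L2 terms in _l1_logistic_loss_grad and give the two penalties separate weights. The l1_ratio parameterization below is my own assumption (borrowed from scikit-learn's elastic-net convention, not something the gist defines), and the imports are the same as in the gist file above. Note that, because of the variable splitting, the L2 gradient has to be applied with opposite signs to the two halves:

# Hypothetical elastic-net variant of _l1_logistic_loss_grad (not part of the gist).
# alpha scales the total penalty; l1_ratio splits it between L1 and L2.
def _enet_logistic_loss_grad(w_extended, X, y, alpha, l1_ratio):
    _, n_features = X.shape
    w = w_extended[:n_features] - w_extended[n_features:]
    yz = y * safe_sparse_dot(X, w)

    out = -np.sum(log_logistic(yz))
    out += alpha * l1_ratio * w_extended.sum()          # L1 term
    out += .5 * alpha * (1 - l1_ratio) * np.dot(w, w)   # L2 term

    z0 = (expit(yz) - 1) * y
    grad = safe_sparse_dot(X.T, z0)
    grad = np.concatenate([grad, -grad])
    grad += alpha * l1_ratio                             # L1 gradient
    # L2 gradient w.r.t. [w_pos, w_neg] is [w, -w], not w
    grad += alpha * (1 - l1_ratio) * np.concatenate([w, -w])
    return out, grad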