Solving L1-regularized problems with l-bfgs-b
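Both solvers below rely on the same standard reformulation: write the weight vector as w = w_pos - w_neg with w_pos, w_neg >= 0, so that ||w||_1 = w_pos.sum() + w_neg.sum(). The non-smooth L1 penalty then becomes a linear term over non-negative variables, and the whole problem turns into a smooth, box-constrained one that L-BFGS-B can handle. A minimal numeric illustration of this split (not part of the gist itself):

import numpy as np

w = np.array([1.5, -2.0, 0.0, 3.0])
w_pos, w_neg = np.maximum(w, 0), np.maximum(-w, 0)   # both parts are non-negative
assert np.allclose(w, w_pos - w_neg)                 # w = w_pos - w_neg
assert np.isclose(np.abs(w).sum(), w_pos.sum() + w_neg.sum())  # the L1 norm is linear in the split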
"""l-bfgs-b L1-Logistic Regression solver""" | |
# Author: Vlad Niculae <[email protected]> | |
# Suggested by Mathieu Blondel | |
from __future__ import division, print_function | |
import numpy as np | |
from scipy.optimize import fmin_l_bfgs_b | |
from sklearn.base import BaseEstimator, ClassifierMixin | |
from sklearn.utils.extmath import safe_sparse_dot, log_logistic | |
from sklearn.utils.fixes import expit | |
def _l1_logistic_loss_grad(w_extended, X, y, alpha): | |
_, n_features = X.shape | |
w = w_extended[:n_features] - w_extended[n_features:] | |
yz = y * safe_sparse_dot(X, w) | |
# Logistic loss is the negative of the log of the logistic function. | |
out = -np.sum(log_logistic(yz)) | |
# out += .5 * alpha * np.dot(w, w) # L2 | |
out += alpha * w_extended.sum() # L1, w_extended is non-negative | |
z = expit(yz) | |
z0 = (z - 1) * y | |
grad = safe_sparse_dot(X.T, z0) | |
grad = np.concatenate([grad, -grad]) | |
# grad += alpha * w # L2 | |
grad += alpha # L1 | |
return out, grad | |
class LbfgsL1Logistic(BaseEstimator, ClassifierMixin): | |
def __init__(self, tol=1e-3, alpha=1.0): | |
"""Logistic Regression Lasso solved by L-BFGS-B | |
Solves the same objective as sklearn.linear_model.LogisticRegression | |
Parameters | |
---------- | |
alpha: float, default: 1.0 | |
The amount of regularization to use. | |
tol: float, default: 1e-3 | |
Convergence tolerance for L-BFGS-B. | |
""" | |
self.tol = tol | |
self.alpha = alpha | |
def fit(self, X, y): | |
n_samples, n_features = X.shape | |
coef0 = np.zeros(2 * n_features) | |
w, f, d = fmin_l_bfgs_b(_l1_logistic_loss_grad, x0=coef0, fprime=None, | |
pgtol=self.tol, | |
bounds=[(0, None)] * n_features * 2, | |
args=(X, y, self.alpha)) | |
self.coef_ = w[:n_features] - w[n_features:] | |
return self | |
def predict(self, X): | |
return np.sign(safe_sparse_dot(X, self.coef_)) | |
if __name__ == '__main__':
    from scipy.spatial.distance import jaccard
    from sklearn.linear_model import LogisticRegression
    from time import time

    # Generate data with known sparsity pattern
    n_samples, n_features, n_relevant = 100, 80, 20
    X = np.random.randn(n_samples, n_features)
    X /= np.linalg.norm(X, axis=1)[:, np.newaxis]
    true_coef = np.zeros(n_features)
    nonzero_idx = np.random.randint(n_features, size=n_relevant)
    true_coef[nonzero_idx] = np.random.randn(n_relevant)
    y = np.dot(X, true_coef) + np.random.randn(n_samples) * 0.01

    # classification, note: y must be {-1, +1}
    y = np.sign(y)

    C = 1.0

    # Run this solver
    t0 = time()
    lasso_1 = LbfgsL1Logistic(alpha=1. / C, tol=1e-8).fit(X, y)
    t0 = time() - t0
    print("l-bfgs-b: time = {:.4f}s acc = {:.8f} ||w - w_true|| = {:.6f} "
          "Jacc. sparsity = {:.2f}".format(t0, lasso_1.score(X, y),
              np.linalg.norm(lasso_1.coef_ - true_coef),
              jaccard(true_coef > 0, lasso_1.coef_ > 0)))

    # Run liblinear (the solver must be requested explicitly for an L1 penalty)
    t0 = time()
    lasso_2 = LogisticRegression(penalty='l1', C=C, tol=1e-8,
                                 solver='liblinear',
                                 fit_intercept=False).fit(X, y)
    t0 = time() - t0
    print("liblinear: time = {:.4f}s acc = {:.8f} ||w - w_true|| = {:.6f} "
          "Jacc. sparsity = {:.2f}".format(t0, lasso_2.score(X, y),
              np.linalg.norm(lasso_2.coef_ - true_coef),
              jaccard(true_coef > 0, lasso_2.coef_ > 0)))
"""l-bfgs-b Lasso solver""" | |
# Author: Vlad Niculae <[email protected]> | |
# Suggested by Mathieu Blondel | |
from __future__ import division, print_function | |
import numpy as np | |
from scipy.optimize import fmin_l_bfgs_b | |
from sklearn.base import BaseEstimator, RegressorMixin | |
from sklearn.utils.extmath import safe_sparse_dot | |
class LbfgsLasso(BaseEstimator, RegressorMixin): | |
def __init__(self, tol=1e-3, alpha=1.0): | |
"""Lasso solved by L-BFGS-B | |
Solves the same objective as sklearn.linear_model.Lasso | |
Parameters | |
---------- | |
alpha: float, default: 1.0 | |
The amount of regularization to use. | |
tol: float, default: 1e-3 | |
Convergence tolerance for L-BFGS-B. | |
""" | |
self.tol = tol | |
self.alpha = alpha | |
def fit(self, X, y): | |
n_samples, n_features = X.shape | |
def f(w_extended, *args): | |
# w_extended is [w_pos -w_neg] | |
# assert (w_extended >= 0).all() | |
# abs(w).sum() = w_extended.sum() | |
w = w_extended[:n_features] - w_extended[n_features:] | |
return (np.sum((safe_sparse_dot(X, w) - y) ** 2) + | |
(2 * self.alpha * n_samples) * w_extended.sum()) | |
def fprime(w_extended, *args): | |
# assert (w_extended >= 0).all() | |
# dloss_extended = [X' -X']' * ([X -X]' * [w_pos -w_neg] - y) | |
# = [X' -X']' * (X * w - y) | |
# = [dloss -dloss] | |
w = w_extended[:n_features] - w_extended[n_features:] | |
dloss = 2 * safe_sparse_dot(X.T, safe_sparse_dot(X, w) - y) | |
grad_extended = np.concatenate([dloss, -dloss]) | |
grad_extended += 2 * self.alpha * n_samples # to match skl lasso | |
return grad_extended | |
coef0 = np.zeros(2 * n_features) | |
w, f, d = fmin_l_bfgs_b(f, x0=coef0, fprime=fprime, pgtol=self.tol, | |
bounds=[(0, None)] * n_features * 2) | |
self.coef_ = w[:n_features] - w[n_features:] | |
return self | |
def predict(self, X): | |
return safe_sparse_dot(X, self.coef_) | |
if __name__ == '__main__':
    from scipy.spatial.distance import jaccard
    from sklearn.linear_model import Lasso
    from time import time

    # Generate data with known sparsity pattern
    X = np.random.randn(100, 80)
    X /= np.linalg.norm(X, axis=1)[:, np.newaxis]
    true_coef = np.zeros(80)
    nonzero_idx = np.random.randint(80, size=20)
    true_coef[nonzero_idx] = np.random.randn(20)
    y = np.dot(X, true_coef) + np.random.randn(100) * 0.01

    # Run this solver
    t0 = time()
    lasso_1 = LbfgsLasso(alpha=0.001, tol=1e-8).fit(X, y)
    t0 = time() - t0
    print("LBFGSB: time = {:.4f}s R2 = {:.8f} ||w - w_true|| = {:.6f} "
          "Jacc. sparsity = {:.2f}".format(t0, lasso_1.score(X, y),
              np.linalg.norm(lasso_1.coef_ - true_coef),
              jaccard(true_coef > 0, lasso_1.coef_ > 0)))

    # Run coordinate descent Lasso from scikit-learn
    t0 = time()
    lasso_2 = Lasso(alpha=0.001, fit_intercept=False, tol=1e-8).fit(X, y)
    t0 = time() - t0
    print("CD: time = {:.4f}s R2 = {:.8f} ||w - w_true|| = {:.6f} "
          "Jacc. sparsity = {:.2f}".format(t0, lasso_2.score(X, y),
              np.linalg.norm(lasso_2.coef_ - true_coef),
              jaccard(true_coef > 0, lasso_2.coef_ > 0)))
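For reference, the objective minimized in fit above is 2 * n_samples times the objective of sklearn.linear_model.Lasso, which is (1 / (2 * n_samples)) * ||y - Xw||^2 + alpha * ||w||_1, so both problems share the same minimizer. A small check of that scaling, using two helper functions written here only for illustration:

import numpy as np

def sklearn_lasso_objective(w, X, y, alpha):
    n_samples = X.shape[0]
    return (np.sum((np.dot(X, w) - y) ** 2) / (2 * n_samples)
            + alpha * np.abs(w).sum())

def gist_objective(w, X, y, alpha):
    # same expression as f() inside LbfgsLasso.fit, written for a plain w
    n_samples = X.shape[0]
    return (np.sum((np.dot(X, w) - y) ** 2)
            + 2 * alpha * n_samples * np.abs(w).sum())

rng = np.random.RandomState(0)
X, y, w = rng.randn(30, 10), rng.randn(30), rng.randn(10)
print(np.isclose(gist_objective(w, X, y, 0.1),
                 2 * 30 * sklearn_lasso_objective(w, X, y, 0.1)))  # True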
Oops, sorry, I realize I missed some very old questions on here!
@MLnick @jpvcaetano: I think you're correct, and you could also include an L2 penalty, leading to the elastic net. But don't: there are much better implementations out there; this is a proof of concept.
@kyle-pena-nlp I think you missed that in this code y is in {-1, +1} rather than {0, 1} (see the y = np.sign(y) line in the demo). So we have:
P(Y = +1 | x) = sigmoid(h)
P(Y = -1 | x) = sigmoid(-h)
so log P(Y = y | x) = log sigmoid(y * h).
I agree that a bias is useful in practice, but you also wouldn't want to use this in practice, right? (And if you do, why?)
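A tiny numeric check of that identity (a sketch added for illustration, using scipy's sigmoid):

import numpy as np
from scipy.special import expit  # the sigmoid function

h = np.linspace(-5, 5, 11)
for y in (-1, 1):
    y01 = (y + 1) // 2  # map {-1, +1} to {0, 1}
    p = expit(h)
    lhs = np.log(expit(y * h))                         # log P(Y = y)
    rhs = y01 * np.log(p) + (1 - y01) * np.log(1 - p)  # the usual {0, 1} cross-entropy term
    assert np.allclose(lhs, rhs)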
vene, you are correct. I missed that detail.
All the best!
I believe the loss function in LbfgsL1Logistic is incorrect (I didn't look at the other class, and I can't speak to the gradient part of the calculation). The loss for logistic regression should include terms for both the positive and the negative class:
h = safe_sparse_dot(X, w)
(y * np.log(h + eps)) + ((1 - y) * np.log(1 - h + eps))
... or something similar.
Also, the user may or may not wish to include a bias term (perhaps this could be achieved by mean-centering y prior to fitting, or by including a bias in the calculation of h, per this comment).
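One simple way to get a bias with this kind of solver (a sketch of the commenter's suggestion, not something the gist does) is to append a constant column of ones to X. Note that the L1 penalty would then shrink the bias as well, which is usually not desirable, so that coordinate would have to be treated separately (for example, left unpenalized in the loss):

import numpy as np

def add_bias_column(X):
    # Append a column of ones so the last coefficient acts as an intercept.
    return np.hstack([X, np.ones((X.shape[0], 1))])

# Usage sketch with the extended design matrix:
# clf = LbfgsL1Logistic(alpha=1.0).fit(add_bias_column(X), y)
# bias = clf.coef_[-1]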