@vene
Last active January 14, 2023 20:30
Solving L1-regularized problems with l-bfgs-b
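
The trick used in both snippets below is to turn the non-smooth L1 penalty into a smooth objective with simple bound constraints, which is exactly what L-BFGS-B supports. Splitting the weights as w = w_pos - w_neg with w_pos, w_neg >= 0 (the code's w_extended stacks the two halves), the penalized problem

    minimize  L(w) + alpha * ||w||_1

is equivalent to the bound-constrained problem

    minimize  L(w_pos - w_neg) + alpha * (w_pos.sum() + w_neg.sum())    subject to  w_pos >= 0, w_neg >= 0,

because at an optimum at most one of w_pos[j], w_neg[j] is nonzero for each j, so abs(w).sum() = w_extended.sum(). This is what the loss/gradient functions below compute, with bounds=[(0, None)] * 2 * n_features passed to fmin_l_bfgs_b.
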
"""l-bfgs-b L1-Logistic Regression solver"""
# Author: Vlad Niculae <[email protected]>
# Suggested by Mathieu Blondel
from __future__ import division, print_function
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.extmath import safe_sparse_dot, log_logistic
from sklearn.utils.fixes import expit
def _l1_logistic_loss_grad(w_extended, X, y, alpha):
_, n_features = X.shape
w = w_extended[:n_features] - w_extended[n_features:]
yz = y * safe_sparse_dot(X, w)
# Logistic loss is the negative of the log of the logistic function.
out = -np.sum(log_logistic(yz))
# out += .5 * alpha * np.dot(w, w) # L2
out += alpha * w_extended.sum() # L1, w_extended is non-negative
z = expit(yz)
z0 = (z - 1) * y
grad = safe_sparse_dot(X.T, z0)
grad = np.concatenate([grad, -grad])
# grad += alpha * w # L2
grad += alpha # L1
return out, grad
class LbfgsL1Logistic(BaseEstimator, ClassifierMixin):
def __init__(self, tol=1e-3, alpha=1.0):
"""Logistic Regression Lasso solved by L-BFGS-B
Solves the same objective as sklearn.linear_model.LogisticRegression
Parameters
----------
alpha: float, default: 1.0
The amount of regularization to use.
tol: float, default: 1e-3
Convergence tolerance for L-BFGS-B.
"""
self.tol = tol
self.alpha = alpha
def fit(self, X, y):
n_samples, n_features = X.shape
coef0 = np.zeros(2 * n_features)
w, f, d = fmin_l_bfgs_b(_l1_logistic_loss_grad, x0=coef0, fprime=None,
pgtol=self.tol,
bounds=[(0, None)] * n_features * 2,
args=(X, y, self.alpha))
self.coef_ = w[:n_features] - w[n_features:]
return self
def predict(self, X):
return np.sign(safe_sparse_dot(X, self.coef_))
if __name__ == '__main__':
from scipy.spatial.distance import jaccard
from sklearn.linear_model import LogisticRegression
from time import time
# Generate data with known sparsity pattern
n_samples, n_features, n_relevant = 100, 80, 20
X = np.random.randn(n_samples, n_features)
X /= np.linalg.norm(X, axis=1)[:, np.newaxis]
true_coef = np.zeros(n_features)
nonzero_idx = np.random.randint(n_features, size=n_relevant)
true_coef[nonzero_idx] = np.random.randn(n_relevant)
y = np.dot(X, true_coef) + np.random.randn(n_samples) * 0.01
# classification, note: y must be {-1, +1}
y = np.sign(y)
C = 1.0
# Run this solver
t0 = time()
lasso_1 = LbfgsL1Logistic(alpha=1. / C, tol=1e-8).fit(X, y)
t0 = time() - t0
print("l-bfgs-b: time = {:.4f}s acc = {:.8f} ||w - w_true|| = {:.6f} "
"Jacc. sparsity = {:.2f}".format(t0, lasso_1.score(X, y),
np.linalg.norm(lasso_1.coef_ - true_coef),
jaccard(true_coef > 0, lasso_1.coef_ > 0)))
# run liblinear
t0 = time()
lasso_2 = LogisticRegression(penalty='l1', C=C, tol=1e-8,
fit_intercept=False).fit( X, y)
t0 = time() - t0
print("liblinear: time = {:.4f}s acc = {:.8f} ||w - w_true|| = {:.6f} "
"Jacc. sparsity = {:.2f}".format(t0, lasso_2.score(X, y),
np.linalg.norm(lasso_2.coef_ - true_coef),
jaccard(true_coef > 0, lasso_2.coef_ > 0)))
"""l-bfgs-b Lasso solver"""
# Author: Vlad Niculae <[email protected]>
# Suggested by Mathieu Blondel
from __future__ import division, print_function
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.extmath import safe_sparse_dot
class LbfgsLasso(BaseEstimator, RegressorMixin):
def __init__(self, tol=1e-3, alpha=1.0):
"""Lasso solved by L-BFGS-B
Solves the same objective as sklearn.linear_model.Lasso
Parameters
----------
alpha: float, default: 1.0
The amount of regularization to use.
tol: float, default: 1e-3
Convergence tolerance for L-BFGS-B.
"""
self.tol = tol
self.alpha = alpha
def fit(self, X, y):
n_samples, n_features = X.shape
def f(w_extended, *args):
# w_extended is [w_pos -w_neg]
# assert (w_extended >= 0).all()
# abs(w).sum() = w_extended.sum()
w = w_extended[:n_features] - w_extended[n_features:]
return (np.sum((safe_sparse_dot(X, w) - y) ** 2) +
(2 * self.alpha * n_samples) * w_extended.sum())
def fprime(w_extended, *args):
# assert (w_extended >= 0).all()
# dloss_extended = [X' -X']' * ([X -X]' * [w_pos -w_neg] - y)
# = [X' -X']' * (X * w - y)
# = [dloss -dloss]
w = w_extended[:n_features] - w_extended[n_features:]
dloss = 2 * safe_sparse_dot(X.T, safe_sparse_dot(X, w) - y)
grad_extended = np.concatenate([dloss, -dloss])
grad_extended += 2 * self.alpha * n_samples # to match skl lasso
return grad_extended
coef0 = np.zeros(2 * n_features)
w, f, d = fmin_l_bfgs_b(f, x0=coef0, fprime=fprime, pgtol=self.tol,
bounds=[(0, None)] * n_features * 2)
self.coef_ = w[:n_features] - w[n_features:]
return self
def predict(self, X):
return safe_sparse_dot(X, self.coef_)
if __name__ == '__main__':
from scipy.spatial.distance import jaccard
from sklearn.linear_model import Lasso
from time import time
# Generate data with known sparsity pattern
X = np.random.randn(100, 80)
X /= np.linalg.norm(X, axis=1)[:, np.newaxis]
true_coef = np.zeros(80)
nonzero_idx = np.random.randint(80, size=20)
true_coef[nonzero_idx] = np.random.randn(20)
y = np.dot(X, true_coef) + np.random.randn(100) * 0.01
# Run this solver
t0 = time()
lasso_1 = LbfgsLasso(alpha=0.001, tol=1e-8).fit(X, y)
t0 = time() - t0
print("LBFGSB: time = {:.4f}s R2 = {:.8f} ||w - w_true|| = {:.6f} "
"Jacc. sparsity = {:.2f}".format(t0, lasso_1.score(X, y),
np.linalg.norm(lasso_1.coef_ - true_coef),
jaccard(true_coef > 0, lasso_1.coef_ > 0)))
# Run coordinate descent Lasso from scikit-learn
t0 = time()
lasso_2 = Lasso(alpha=0.001, fit_intercept=False, tol=1e-8).fit(X, y)
t0 = time() - t0
print("CD: time = {:.4f}s R2 = {:.8f} ||w - w_true|| = {:.6f} "
"Jacc. sparsity = {:.2f}".format(t0, lasso_2.score(X, y),
np.linalg.norm(lasso_2.coef_ - true_coef),
jaccard(true_coef > 0, lasso_2.coef_ > 0)))
@jpvcaetano

Hey @MLnick, I'm also wondering about that. Have you found the answer yet?

@kyle-pena-nlp commented Mar 9, 2021

I believe the loss function in LbfgsL1Logistic is incorrect (I didn't look at the other class, and I can't speak to the gradient part of the calculation).

The loss for logistic regression includes terms for both the positive and the negative class:

h = safe_sparse_dot(X, w)
(y * np.log(h + eps)) + ((1 - y) * np.log(1 - h + eps))
... or something similar.

Also, the user may or may not wish to include a bias term (perhaps this could be achieved by mean-centering y prior to fitting, or by including a bias in the calculation of h, per this comment).
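
A minimal sketch of the intercept idea, following the gist's extended-variable setup (the function name `_l1_logistic_loss_grad_bias` and the extra coordinate are illustrative additions, not part of the original code):

import numpy as np
from scipy.optimize import fmin_l_bfgs_b
from scipy.special import expit
from sklearn.utils.extmath import safe_sparse_dot, log_logistic

def _l1_logistic_loss_grad_bias(w_extended, X, y, alpha):
    # w_extended = [w_pos, w_neg, b]; b is an unpenalized intercept
    n_features = X.shape[1]
    w = w_extended[:n_features] - w_extended[n_features:2 * n_features]
    b = w_extended[-1]
    yz = y * (safe_sparse_dot(X, w) + b)
    # penalize only the split weights, not the intercept
    out = -np.sum(log_logistic(yz)) + alpha * w_extended[:-1].sum()
    z0 = (expit(yz) - 1) * y
    grad_w = safe_sparse_dot(X.T, z0)
    grad = np.concatenate([grad_w + alpha, -grad_w + alpha, [z0.sum()]])
    return out, grad

# the intercept coordinate is left unbounded; the split weights stay non-negative
# bounds = [(0, None)] * (2 * n_features) + [(None, None)]
# w, f, d = fmin_l_bfgs_b(_l1_logistic_loss_grad_bias,
#                         x0=np.zeros(2 * n_features + 1),
#                         pgtol=1e-8, bounds=bounds, args=(X, y, alpha))
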

@vene (author) commented Mar 10, 2021

Oops, sorry, I realize I missed some very old questions on here!

@MLnick @jpvcaetano: I think you're correct, and you could also include an L2 penalty, leading to an elastic net. But don't: there are much better implementations, and this is a proof of concept.
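
For reference, a minimal sketch of that elastic-net variant, reusing the commented-out L2 lines from the gist's loss function (the `l2` parameter name is illustrative; assumes the same imports as the first script):

def _enet_logistic_loss_grad(w_extended, X, y, alpha, l2):
    n_features = X.shape[1]
    w = w_extended[:n_features] - w_extended[n_features:]
    yz = y * safe_sparse_dot(X, w)
    out = -np.sum(log_logistic(yz))
    out += alpha * w_extended.sum()   # L1 part, as in the gist
    out += .5 * l2 * np.dot(w, w)     # extra L2 part -> elastic net
    z0 = (expit(yz) - 1) * y
    grad = safe_sparse_dot(X.T, z0) + l2 * w
    grad = np.concatenate([grad, -grad])
    grad += alpha
    return out, grad
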

@kyle-pena-nlp I think you missed that in this code y is in {-1, 1} rather than {0, 1} (see line 86). So we have that:

P(Y = positive class) = sigmoid(h)
P(Y = negative class) = sigmoid(-h)

so log P(Y = y) = log_sigmoid(y * h).
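
A quick numerical check of this identity (a small self-contained snippet; the toy values are illustrative):

import numpy as np
from scipy.special import expit

z = np.array([-2.0, -0.5, 0.3, 1.7])   # scores h = X @ w
y = np.array([-1, 1, 1, -1])           # labels in {-1, +1}
y01 = (y + 1) // 2                     # the same labels in {0, 1}

loss_pm1 = -np.log(expit(y * z))                           # -log sigmoid(y * h)
p = expit(z)
loss_01 = -(y01 * np.log(p) + (1 - y01) * np.log(1 - p))   # standard cross-entropy
print(np.allclose(loss_pm1, loss_01))                      # True
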

I agree that a bias is useful in practice, but also you wouldn't want to use this in practice, right? (if you do -- why?)

@kyle-pena-nlp

vene, you are correct. I missed that detail.
All the best!
