"""l-bfgs-b L1-Logistic Regression solver""" | |
# Author: Vlad Niculae <[email protected]> | |
# Suggested by Mathieu Blondel | |
from __future__ import division, print_function | |
import numpy as np | |
from scipy.optimize import fmin_l_bfgs_b | |
from sklearn.base import BaseEstimator, ClassifierMixin | |
from sklearn.utils.extmath import safe_sparse_dot, log_logistic | |
from sklearn.utils.fixes import expit | |


def _l1_logistic_loss_grad(w_extended, X, y, alpha):
    _, n_features = X.shape

    # w_extended is the concatenation [w_pos, w_neg] with both halves >= 0,
    # so that w = w_pos - w_neg and abs(w).sum() == w_extended.sum().
    w = w_extended[:n_features] - w_extended[n_features:]
    yz = y * safe_sparse_dot(X, w)

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(log_logistic(yz))
    # out += .5 * alpha * np.dot(w, w)  # L2
    out += alpha * w_extended.sum()  # L1, w_extended is non-negative

    z = expit(yz)
    z0 = (z - 1) * y
    grad = safe_sparse_dot(X.T, z0)
    grad = np.concatenate([grad, -grad])
    # grad += alpha * np.concatenate([w, -w])  # L2
    grad += alpha  # L1

    return out, grad


class LbfgsL1Logistic(BaseEstimator, ClassifierMixin):
    def __init__(self, tol=1e-3, alpha=1.0):
        """Logistic Regression Lasso solved by L-BFGS-B

        Solves the same objective as sklearn.linear_model.LogisticRegression
        with penalty='l1' and fit_intercept=False, where alpha = 1 / C.

        Parameters
        ----------
        alpha : float, default: 1.0
            The amount of regularization to use.

        tol : float, default: 1e-3
            Convergence tolerance for L-BFGS-B.
        """
        self.tol = tol
        self.alpha = alpha

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Reformulate |w| via w = w_pos - w_neg with w_pos, w_neg >= 0:
        # the objective becomes smooth and the non-negativity constraints
        # are handled by L-BFGS-B's box bounds.
        coef0 = np.zeros(2 * n_features)
        w, f, d = fmin_l_bfgs_b(_l1_logistic_loss_grad, x0=coef0, fprime=None,
                                pgtol=self.tol,
                                bounds=[(0, None)] * n_features * 2,
                                args=(X, y, self.alpha))
        self.coef_ = w[:n_features] - w[n_features:]

        return self

    def predict(self, X):
        return np.sign(safe_sparse_dot(X, self.coef_))


if __name__ == '__main__':
    from scipy.spatial.distance import jaccard
    from sklearn.linear_model import LogisticRegression
    from time import time

    # Generate data with a known sparsity pattern
    n_samples, n_features, n_relevant = 100, 80, 20
    X = np.random.randn(n_samples, n_features)
    X /= np.linalg.norm(X, axis=1)[:, np.newaxis]

    true_coef = np.zeros(n_features)
    nonzero_idx = np.random.randint(n_features, size=n_relevant)
    true_coef[nonzero_idx] = np.random.randn(n_relevant)
    y = np.dot(X, true_coef) + np.random.randn(n_samples) * 0.01

    # classification, note: y must be {-1, +1}
    y = np.sign(y)

    C = 1.0

    # Run this solver
    t0 = time()
    lasso_1 = LbfgsL1Logistic(alpha=1. / C, tol=1e-8).fit(X, y)
    t0 = time() - t0
    print("l-bfgs-b: time = {:.4f}s acc = {:.8f} ||w - w_true|| = {:.6f} "
          "Jacc. sparsity = {:.2f}".format(t0, lasso_1.score(X, y),
                                           np.linalg.norm(lasso_1.coef_ - true_coef),
                                           jaccard(true_coef != 0, lasso_1.coef_ != 0)))

    # Run liblinear
    t0 = time()
    lasso_2 = LogisticRegression(penalty='l1', C=C, tol=1e-8,
                                 solver='liblinear',
                                 fit_intercept=False).fit(X, y)
    t0 = time() - t0
    print("liblinear: time = {:.4f}s acc = {:.8f} ||w - w_true|| = {:.6f} "
          "Jacc. sparsity = {:.2f}".format(t0, lasso_2.score(X, y),
                                           np.linalg.norm(lasso_2.coef_ - true_coef),
                                           jaccard(true_coef != 0, lasso_2.coef_ != 0)))
"""l-bfgs-b Lasso solver""" | |
# Author: Vlad Niculae <[email protected]> | |
# Suggested by Mathieu Blondel | |
from __future__ import division, print_function | |
import numpy as np | |
from scipy.optimize import fmin_l_bfgs_b | |
from sklearn.base import BaseEstimator, RegressorMixin | |
from sklearn.utils.extmath import safe_sparse_dot | |


class LbfgsLasso(BaseEstimator, RegressorMixin):
    def __init__(self, tol=1e-3, alpha=1.0):
        """Lasso solved by L-BFGS-B

        Solves the same objective as sklearn.linear_model.Lasso
        (up to a constant factor, with fit_intercept=False).

        Parameters
        ----------
        alpha : float, default: 1.0
            The amount of regularization to use.

        tol : float, default: 1e-3
            Convergence tolerance for L-BFGS-B.
        """
        self.tol = tol
        self.alpha = alpha

    def fit(self, X, y):
        n_samples, n_features = X.shape

        def f(w_extended, *args):
            # w_extended = [w_pos, w_neg], both halves non-negative,
            # so w = w_pos - w_neg and abs(w).sum() == w_extended.sum()
            w = w_extended[:n_features] - w_extended[n_features:]
            return (np.sum((safe_sparse_dot(X, w) - y) ** 2) +
                    (2 * self.alpha * n_samples) * w_extended.sum())

        def fprime(w_extended, *args):
            # dloss_extended = [X, -X]' @ ([X, -X] @ w_extended - y)
            #                = [X, -X]' @ (X @ w - y)
            #                = [dloss, -dloss]
            w = w_extended[:n_features] - w_extended[n_features:]
            dloss = 2 * safe_sparse_dot(X.T, safe_sparse_dot(X, w) - y)
            grad_extended = np.concatenate([dloss, -dloss])
            grad_extended += 2 * self.alpha * n_samples  # to match skl Lasso scaling
            return grad_extended

        coef0 = np.zeros(2 * n_features)
        w, f, d = fmin_l_bfgs_b(f, x0=coef0, fprime=fprime, pgtol=self.tol,
                                bounds=[(0, None)] * n_features * 2)
        self.coef_ = w[:n_features] - w[n_features:]

        return self

    def predict(self, X):
        return safe_sparse_dot(X, self.coef_)


if __name__ == '__main__':
    from scipy.spatial.distance import jaccard
    from sklearn.linear_model import Lasso
    from time import time

    # Generate data with a known sparsity pattern
    X = np.random.randn(100, 80)
    X /= np.linalg.norm(X, axis=1)[:, np.newaxis]
    true_coef = np.zeros(80)
    nonzero_idx = np.random.randint(80, size=20)
    true_coef[nonzero_idx] = np.random.randn(20)
    y = np.dot(X, true_coef) + np.random.randn(100) * 0.01

    # Run this solver
    t0 = time()
    lasso_1 = LbfgsLasso(alpha=0.001, tol=1e-8).fit(X, y)
    t0 = time() - t0
    print("LBFGSB: time = {:.4f}s R2 = {:.8f} ||w - w_true|| = {:.6f} "
          "Jacc. sparsity = {:.2f}".format(t0, lasso_1.score(X, y),
                                           np.linalg.norm(lasso_1.coef_ - true_coef),
                                           jaccard(true_coef != 0, lasso_1.coef_ != 0)))

    # Run coordinate descent Lasso from scikit-learn
    t0 = time()
    lasso_2 = Lasso(alpha=0.001, fit_intercept=False, tol=1e-8).fit(X, y)
    t0 = time() - t0
    print("CD: time = {:.4f}s R2 = {:.8f} ||w - w_true|| = {:.6f} "
          "Jacc. sparsity = {:.2f}".format(t0, lasso_2.score(X, y),
                                           np.linalg.norm(lasso_2.coef_ - true_coef),
                                           jaccard(true_coef != 0, lasso_2.coef_ != 0)))

Hey @MLnick, I'm also wondering that (re: the elastic-net question below). Did you ever find the answer?

I believe the loss function in LbfgsL1Logistic is incorrect (I didn't look at the other class, and I can't speak to the gradient part of the calculation). The loss for logistic regression includes terms for both the positive and the negative class:

h = safe_sparse_dot(X, w)
(y * np.log(h + eps)) + ((1 - y) * np.log(1 - h + eps))

... or something similar.

Also, the user may or may not wish to include a bias term (perhaps this could be achieved by mean-centering y prior to fitting, or by including a bias in the calculation of h, per this comment).
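
One simple workaround for a bias with the solver as written is sketched below: append a constant column to X, so its weight acts as an intercept (note it is then L1-penalized like any other coefficient, unlike a true unpenalized intercept). This assumes the dense X, y from the example above:

import numpy as np

X_bias = np.hstack([X, np.ones((X.shape[0], 1))])  # last column plays the role of the bias
clf = LbfgsL1Logistic(alpha=1.0).fit(X_bias, y)
w, b = clf.coef_[:-1], clf.coef_[-1]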

Oops, sorry, I realize I missed some very old questions on here!

@MLnick @jpvcaetano: I think you're correct; you could include an L2 penalty as well, leading to elastic net. But don't: there are much better implementations out there, and this is just a proof of concept.

@kyle-pena-nlp I think you missed that in this code y is in {-1, +1} rather than {0, 1} (see the y = np.sign(y) line in the example). So we have:

P(Y = +1) = sigmoid(h)
P(Y = -1) = sigmoid(-h)

so log P(Y = y) = log(sigmoid(y * h)).

I agree that a bias is useful in practice, but then you wouldn't want to use this code in practice anyway, right? (And if you do, why?)

vene, you are correct. I missed that detail.
All the best!
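
For a quick numerical check of the identity above (a minimal sketch, assuming y in {-1, +1} and writing h for the decision values X @ w):

import numpy as np
from scipy.special import expit

rng = np.random.RandomState(0)
h = rng.randn(5)                 # decision values X @ w
y = np.sign(rng.randn(5))        # labels in {-1, +1}
y01 = (y + 1) / 2                # the same labels recoded to {0, 1}

# -log sigmoid(y * h), the loss used in the gist
loss_pm1 = -np.log(expit(y * h))
# cross-entropy with {0, 1} labels, as in the comment above
loss_01 = -(y01 * np.log(expit(h)) + (1 - y01) * np.log(1 - expit(h)))

print(np.allclose(loss_pm1, loss_01))  # True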

@vene this is cool! Just wanted to double check: one could do elastic net by uncommenting the L2 lines (https://gist.github.com/vene/fab06038a00309569cdd#file-lbfgs_l1logistic-py-L24) and setting the relative elastic-net weights for the regularization parameters?
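
Roughly yes (see vene's reply above). A minimal sketch of what an elastic-net variant of the loss/gradient could look like, reusing the imports from the gist and taking hypothetical l1 / l2 weights; note the L2 gradient term has to be expanded to match the doubled parameter vector:

def _enet_logistic_loss_grad(w_extended, X, y, l1, l2):
    _, n_features = X.shape
    w = w_extended[:n_features] - w_extended[n_features:]
    yz = y * safe_sparse_dot(X, w)

    out = -np.sum(log_logistic(yz))
    out += l1 * w_extended.sum()     # L1 term: abs(w).sum() == w_extended.sum()
    out += .5 * l2 * np.dot(w, w)    # L2 term on the signed coefficients

    z0 = (expit(yz) - 1) * y
    grad = safe_sparse_dot(X.T, z0)
    grad = np.concatenate([grad, -grad])
    grad += l1                               # gradient of the L1 term
    grad += l2 * np.concatenate([w, -w])     # gradient of the L2 term
    return out, grad

Passing this function to fmin_l_bfgs_b with args=(X, y, l1, l2) in place of _l1_logistic_loss_grad would then give a rough elastic-net logistic regression, with l1 and l2 controlling the relative penalty weights.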