""" | |
(C) August 2013, Mathieu Blondel | |
# License: BSD 3 clause | |
Custom group support by Vlad Niculae ([email protected]) | |
This is a Numba-based reimplementation of the block coordinate descent solver | |
(without line search) described in the paper: | |
Block Coordinate Descent Algorithms for Large-scale Sparse Multiclass | |
Classification. Mathieu Blondel, Kazuhiro Seki, and Kuniaki Uehara. | |
Machine Learning, May 2013. | |
The reference Cython implementation is avaible from the "primal_cd" module in: | |
https://github.com/mblondel/lightning | |
The reference Cython implementation appears to be roughly 2x faster than this | |
implementation. | |
""" | |
from __future__ import print_function

from numba import jit
import numpy as np
import scipy.sparse as sp

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils import check_random_state


@jit("void(f8[:], i4[:], i4[:], i4[:], f8, f8[:])") | |
def _inv_step_sizes(X_data, X_indptr, group_ids, group_indptr, scale, out): | |
"""Compute the block-wise inverse step sizes (Lipschitz constants).""" | |
    for group_no in range(len(group_indptr) - 1):
        group = group_ids[group_indptr[group_no]:group_indptr[group_no + 1]]
        sqnorm = 0
        for j in group:
            for k in range(X_indptr[j], X_indptr[j + 1]):
                sqnorm += X_data[k] * X_data[k]
        out[group_no] = scale * sqnorm


@jit("void(f8[:], i4[:], i4[:], i4[:], f8[:,:], i4[:], f8, f8[:])") | |
def _grad(X_data, X_indices, X_indptr, y, A, group, C, out): | |
"""Compute the partial gradient for the j^th block | |
(vector of size n_classes x group_size).""" | |
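    # A[r, i] caches the slack 1 + w_r . x_i - w_{y_i} . x_i (initialized to
    # ones in _fit while coef is zero, then maintained incrementally by
    # _update_A); only samples with positive slack contribute to the
    # squared-hinge gradient.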
    n_classes = out.shape[0]
    for feat_no, j in enumerate(group):
        for r in range(n_classes):
            for k in range(X_indptr[j], X_indptr[j + 1]):
                i = X_indices[k]
                if y[i] == r:
                    continue
                if A[r, i] > 0:
                    update = 2 * C * A[r, i] * X_data[k]
                    out[y[i], feat_no] -= update
                    out[r, feat_no] += update


@jit("void(f8[:], i4[:], f8, f8, f8[:], f8[:, :], f8)") | |
def _update_coef(grad, group, step_size, alpha, update, coef, tol): | |
"""Update the j^th block of the coefficient matrix.""" | |
    n_classes = grad.shape[0]
    update_norm = 0
    for feat_no, j in enumerate(group):
        for r in range(n_classes):
            update[r, feat_no] = coef[r, j] - step_size * grad[r, feat_no]
            update_norm += update[r, feat_no] * update[r, feat_no]
    update_norm = np.sqrt(update_norm)

    if update_norm < tol:
        scale = 0
    else:
        mu = alpha * step_size
        scale = 1 - mu / update_norm
        if scale < 0:
            scale = 0

    for feat_no, j in enumerate(group):
        for r in range(n_classes):
            old = coef[r, j]
            coef[r, j] = scale * update[r, feat_no]
            update[r, feat_no] = coef[r, j] - old


@jit("void(f8[:], i4[:], i4[:], i4[:], i4[:], f8[:], f8[:, :])") | |
def _update_A(X_data, X_indices, X_indptr, y, group, update, A): | |
"""Update matrix A (see paper).""" | |
    n_classes = A.shape[0]
    for feat_no, j in enumerate(group):
        for r in range(n_classes):
            for k in range(X_indptr[j], X_indptr[j + 1]):
                i = X_indices[k]
                if y[i] == r:
                    continue
                A[r, i] += (update[r, feat_no] - update[y[i], feat_no]) * X_data[k]


@jit("f8(f8[:], f8[:], i4[:], f8)") | |
def _violation(grad, coef, group, alpha): | |
"""Compute optimality violation for the j^th block.""" | |
    n_classes = grad.shape[0]
    coef_norm = 0
    grad_norm = 0
    for feat_no, j in enumerate(group):
        for r in range(n_classes):
            coef_norm += coef[r, j] * coef[r, j]
            grad_norm += grad[r, feat_no] * grad[r, feat_no]
    grad_norm = np.sqrt(grad_norm)

    if coef_norm == 0:
        violation = max(grad_norm - alpha, 0)
    else:
        violation = np.abs(grad_norm - alpha)
    return violation


@jit("void(f8[:], i4[:], i4[:], i4[:], i4[:], i4[:], i4, f8, f8, f8, i4, f8[:,:])") | |
def _fit(X_data, X_indices, X_indptr, y, group_ids, group_indptr, max_iter, | |
alpha, C, tol, verbose, coef): | |
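    # Randomized block coordinate descent: each epoch visits n_groups blocks
    # chosen uniformly at random, takes a prox-gradient step on each, and
    # stops once the largest optimality violation drops below tol relative
    # to the first epoch's.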
    n_samples = y.shape[0]
    n_groups = group_indptr.shape[0] - 1
    n_classes, n_features = coef.shape

    inv_step_sizes = np.zeros(n_groups, dtype=np.float64)
    _inv_step_sizes(X_data, X_indptr, group_ids, group_indptr,
                    C * 4 * (n_classes - 1),
                    inv_step_sizes)

    # Size the scratch buffers for the largest group.
    max_group_size = 0
    for g in range(n_groups):
        size = group_indptr[g + 1] - group_indptr[g]
        if size > max_group_size:
            max_group_size = size

    grad = np.empty((n_classes, max_group_size), dtype=np.float64)
    update = np.zeros((n_classes, max_group_size), dtype=np.float64)
    A = np.ones((n_classes, n_samples), dtype=np.float64)
    rs = np.random.RandomState(None)

    violation_init = 0
    for it in range(max_iter):
        violation_max = 0

        for _ in range(n_groups):
            # randint's upper bound is exclusive: sample over all n_groups
            # blocks, including the last one.
            j = rs.randint(n_groups)
            group = group_ids[group_indptr[j]:group_indptr[j + 1]]

            if inv_step_sizes[j] == 0:
                continue

            grad.fill(0)
            _grad(X_data, X_indices, X_indptr, y, A, group, C, grad)
            violation = _violation(grad, coef, group, alpha)

            _update_coef(grad, group, 1. / inv_step_sizes[j], alpha,
                         update, coef, tol)
            _update_A(X_data, X_indices, X_indptr, y, group, update, A)

            if violation > violation_max:
                violation_max = violation

        if it == 0:
            violation_init = violation_max

        if violation_max == violation_init == 0:
            ratio = 0.0
        else:
            ratio = violation_max / violation_init

        if verbose >= 1:
            print("Violation {:.2f}".format(ratio))

        if ratio < tol:
            if verbose >= 1:
                print("Converged at iter", it + 1)
            break


def make_group_classification(n_samples=200, group_size=10,
                              n_classes=2, random_state=None):
    """Make a classification dataset for sparse group lasso.

    Implements Sec. 4 from "A note on the group lasso and a sparse group
    lasso", by Friedman et al.
    """
    rng = check_random_state(random_state)
    n_nonzero = [10, 8, 6, 4, 2, 1, 0, 0, 0, 0]
    n_features = len(n_nonzero) * group_size
    X = rng.randn(n_samples, n_features)
    coef = np.zeros((n_features, n_classes))
    group_starts = list(range(0, n_features, group_size))
    for start, this_n_nonzero in zip(group_starts, n_nonzero):
        for klass in range(n_classes):
            # Pick this_n_nonzero random features within the group and give
            # them random +/-1 weights.
            idx = np.arange(group_size)
            rng.shuffle(idx)
            coef[start + idx[:this_n_nonzero], klass] = rng.choice(
                [-1, +1],
                size=this_n_nonzero)
    y = np.argmax(np.dot(X, coef), axis=1)
    return X, y, group_starts


class SparseMulticlassClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, alpha=1, C=1, max_iter=20, tol=0.05, verbose=0):
        self.alpha = alpha
        self.C = C
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose

    def fit(self, X, y, groups=None):
        X = sp.csc_matrix(X)
        n_samples, n_features = X.shape

        self._enc = LabelEncoder()
        y = self._enc.fit_transform(y).astype(np.int32)
        n_classes = len(self._enc.classes_)
        self.coef_ = np.zeros((n_classes, n_features), dtype=np.float64)

        if groups is None:
            # Fall back to singleton groups: one group per feature.
            groups = [[j] for j in range(n_features)]
        # Flatten the groups into the (ids, offsets) layout that _fit
        # expects; cast to int32 to match the jit signatures.
        group_ids = np.hstack(groups).astype(np.int32)
        group_offset = np.cumsum(
            np.array([0] + [len(k) for k in groups])).astype(np.int32)

        _fit(X.data, X.indices, X.indptr, y, group_ids, group_offset,
             self.max_iter, self.alpha, self.C, self.tol, self.verbose,
             self.coef_)
        return self

    def decision_function(self, X):
        return safe_sparse_dot(X, self.coef_.T)

    def predict(self, X):
        pred = self.decision_function(X)
        pred = np.argmax(pred, axis=1)
        return self._enc.inverse_transform(pred)

    def n_nonzero(self, percentage=False):
        n_nz = np.sum(np.sum(self.coef_ != 0, axis=0, dtype=bool))
        if percentage:
            n_nz /= float(self.coef_.shape[1])
        return n_nz


if __name__ == '__main__':
    import time

    X, y, group_starts = make_group_classification(group_size=10, n_classes=3)
    groups = [range(k, k + 10) for k in group_starts]
    X = sp.csr_matrix(X)
    print(X.shape, y.shape)

    s = time.time()
    clf_ = SparseMulticlassClassifier(C=0.8 / X.shape[0], alpha=1, tol=1e-3,
                                      max_iter=50, verbose=1)
    clf_.fit(X, y, groups)
    training_time = time.time() - s
    print("Numba")
    print(training_time)
    print(clf_.score(X, y))
    print(clf_.n_nonzero(percentage=True))
    print()

    from lightning.classification import CDClassifier

    clf = CDClassifier(C=0.1 / X.shape[0], alpha=0.1, tol=1e-3, max_iter=100,
                       multiclass=True, penalty="l1/l2", shrinking=False,
                       max_steps=0, selection="uniform", verbose=0)
    s = time.time()
    clf.fit(X, y)
    training_time = time.time() - s
    print("Cython")
    print(training_time)
    print(clf.score(X, y))
    print(clf.n_nonzero(percentage=True))
    print()