"""Kernel K-means""" | |
# Author: Mathieu Blondel <[email protected]> | |
# License: BSD 3 clause | |
import numpy as np | |
from sklearn.base import BaseEstimator, ClusterMixin | |
from sklearn.metrics.pairwise import pairwise_kernels | |
from sklearn.utils import check_random_state | |
class KernelKMeans(BaseEstimator, ClusterMixin): | |
""" | |
Kernel K-means | |
Reference | |
--------- | |
Kernel k-means, Spectral Clustering and Normalized Cuts. | |
Inderjit S. Dhillon, Yuqiang Guan, Brian Kulis. | |
KDD 2004. | |
""" | |
def __init__(self, n_clusters=3, max_iter=50, tol=1e-3, random_state=None, | |
kernel="linear", gamma=None, degree=3, coef0=1, | |
kernel_params=None, verbose=0): | |
self.n_clusters = n_clusters | |
self.max_iter = max_iter | |
self.tol = tol | |
self.random_state = random_state | |
self.kernel = kernel | |
self.gamma = gamma | |
self.degree = degree | |
self.coef0 = coef0 | |
self.kernel_params = kernel_params | |
self.verbose = verbose | |
@property | |
def _pairwise(self): | |
return self.kernel == "precomputed" | |
def _get_kernel(self, X, Y=None): | |
if callable(self.kernel): | |
params = self.kernel_params or {} | |
else: | |
params = {"gamma": self.gamma, | |
"degree": self.degree, | |
"coef0": self.coef0} | |
return pairwise_kernels(X, Y, metric=self.kernel, | |
filter_params=True, **params) | |
def fit(self, X, y=None, sample_weight=None): | |
n_samples = X.shape[0] | |
K = self._get_kernel(X) | |
sw = sample_weight if sample_weight else np.ones(n_samples) | |
self.sample_weight_ = sw | |
rs = check_random_state(self.random_state) | |
self.labels_ = rs.randint(self.n_clusters, size=n_samples) | |
dist = np.zeros((n_samples, self.n_clusters)) | |
self.within_distances_ = np.zeros(self.n_clusters) | |
for it in xrange(self.max_iter): | |
dist.fill(0) | |
self._compute_dist(K, dist, self.within_distances_, | |
update_within=True) | |
labels_old = self.labels_ | |
self.labels_ = dist.argmin(axis=1) | |
# Compute the number of samples whose cluster did not change | |
# since last iteration. | |
n_same = np.sum((self.labels_ - labels_old) == 0) | |
if 1 - float(n_same) / n_samples < self.tol: | |
if self.verbose: | |
print "Converged at iteration", it + 1 | |
break | |
self.X_fit_ = X | |
return self | |
def _compute_dist(self, K, dist, within_distances, update_within): | |
"""Compute a n_samples x n_clusters distance matrix using the | |
kernel trick.""" | |
sw = self.sample_weight_ | |
for j in xrange(self.n_clusters): | |
mask = self.labels_ == j | |
if np.sum(mask) == 0: | |
raise ValueError("Empty cluster found, try smaller n_cluster.") | |
denom = sw[mask].sum() | |
denomsq = denom * denom | |
if update_within: | |
KK = K[mask][:, mask] # K[mask, mask] does not work. | |
dist_j = np.sum(np.outer(sw[mask], sw[mask]) * KK / denomsq) | |
within_distances[j] = dist_j | |
dist[:, j] += dist_j | |
else: | |
dist[:, j] += within_distances[j] | |
dist[:, j] -= 2 * np.sum(sw[mask] * K[:, mask], axis=1) / denom | |
def predict(self, X): | |
K = self._get_kernel(X, self.X_fit_) | |
n_samples = X.shape[0] | |
dist = np.zeros((n_samples, self.n_clusters)) | |
self._compute_dist(K, dist, self.within_distances_, | |
update_within=False) | |
return dist.argmin(axis=1) | |
if __name__ == '__main__': | |
from sklearn.datasets import make_blobs | |
X, y = make_blobs(n_samples=1000, centers=5, random_state=0) | |
km = KernelKMeans(n_clusters=5, max_iter=100, random_state=0, verbose=1) | |
print km.fit_predict(X)[:10] | |
print km.predict(X[:10]) |
No, I'm too lazy to write documentation and examples :b Feel free to reuse this gist and send us a pull request!
Any advice on how to change the kernel to our own formula? Wanting a triangular sine kernel.
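Not the author, but as far as I can tell the kernel argument accepts any Python callable, which pairwise_kernels evaluates on pairs of samples (1-D vectors), expecting a scalar similarity back. The formula below is only a placeholder, not an actual triangular sine kernel; substitute your own. Untested sketch:

import numpy as np

def triangular_sine_kernel(x, y):
    # pairwise_kernels calls a custom kernel on two 1-D sample vectors
    # and expects a scalar. This formula is just a stand-in.
    d = np.linalg.norm(x - y)
    return np.sin(d) / d if d > 0 else 1.0

km = KernelKMeans(n_clusters=5, kernel=triangular_sine_kernel)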
How did this compare to spectral clustering in practice? Do you think it is worth adding to scikit-learn?
@amueller Sorry for the late reply (why no notifications in gists?!). I haven't compared it to other algorithms but I am open to inclusion in scikit-learn if someone wants to work on it.
@mblondel , how related (or not) is this to Graclus? [ http://www.cs.utexas.edu/users/dml/Software/graclus.html ] Being able to work with an out-of-core kernel matrix would be a fantastic addition to scikit-learn.
Hello, is it possible for your algorithm to take a distance matrix as input instead of an (n_samples, n_features) matrix?
If so, which parameter specifies the type of the input matrix? Thanks.
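For anyone with the same question, my understanding (untested sketch) is: pass kernel="precomputed" and fit/predict will treat X as a square n_samples x n_samples kernel matrix. Note that it expects similarities (a kernel/Gram matrix), not distances, so a distance matrix D would first need to be converted into similarities, e.g. K = exp(-D).

from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)

# Precompute an (n_samples, n_samples) kernel (similarity) matrix.
K = rbf_kernel(X, gamma=0.1)

km = KernelKMeans(n_clusters=3, kernel="precomputed", random_state=0)
labels = km.fit_predict(K)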
It gives me the same result as scikit-learn's KMeans. How is that possible? I have set kernel='rbf'.
This is great! Many thanks
What other kernels can we use? Could you give some info on how to use different kernels?
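Since the kernel argument is forwarded to sklearn's pairwise_kernels with filter_params=True, the built-in metrics should all work (linear, poly/polynomial, rbf, sigmoid, cosine, chi2, and, in recent scikit-learn versions, laplacian); gamma, degree and coef0 are passed along and irrelevant ones are filtered out. For example (untested):

# gamma is used by rbf/laplacian, degree and coef0 by polynomial;
# filter_params=True drops whichever a given kernel does not take.
km_rbf = KernelKMeans(n_clusters=5, kernel="rbf", gamma=0.1)
km_poly = KernelKMeans(n_clusters=5, kernel="polynomial", degree=3, coef0=1)
km_lap = KernelKMeans(n_clusters=5, kernel="laplacian", gamma=0.1)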
Is there a way to make this not fail when an empty cluster is found?
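One blunt workaround until someone implements a proper fix: since the failure depends on the random initialization, catch the ValueError and retry with a different seed. Untested sketch with a hypothetical helper:

def fit_with_retries(X, n_clusters, n_tries=10):
    # Hypothetical helper: retry fitting with different seeds until no
    # empty cluster is hit; raise if every attempt fails.
    for seed in range(n_tries):
        try:
            return KernelKMeans(n_clusters=n_clusters,
                                random_state=seed).fit(X)
        except ValueError:  # "Empty cluster found" from _compute_dist
            continue
    raise ValueError("All %d initializations produced an empty cluster."
                     % n_tries)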
How can I use k-means++ initialization with this code?
Important note for everyone:
if you are looking for a Gaussian kernel, just pass 'rbf' as the kernel.
Sometimes I forget it.
Any advice on how to extend this to multi-kernel k-means?
The code was written for Python 2 and you're using Python 3. Replace xrange with range and print ... with print(...).
Sure, go ahead!
How to get the cluster centers, any idea?
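As far as I understand, the centers live in the implicit feature space, so there are no explicit coordinates to return; a common stand-in is the medoid, the training sample closest to each center. A rough sketch that reuses the (private) helpers above:

import numpy as np

km = KernelKMeans(n_clusters=5, random_state=0).fit(X)
K = km._get_kernel(X)
dist = np.zeros((X.shape[0], km.n_clusters))
km._compute_dist(K, dist, km.within_distances_, update_within=False)
dist += K.diagonal()[:, None]  # add the K(x, x) term _compute_dist omits
medoids = X[dist.argmin(axis=0)]  # one representative sample per cluster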
Hi. Great code, did you try sending a pull request to scikit-learn?