| """Kernel K-means""" | |
| # Author: Mathieu Blondel <[email protected]> | |
| # License: BSD 3 clause | |
| import numpy as np | |
| from sklearn.base import BaseEstimator, ClusterMixin | |
| from sklearn.metrics.pairwise import pairwise_kernels | |
| from sklearn.utils import check_random_state | |
| class KernelKMeans(BaseEstimator, ClusterMixin): | |
| """ | |
| Kernel K-means | |
| Reference | |
| --------- | |
| Kernel k-means, Spectral Clustering and Normalized Cuts. | |
| Inderjit S. Dhillon, Yuqiang Guan, Brian Kulis. | |
| KDD 2004. | |
| """ | |
| def __init__(self, n_clusters=3, max_iter=50, tol=1e-3, random_state=None, | |
| kernel="linear", gamma=None, degree=3, coef0=1, | |
| kernel_params=None, verbose=0): | |
| self.n_clusters = n_clusters | |
| self.max_iter = max_iter | |
| self.tol = tol | |
| self.random_state = random_state | |
| self.kernel = kernel | |
| self.gamma = gamma | |
| self.degree = degree | |
| self.coef0 = coef0 | |
| self.kernel_params = kernel_params | |
| self.verbose = verbose | |
| @property | |
| def _pairwise(self): | |
| return self.kernel == "precomputed" | |
| def _get_kernel(self, X, Y=None): | |
| if callable(self.kernel): | |
| params = self.kernel_params or {} | |
| else: | |
| params = {"gamma": self.gamma, | |
| "degree": self.degree, | |
| "coef0": self.coef0} | |
| return pairwise_kernels(X, Y, metric=self.kernel, | |
| filter_params=True, **params) | |
| def fit(self, X, y=None, sample_weight=None): | |
| n_samples = X.shape[0] | |
| K = self._get_kernel(X) | |
| sw = sample_weight if sample_weight else np.ones(n_samples) | |
| self.sample_weight_ = sw | |
| rs = check_random_state(self.random_state) | |
| self.labels_ = rs.randint(self.n_clusters, size=n_samples) | |
| dist = np.zeros((n_samples, self.n_clusters)) | |
| self.within_distances_ = np.zeros(self.n_clusters) | |
| for it in xrange(self.max_iter): | |
| dist.fill(0) | |
| self._compute_dist(K, dist, self.within_distances_, | |
| update_within=True) | |
| labels_old = self.labels_ | |
| self.labels_ = dist.argmin(axis=1) | |
| # Compute the number of samples whose cluster did not change | |
| # since last iteration. | |
| n_same = np.sum((self.labels_ - labels_old) == 0) | |
| if 1 - float(n_same) / n_samples < self.tol: | |
| if self.verbose: | |
| print "Converged at iteration", it + 1 | |
| break | |
| self.X_fit_ = X | |
| return self | |
| def _compute_dist(self, K, dist, within_distances, update_within): | |
| """Compute a n_samples x n_clusters distance matrix using the | |
| kernel trick.""" | |
| sw = self.sample_weight_ | |
| for j in xrange(self.n_clusters): | |
| mask = self.labels_ == j | |
| if np.sum(mask) == 0: | |
| raise ValueError("Empty cluster found, try smaller n_cluster.") | |
| denom = sw[mask].sum() | |
| denomsq = denom * denom | |
| if update_within: | |
| KK = K[mask][:, mask] # K[mask, mask] does not work. | |
| dist_j = np.sum(np.outer(sw[mask], sw[mask]) * KK / denomsq) | |
| within_distances[j] = dist_j | |
| dist[:, j] += dist_j | |
| else: | |
| dist[:, j] += within_distances[j] | |
| dist[:, j] -= 2 * np.sum(sw[mask] * K[:, mask], axis=1) / denom | |
| def predict(self, X): | |
| K = self._get_kernel(X, self.X_fit_) | |
| n_samples = X.shape[0] | |
| dist = np.zeros((n_samples, self.n_clusters)) | |
| self._compute_dist(K, dist, self.within_distances_, | |
| update_within=False) | |
| return dist.argmin(axis=1) | |
| if __name__ == '__main__': | |
| from sklearn.datasets import make_blobs | |
| X, y = make_blobs(n_samples=1000, centers=5, random_state=0) | |
| km = KernelKMeans(n_clusters=5, max_iter=100, random_state=0, verbose=1) | |
| print km.fit_predict(X)[:10] | |
| print km.predict(X[:10]) |
No, I'm too lazy to write documentation and examples :b Feel free to reuse this gist and send us a pull request!
Any advice on how to change the kernel to our own formula? I want a triangular sine kernel.
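Since _get_kernel checks callable(self.kernel), you can pass any Python function that takes two sample vectors and returns a scalar; pairwise_kernels will apply it to every pair of rows. A minimal sketch follows. The "triangular sine" formula below is only a made-up placeholder, and note that kernel k-means assumes a positive semi-definite kernel:

import numpy as np

def triangle_sine_kernel(x, y):
    # x and y are single 1-D sample vectors; return a scalar similarity.
    # Placeholder formula -- substitute your own.
    return np.abs(np.sin(np.dot(x, y)))

km = KernelKMeans(n_clusters=3, kernel=triangle_sine_kernel)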
How did this compare to spectral clustering in practice? Do you think it is worth adding to scikit-learn?
@amueller Sorry for the late reply (why no notifications in gists?!). I haven't compared it to other algorithms but I am open to inclusion in scikit-learn if someone wants to work on it.
@mblondel , how related (or not) is this to Graclus? [ http://www.cs.utexas.edu/users/dml/Software/graclus.html ] Being able to work with an out-of-core kernel matrix would be a fantastic addition to scikit-learn.
Hello, is it possible for your algorithm to take a distance matrix as a parameter instead of an (n_samples, n_features) matrix?
If so, which parameter specifies the type of the input matrix? Thanks.
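Yes: the class supports kernel="precomputed" (see the _pairwise property), in which case fit expects an n_samples x n_samples kernel (similarity) matrix rather than raw features. It must be a similarity matrix, not a distance matrix, so a distance matrix would need converting first. A minimal sketch, where the RBF-style conversion and the gamma value are only illustrations:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
D = pairwise_distances(X)     # the distance matrix you already have
K = np.exp(-0.1 * D ** 2)     # convert distances to similarities

km = KernelKMeans(n_clusters=3, kernel="precomputed")
labels = km.fit_predict(K)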
It gives me the same result as scikit-learn's KMeans. How is that possible? I have set kernel='rbf'.
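With well-separated blobs, an RBF kernel often produces the same partition as plain k-means, so matching results are expected there. The difference shows up on data that is not linearly separable. A quick sketch (gamma chosen by hand, not tuned):

from sklearn.datasets import make_circles
from sklearn.cluster import KMeans

# Two concentric rings: plain KMeans cannot separate them, while an RBF
# kernel with a suitable gamma can.
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
plain = KMeans(n_clusters=2, random_state=0).fit_predict(X)
kernelized = KernelKMeans(n_clusters=2, kernel="rbf", gamma=10,
                          random_state=0).fit_predict(X)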
This is great! Many thanks
What other kernels can we use? Could you give some info on how to use different kernels?
Is there a way to make this not fail when an empty cluster is found?
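Not as written: _compute_dist raises as soon as a cluster empties. One possible workaround (an untested sketch, not part of the gist) is to reseed the empty cluster instead of raising:

if np.sum(mask) == 0:
    # Move a random sample into the empty cluster instead of failing.
    # (rs would need to be a RandomState kept on the estimator.)
    idx = rs.randint(len(self.labels_))
    self.labels_[idx] = j
    mask = self.labels_ == j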
How can I use k-means++ initialization with this code?
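k-means++ needs pairwise distances, and in kernel space these follow directly from the Gram matrix: d2(x_i, x_j) = K_ii + K_jj - 2 * K_ij. A rough sketch of k-means++-style seeding (not part of the gist; you would use its output in place of the random labels_ initialization in fit):

import numpy as np

def kernel_kmeanspp_labels(K, n_clusters, rng):
    # Squared feature-space distance: d2(i, j) = K[i,i] + K[j,j] - 2 K[i,j].
    n = K.shape[0]
    diag = np.diag(K)
    seeds = [rng.randint(n)]
    # Squared distance of every sample to its closest seed so far.
    d2 = diag + diag[seeds[0]] - 2 * K[:, seeds[0]]
    for _ in range(n_clusters - 1):
        probs = np.maximum(d2, 0)
        seeds.append(rng.choice(n, p=probs / probs.sum()))
        d2 = np.minimum(d2, diag + diag[seeds[-1]] - 2 * K[:, seeds[-1]])
    # Initial labels: nearest seed per sample.
    dists = diag[:, None] + diag[seeds][None, :] - 2 * K[:, seeds]
    return dists.argmin(axis=1)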
Important note for everyone:
If you are looking for a Gaussian kernel, just pass 'rbf' as the kernel parameter.
I sometimes forget this.
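For example (gamma is the RBF bandwidth; the value below is only an illustration):

km = KernelKMeans(n_clusters=5, kernel="rbf", gamma=0.1)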
Any advice on how to extend this to multi-kernel k-means?
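Multi-kernel k-means proper learns the kernel weights, which this gist does not do. A fixed-weight combination is easy to sketch, though, by precomputing the combined Gram matrix (the weights and kernel choices below are arbitrary):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import pairwise_kernels

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
# Convex combination of an RBF kernel and a polynomial kernel.
K = (0.5 * pairwise_kernels(X, metric="rbf", gamma=0.1)
     + 0.5 * pairwise_kernels(X, metric="poly", degree=2))
labels = KernelKMeans(n_clusters=3, kernel="precomputed").fit_predict(K)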
The code was written for Python 2 and you're using Python 3. Replace xrange by range and print ... by print(...).
Sure, go ahead!
How to get the cluster centers, any idea?
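The centers only exist implicitly in kernel feature space, so there is no cluster_centers_ attribute to expose. One workaround is to report the training sample closest to each implicit center. A sketch that leans on the gist's private helpers (illustrative only):

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=5, random_state=0)
km = KernelKMeans(n_clusters=5, kernel="rbf", random_state=0).fit(X)

# Distance of every training sample to each implicit center.
K = km._get_kernel(X, km.X_fit_)
dist = np.zeros((X.shape[0], km.n_clusters))
km._compute_dist(K, dist, km.within_distances_, update_within=False)

nearest = dist.argmin(axis=0)   # one training-sample index per cluster
pseudo_centers = X[nearest]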
Hi. Great code, did you try sending a pull request to scikit-learn?