-
-
Save AlexandreAbraham/5544803 to your computer and use it in GitHub Desktop.
| """ Unsupervised evaluation metrics. """ | |
| # License: BSD Style. | |
| from itertools import combinations | |
| import numpy as np | |
| from sklearn.utils import check_random_state | |
| from sklearn.metrics.pairwise import distance_metrics | |
| from sklearn.metrics.pairwise import pairwise_distances | |
| from sklearn.externals.joblib import Parallel, delayed | |
| def silhouette_score_slow(X, labels, metric='euclidean', sample_size=None, | |
| random_state=None, **kwds): | |
| """Compute the mean Silhouette Coefficient of all samples. | |
| This method is computationally expensive compared to the reference one. | |
| The Silhouette Coefficient is calculated using the mean intra-cluster | |
| distance (a) and the mean nearest-cluster distance (b) for each sample. | |
| The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``. | |
| To clarrify, b is the distance between a sample and the nearest cluster | |
| that b is not a part of. | |
| This function returns the mean Silhoeutte Coefficient over all samples. | |
| To obtain the values for each sample, use silhouette_samples | |
| The best value is 1 and the worst value is -1. Values near 0 indicate | |
| overlapping clusters. Negative values generally indicate that a sample has | |
| been assigned to the wrong cluster, as a different cluster is more similar. | |
| Parameters | |
| ---------- | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| labels : array, shape = [n_samples] | |
| label values for each sample | |
| metric : string, or callable | |
| The metric to use when calculating distance between instances in a | |
| feature array. If metric is a string, it must be one of the options | |
| allowed by metrics.pairwise.pairwise_distances. If X is the distance | |
| array itself, use "precomputed" as the metric. | |
| sample_size : int or None | |
| The size of the sample to use when computing the Silhouette | |
| Coefficient. If sample_size is None, no sampling is used. | |
| random_state : integer or numpy.RandomState, optional | |
| The generator used to initialize the centers. If an integer is | |
| given, it fixes the seed. Defaults to the global numpy random | |
| number generator. | |
| `**kwds` : optional keyword parameters | |
| Any further parameters are passed directly to the distance function. | |
| If using a scipy.spatial.distance metric, the parameters are still | |
| metric dependent. See the scipy docs for usage examples. | |
| Returns | |
| ------- | |
| silhouette : float | |
| Mean Silhouette Coefficient for all samples. | |
| References | |
| ---------- | |
| Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the | |
| Interpretation and Validation of Cluster Analysis". Computational | |
| and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7. | |
| http://en.wikipedia.org/wiki/Silhouette_(clustering) | |
| """ | |
| if sample_size is not None: | |
| random_state = check_random_state(random_state) | |
| indices = random_state.permutation(X.shape[0])[:sample_size] | |
| if metric == "precomputed": | |
| raise ValueError('Distance matrix cannot be precomputed') | |
| else: | |
| X, labels = X[indices], labels[indices] | |
| return np.mean(silhouette_samples_slow(X, labels, metric=metric, **kwds)) | |
| def silhouette_samples_slow(X, labels, metric='euclidean', **kwds): | |
| """Compute the Silhouette Coefficient for each sample. | |
| The Silhoeutte Coefficient is a measure of how well samples are clustered | |
| with samples that are similar to themselves. Clustering models with a high | |
| Silhouette Coefficient are said to be dense, where samples in the same | |
| cluster are similar to each other, and well separated, where samples in | |
| different clusters are not very similar to each other. | |
| The Silhouette Coefficient is calculated using the mean intra-cluster | |
| distance (a) and the mean nearest-cluster distance (b) for each sample. | |
| The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``. | |
| This function returns the Silhoeutte Coefficient for each sample. | |
| The best value is 1 and the worst value is -1. Values near 0 indicate | |
| overlapping clusters. | |
| Parameters | |
| ---------- | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| labels : array, shape = [n_samples] | |
| label values for each sample | |
| metric : string, or callable | |
| The metric to use when calculating distance between instances in a | |
| feature array. If metric is a string, it must be one of the options | |
| allowed by metrics.pairwise.pairwise_distances. If X is the distance | |
| array itself, use "precomputed" as the metric. | |
| `**kwds` : optional keyword parameters | |
| Any further parameters are passed directly to the distance function. | |
| If using a scipy.spatial.distance metric, the parameters are still | |
| metric dependent. See the scipy docs for usage examples. | |
| Returns | |
| ------- | |
| silhouette : array, shape = [n_samples] | |
| Silhouette Coefficient for each samples. | |
| References | |
| ---------- | |
| Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the | |
| Interpretation and Validation of Cluster Analysis". Computational | |
| and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7. | |
| http://en.wikipedia.org/wiki/Silhouette_(clustering) | |
| """ | |
| metric = distance_metrics()[metric] | |
| n = labels.shape[0] | |
| A = np.array([_intra_cluster_distance_slow(X, labels, metric, i) | |
| for i in range(n)]) | |
| B = np.array([_nearest_cluster_distance_slow(X, labels, metric, i) | |
| for i in range(n)]) | |
| sil_samples = (B - A) / np.maximum(A, B) | |
| # nan values are for clusters of size 1, and should be 0 | |
| return np.nan_to_num(sil_samples) | |
| def _intra_cluster_distance_slow(X, labels, metric, i): | |
| """Calculate the mean intra-cluster distance for sample i. | |
| Parameters | |
| ---------- | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| labels : array, shape = [n_samples] | |
| label values for each sample | |
| metric: function | |
| Pairwise metric function | |
| i : int | |
| Sample index being calculated. It is excluded from calculation and | |
| used to determine the current label | |
| Returns | |
| ------- | |
| a : float | |
| Mean intra-cluster distance for sample i | |
| """ | |
| indices = np.where(labels == labels[i])[0] | |
| if len(indices) == 0: | |
| return 0. | |
| a = np.mean([metric(X[i], X[j]) for j in indices if not i == j]) | |
| return a | |
| def _nearest_cluster_distance_slow(X, labels, metric, i): | |
| """Calculate the mean nearest-cluster distance for sample i. | |
| Parameters | |
| ---------- | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| labels : array, shape = [n_samples] | |
| label values for each sample | |
| metric: function | |
| Pairwise metric function | |
| i : int | |
| Sample index being calculated. It is used to determine the current | |
| label. | |
| Returns | |
| ------- | |
| b : float | |
| Mean nearest-cluster distance for sample i | |
| """ | |
| label = labels[i] | |
| b = np.min( | |
| [np.mean( | |
| [metric(X[i], X[j]) for j in np.where(labels == cur_label)[0]] | |
| ) for cur_label in set(labels) if not cur_label == label]) | |
| return b | |
| def silhouette_score_block(X, labels, metric='euclidean', sample_size=None, | |
| random_state=None, n_jobs=1, **kwds): | |
| """Compute the mean Silhouette Coefficient of all samples. | |
| The Silhouette Coefficient is calculated using the mean intra-cluster | |
| distance (a) and the mean nearest-cluster distance (b) for each sample. | |
| The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``. | |
| To clarrify, b is the distance between a sample and the nearest cluster | |
| that b is not a part of. | |
| This function returns the mean Silhoeutte Coefficient over all samples. | |
| To obtain the values for each sample, use silhouette_samples | |
| The best value is 1 and the worst value is -1. Values near 0 indicate | |
| overlapping clusters. Negative values generally indicate that a sample has | |
| been assigned to the wrong cluster, as a different cluster is more similar. | |
| Parameters | |
| ---------- | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| labels : array, shape = [n_samples] | |
| label values for each sample | |
| metric : string, or callable | |
| The metric to use when calculating distance between instances in a | |
| feature array. If metric is a string, it must be one of the options | |
| allowed by metrics.pairwise.pairwise_distances. If X is the distance | |
| array itself, use "precomputed" as the metric. | |
| sample_size : int or None | |
| The size of the sample to use when computing the Silhouette | |
| Coefficient. If sample_size is None, no sampling is used. | |
| random_state : integer or numpy.RandomState, optional | |
| The generator used to initialize the centers. If an integer is | |
| given, it fixes the seed. Defaults to the global numpy random | |
| number generator. | |
| `**kwds` : optional keyword parameters | |
| Any further parameters are passed directly to the distance function. | |
| If using a scipy.spatial.distance metric, the parameters are still | |
| metric dependent. See the scipy docs for usage examples. | |
| Returns | |
| ------- | |
| silhouette : float | |
| Mean Silhouette Coefficient for all samples. | |
| References | |
| ---------- | |
| Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the | |
| Interpretation and Validation of Cluster Analysis". Computational | |
| and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7. | |
| http://en.wikipedia.org/wiki/Silhouette_(clustering) | |
| """ | |
| if sample_size is not None: | |
| random_state = check_random_state(random_state) | |
| indices = random_state.permutation(X.shape[0])[:sample_size] | |
| if metric == "precomputed": | |
| raise ValueError('Distance matrix cannot be precomputed') | |
| else: | |
| X, labels = X[indices], labels[indices] | |
| return np.mean(silhouette_samples_block( | |
| X, labels, metric=metric, n_jobs=n_jobs, **kwds)) | |
| def silhouette_samples_block(X, labels, metric='euclidean', n_jobs=1, **kwds): | |
| """Compute the Silhouette Coefficient for each sample. | |
| The Silhoeutte Coefficient is a measure of how well samples are clustered | |
| with samples that are similar to themselves. Clustering models with a high | |
| Silhouette Coefficient are said to be dense, where samples in the same | |
| cluster are similar to each other, and well separated, where samples in | |
| different clusters are not very similar to each other. | |
| The Silhouette Coefficient is calculated using the mean intra-cluster | |
| distance (a) and the mean nearest-cluster distance (b) for each sample. | |
| The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``. | |
| This function returns the Silhoeutte Coefficient for each sample. | |
| The best value is 1 and the worst value is -1. Values near 0 indicate | |
| overlapping clusters. | |
| Parameters | |
| ---------- | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| labels : array, shape = [n_samples] | |
| label values for each sample | |
| metric : string, or callable | |
| The metric to use when calculating distance between instances in a | |
| feature array. If metric is a string, it must be one of the options | |
| allowed by metrics.pairwise.pairwise_distances. If X is the distance | |
| array itself, use "precomputed" as the metric. | |
| `**kwds` : optional keyword parameters | |
| Any further parameters are passed directly to the distance function. | |
| If using a scipy.spatial.distance metric, the parameters are still | |
| metric dependent. See the scipy docs for usage examples. | |
| Returns | |
| ------- | |
| silhouette : array, shape = [n_samples] | |
| Silhouette Coefficient for each samples. | |
| References | |
| ---------- | |
| Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the | |
| Interpretation and Validation of Cluster Analysis". Computational | |
| and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7. | |
| http://en.wikipedia.org/wiki/Silhouette_(clustering) | |
| """ | |
| A = _intra_cluster_distances_block(X, labels, metric, n_jobs=n_jobs, | |
| **kwds) | |
| B = _nearest_cluster_distance_block(X, labels, metric, n_jobs=n_jobs, | |
| **kwds) | |
| sil_samples = (B - A) / np.maximum(A, B) | |
| # nan values are for clusters of size 1, and should be 0 | |
| return np.nan_to_num(sil_samples) | |
| def _intra_cluster_distances_block_(subX, metric, **kwds): | |
| distances = pairwise_distances(subX, metric=metric, **kwds) | |
| return distances.sum(axis=1) / (distances.shape[0] - 1) | |
| def _intra_cluster_distances_block(X, labels, metric, n_jobs=1, **kwds): | |
| """Calculate the mean intra-cluster distance for sample i. | |
| Parameters | |
| ---------- | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| labels : array, shape = [n_samples] | |
| label values for each sample | |
| metric : string, or callable | |
| The metric to use when calculating distance between instances in a | |
| feature array. If metric is a string, it must be one of the options | |
| allowed by metrics.pairwise.pairwise_distances. If X is the distance | |
| array itself, use "precomputed" as the metric. | |
| `**kwds` : optional keyword parameters | |
| Any further parameters are passed directly to the distance function. | |
| If using a scipy.spatial.distance metric, the parameters are still | |
| metric dependent. See the scipy docs for usage examples. | |
| Returns | |
| ------- | |
| a : array [n_samples_a] | |
| Mean intra-cluster distance | |
| """ | |
| intra_dist = np.zeros(labels.size, dtype=float) | |
| values = Parallel(n_jobs=n_jobs)( | |
| delayed(_intra_cluster_distances_block_) | |
| (X[np.where(labels == label)[0]], metric, **kwds) | |
| for label in np.unique(labels)) | |
| for label, values_ in zip(np.unique(labels), values): | |
| intra_dist[np.where(labels == label)[0]] = values_ | |
| return intra_dist | |
| def _nearest_cluster_distance_block_(subX_a, subX_b, metric, **kwds): | |
| dist = pairwise_distances(subX_a, subX_b, metric=metric, **kwds) | |
| dist_a = dist.mean(axis=1) | |
| dist_b = dist.mean(axis=0) | |
| return dist_a, dist_b | |
| def _nearest_cluster_distance_block(X, labels, metric, n_jobs=1, **kwds): | |
| """Calculate the mean nearest-cluster distance for sample i. | |
| Parameters | |
| ---------- | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| labels : array, shape = [n_samples] | |
| label values for each sample | |
| metric : string, or callable | |
| The metric to use when calculating distance between instances in a | |
| feature array. If metric is a string, it must be one of the options | |
| allowed by metrics.pairwise.pairwise_distances. If X is the distance | |
| array itself, use "precomputed" as the metric. | |
| `**kwds` : optional keyword parameters | |
| Any further parameters are passed directly to the distance function. | |
| If using a scipy.spatial.distance metric, the parameters are still | |
| metric dependent. See the scipy docs for usage examples. | |
| X : array [n_samples_a, n_features] | |
| Feature array. | |
| Returns | |
| ------- | |
| b : float | |
| Mean nearest-cluster distance for sample i | |
| """ | |
| inter_dist = np.empty(labels.size, dtype=float) | |
| inter_dist.fill(np.inf) | |
| # Compute cluster distance between pairs of clusters | |
| unique_labels = np.unique(labels) | |
| values = Parallel(n_jobs=n_jobs)( | |
| delayed(_nearest_cluster_distance_block_)( | |
| X[np.where(labels == label_a)[0]], | |
| X[np.where(labels == label_b)[0]], | |
| metric, **kwds) | |
| for label_a, label_b in combinations(unique_labels, 2)) | |
| for (label_a, label_b), (values_a, values_b) in \ | |
| zip(combinations(unique_labels, 2), values): | |
| indices_a = np.where(labels == label_a)[0] | |
| inter_dist[indices_a] = np.minimum(values_a, inter_dist[indices_a]) | |
| del indices_a | |
| indices_b = np.where(labels == label_b)[0] | |
| inter_dist[indices_b] = np.minimum(values_b, inter_dist[indices_b]) | |
| del indices_b | |
| return inter_dist | |
| if __name__ == '__main__': | |
| import time | |
| from sklearn.metrics.cluster.unsupervised import silhouette_score | |
| np.random.seed(0) | |
| X = np.random.random((10000, 100)) | |
| y = np.repeat(np.arange(100), 100) | |
| t0 = time.time() | |
| s = silhouette_score(X, y) | |
| t = time.time() - t0 | |
| print 'Scikit silhouette (%fs): %f' % (t, s) | |
| t0 = time.time() | |
| s = silhouette_score_block(X, y) | |
| t = time.time() - t0 | |
| print 'Block silhouette (%fs): %f' % (t, s) | |
| t0 = time.time() | |
| s = silhouette_score_block(X, y, n_jobs=2) | |
| t = time.time() - t0 | |
| print 'Block silhouette parallel (%fs): %f' % (t, s) |
For those interested as @AlexandreAbraham stated scikit-learn is faster now:
sklearn 1.0.2: 1.54 s ± 67.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
this impl.: 2.28 s ± 131 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using more jobs helps a bit but still does not beat scikit-learn. Bummer ... I am still looking for something faster when dealing with 1.5B samples.
Hey @paulgavrikov, I did not look hard nor test but there are implementations of silhouette in numba lying around. Could it help?
https://gist.github.com/mangecoeur/f7c419506bb009d69c20565e2b6f7887
@AlexandreAbraham thanks for the link! There's something wrong with the numba part there. I'll try to fix it. Cheers!
@paulgavrikov maybe try using Simplified Silhouette (it is way much faster) and, in general, provides quite good results. You can find a descripition here: https://dl.acm.org/doi/abs/10.1145/2484838.2484844.
This code is a bit old, to say the least. Feel free to use scikit learn, its implementation should be able to scale now, this code is not needed anymore.