Created February 16, 2018 13:14
Low-memory, good-performance implementation of k-means silhouette samples using numba
import numpy as np

from sklearn.utils import check_X_y
from sklearn.preprocessing import LabelEncoder
# Note: private sklearn helper; this module was renamed in later
# scikit-learn releases, so the import path may need updating.
from sklearn.metrics.cluster.unsupervised import check_number_of_labels
from numba import jit

@jit(nogil=True, parallel=True)
def euclidean_distances_numba(X, Y=None, Y_norm_squared=None):
    # Input validation from sklearn's euclidean_distances is deliberately
    # skipped for speed; callers must pass well-shaped float arrays.
    XX_ = (X * X).sum(axis=1)
    XX = XX_.reshape((-1, 1))  # column vector of squared row norms of X
    if X is Y:  # shortcut in the common case euclidean_distances(X, X)
        YY = XX.T
    elif Y_norm_squared is not None:
        YY = Y_norm_squared
    else:
        YY_ = np.sum(Y * Y, axis=1)
        YY = YY_.reshape((1, -1))
    # ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2, computed via one matrix product
    distances = np.dot(X, Y.T)
    distances *= -2
    distances += XX
    distances += YY
    # clamp small negative values caused by floating-point round-off
    distances = np.maximum(distances, 0)
    return np.sqrt(distances)
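
The function relies on the expansion ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2, which turns all pairwise distances into a single matrix product. A quick sanity check, assuming scikit-learn is installed and a numba version that accepts these functions (older numba may fall back to object mode here), is to compare against sklearn's reference implementation on random data:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
A = rng.rand(50, 8)
B = rng.rand(40, 8)
# results should agree up to floating-point round-off
assert np.allclose(euclidean_distances_numba(A, B), euclidean_distances(A, B))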

@jit(parallel=True)
def euclidean_distances_sum(X, Y=None):
    if Y is None:
        Y = X
    Y_norm_squared = (Y ** 2).sum(axis=1)
    sums = np.zeros(len(X))
    for i in range(len(X)):
        # Compute one row of the distance matrix at a time, so the full
        # n_X x n_Y matrix is never held in memory.
        base_row = X[i, :]
        sums[i] = euclidean_distances_numba(base_row.reshape(1, -1), Y,
                                            Y_norm_squared=Y_norm_squared).sum()
    return sums

@jit(parallel=True)
def euclidean_distances_mean(X, Y=None):
    if Y is None:
        Y = X
    Y_norm_squared = (Y ** 2).sum(axis=1)
    means = np.zeros(len(X))
    for i in range(len(X)):
        # Row-at-a-time mean distance, mirroring euclidean_distances_sum.
        base_row = X[i, :]
        means[i] = euclidean_distances_numba(base_row.reshape(1, -1), Y,
                                             Y_norm_squared=Y_norm_squared).mean()
    return means
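
Both helpers trade a Python-level loop for memory: instead of materialising the full n x n distance matrix (for 100,000 float64 samples that would be roughly 80 GB), they hold only one row of it at a time. A minimal usage sketch:

X = np.random.rand(1000, 16)
row_sums = euclidean_distances_sum(X)              # shape (1000,): sum of distances from each point to all points
row_means = euclidean_distances_mean(X, X[:100])   # mean distance from each point to the first 100 points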

def silhouette_samples_memory_saving(X, labels, metric='euclidean', **kwds):
    # Note: only the euclidean metric is implemented; `metric` and **kwds
    # are accepted for API compatibility with sklearn but are ignored.
    X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    check_number_of_labels(len(le.classes_), X.shape[0])
    unique_labels = le.classes_
    n_samples_per_label = np.bincount(labels, minlength=len(unique_labels))

    # For sample i, store the mean distance to the other members of its
    # own cluster in intra_clust_dists[i].
    intra_clust_dists = np.zeros(X.shape[0], dtype=X.dtype)

    # For sample i, store the mean distance to the nearest other cluster
    # in inter_clust_dists[i].
    inter_clust_dists = np.inf + intra_clust_dists

    for curr_label in range(len(unique_labels)):
        # Find intra_clust_dists for all samples belonging to the same label.
        mask = labels == curr_label

        # Leave out the current sample itself.
        n_samples_curr_lab = n_samples_per_label[curr_label] - 1
        if n_samples_curr_lab != 0:
            intra_clust_dists[mask] = euclidean_distances_sum(X[mask, :]) / n_samples_curr_lab

        # Now iterate over all other labels, finding the mean
        # cluster distance that is closest for every sample.
        for other_label in range(len(unique_labels)):
            if other_label != curr_label:
                other_mask = labels == other_label
                other_distances = euclidean_distances_mean(X[mask, :], X[other_mask, :])
                inter_clust_dists[mask] = np.minimum(inter_clust_dists[mask], other_distances)

    sil_samples = inter_clust_dists - intra_clust_dists
    sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
    # Assign a score of 0 to clusters of size 1, per the silhouette paper.
    sil_samples[n_samples_per_label.take(labels) == 1] = 0
    return sil_samples
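
A usage sketch, assuming a scikit-learn version where the import path at the top still exists: fit any clustering, then pass the data and labels directly. Since the algorithm mirrors sklearn's own, the result should match sklearn.metrics.silhouette_samples up to floating-point noise.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

X = np.random.rand(5000, 10)
labels = KMeans(n_clusters=8, random_state=0).fit_predict(X)

sil = silhouette_samples_memory_saving(X, labels)
# should agree, within round-off, with silhouette_samples(X, labels).mean()
print(sil.mean())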
Hi! I'm trying to use this module in a Jupyter notebook, but I keep hitting NameErrors around the use of numpy as np. Is it meant to be used without importing numpy anywhere within the module (that is, assuming the environment it runs in has already imported numpy as np)?