Last active
August 31, 2024 18:15
-
-
Save jmquintana79/923989bad9bd0419857c977a01dc8933 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.metrics import silhouette_samples | |
from sklearn.cluster import KMeans | |
import matplotlib.pyplot as plt | |
import numpy as np | |
## quantification of clustering quality via silhouette metric | |
def quantification_clustering_quality(X: np.ndarray, y_km: np.ndarray, verbose: bool = False) -> list:
    """
    Quantify clustering quality via the silhouette metric.

    The silhouette value of every record is computed with the Euclidean
    metric and then summarised (mean, std) per cluster and over all records.

    X -- Array of features used to estimate the clustering.
    y_km -- Labels returned by the clustering method to be evaluated
            (any array-like; it is converted to an array internally).
    verbose -- Display or not extra information (default, False).
    return -- List of [name, mean, std] rows: one "C<label>" row per cluster
              (in the order given by np.unique) followed by a final "ALL" row.
    """
    # Coerce the labels to an array so the boolean masks below also work when
    # a plain list is passed (list == scalar evaluates to a single bool, which
    # would select nothing and yield NaN statistics).
    labels = np.asarray(y_km)
    # estimate silhouette values (for all records)
    silhouette_vals = silhouette_samples(X, labels, metric='euclidean')
    # (name, values) groups: one per cluster label, then the whole data set
    groups = [(f"C{c}", silhouette_vals[labels == c]) for c in np.unique(labels)]
    groups.append(("ALL", silhouette_vals))
    # summarise each group, computing mean/std only once per group
    statistics = []
    for name, vals in groups:
        mean, std = np.mean(vals), np.std(vals)
        statistics.append([name, mean, std])
        # display
        if verbose:
            print(name, mean, std)
    return statistics
## estimate the optimal number of clusters for k-Means algorithm | |
def estimate_optimal_num_clusters_kmeans(X: np.ndarray, max_num_clusters: int = 10, with_plot: bool = False) -> int:
    """
    Estimate the optimal number of clusters for k-Means algorithm.

    k-Means is fitted for every number of clusters from 2 up to
    max_num_clusters (inclusive) and the candidate with the highest overall
    mean silhouette value is selected.

    X -- Array of features used to estimate the clustering.
    max_num_clusters -- Maximum number of clusters to be tested (must be >= 2).
    with_plot -- Plot or not extra information (default, False).
    return -- The most optimal number of clusters.
    raises -- ValueError if max_num_clusters is lower than 2.
    """
    # fail early with a clear message instead of an IndexError on empty results
    if max_num_clusters < 2:
        raise ValueError(f"max_num_clusters must be >= 2, got {max_num_clusters}")
    # number of clusters to be tested
    num_clusters_to_test = np.arange(2, max_num_clusters + 1, 1)
    # collect scores in a list (cheaper than growing an array with np.append)
    scores = []
    # loop of number of clusters
    for nc in num_clusters_to_test:
        ## k-means clustering definition, fit and prediction
        km = KMeans(n_clusters=nc, init='k-means++', n_init=10, max_iter=300, tol=1e-04, random_state=0)
        y_km = km.fit_predict(X)
        ## cluster quantification: the last row is the "ALL" summary and
        ## index 1 of that row is its mean silhouette value
        scores.append(quantification_clustering_quality(X, y_km)[-1][1])
    metrics = np.array(scores)
    # sort num of cluster according to metric values (descending)
    num_clusters_sorted_according_metric = num_clusters_to_test[np.argsort(metrics)[::-1]]
    # get the most optimal, as a plain int to honour the declared return type
    num_cluster_optimal = int(num_clusters_sorted_according_metric[0])
    # with plot if it is required
    if with_plot:
        ## plot
        plt.plot(num_clusters_to_test, metrics, color="grey")
        plt.scatter(num_clusters_to_test, metrics,
                    c='white', marker='o', edgecolor='black', s=25)
        plt.axvline(num_cluster_optimal, color="red", linestyle="--")
        plt.grid()
        plt.title(f"Optimal Number of Clusters = {num_cluster_optimal}")
        plt.xlabel("Number of clusters")
        plt.ylabel('Silhouette coefficient')
        # adjust the layout only after title and labels exist so they are
        # taken into account
        plt.tight_layout()
        plt.show()
    # return
    return num_cluster_optimal
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment