Last active
April 27, 2022 00:44
-
-
Save sithu/e3a1df21413b59053a2c42f0a86892a5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import make_blobs | |
import matplotlib.pyplot as plt | |
from sklearn.cluster import KMeans | |
import numpy as np | |
from matplotlib import cm | |
from sklearn.metrics import silhouette_samples | |
import pandas as pd | |
from scipy.spatial.distance import pdist, squareform | |
from scipy.cluster.hierarchy import linkage | |
from scipy.cluster.hierarchy import dendrogram | |
from sklearn.cluster import AgglomerativeClustering | |
from sklearn.datasets import make_moons | |
from sklearn.cluster import DBSCAN | |
# Grouping objects by similarity using k-means | |
## K-means clustering using scikit-learn | |
# 2-D dataset of ramdon 150 points example | |
X, y = make_blobs(n_samples=150, | |
n_features=2, | |
centers=3, | |
cluster_std=0.5, | |
shuffle=True, | |
random_state=0) | |
plt.scatter(X[:, 0], X[:, 1], | |
c='white', marker='o', edgecolor='black', s=50) | |
plt.grid() | |
plt.tight_layout() | |
plt.show() | |
km = KMeans(n_clusters=3, | |
init='random', | |
n_init=10, | |
max_iter=300, | |
tol=1e-04, | |
random_state=0) | |
y_km = km.fit_predict(X) | |
plt.scatter(X[y_km == 0, 0], | |
X[y_km == 0, 1], | |
s=50, c='lightgreen', | |
marker='s', edgecolor='black', | |
label='cluster 1') | |
plt.scatter(X[y_km == 1, 0], | |
X[y_km == 1, 1], | |
s=50, c='orange', | |
marker='o', edgecolor='black', | |
label='cluster 2') | |
plt.scatter(X[y_km == 2, 0], | |
X[y_km == 2, 1], | |
s=50, c='lightblue', | |
marker='v', edgecolor='black', | |
label='cluster 3') | |
plt.scatter(km.cluster_centers_[:, 0], | |
km.cluster_centers_[:, 1], | |
s=250, marker='*', | |
c='red', edgecolor='black', | |
label='centroids') | |
plt.legend(scatterpoints=1) | |
plt.grid() | |
plt.tight_layout() | |
plt.show() | |
# ## A smarter way of placing the initial cluster centroids using k-means++ | |
# ## Hard versus soft clustering | |
# ## Using the elbow method to find the optimal number of clusters | |
print('Distortion: %.2f' % km.inertia_) | |
distortions = [] | |
for i in range(1, 11): | |
km = KMeans(n_clusters=i, | |
init='k-means++', | |
n_init=10, | |
max_iter=300, | |
random_state=0) | |
km.fit(X) | |
distortions.append(km.inertia_) | |
plt.plot(range(1, 11), distortions, marker='o') | |
plt.xlabel('Number of clusters') | |
plt.ylabel('Distortion') | |
plt.tight_layout() | |
plt.show() | |
# ## Quantifying the quality of clustering via silhouette plots | |
km = KMeans(n_clusters=3, | |
init='k-means++', | |
n_init=10, | |
max_iter=300, | |
tol=1e-04, | |
random_state=0) | |
y_km = km.fit_predict(X) | |
cluster_labels = np.unique(y_km) | |
n_clusters = cluster_labels.shape[0] | |
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean') | |
y_ax_lower, y_ax_upper = 0, 0 | |
yticks = [] | |
for i, c in enumerate(cluster_labels): | |
c_silhouette_vals = silhouette_vals[y_km == c] | |
c_silhouette_vals.sort() | |
y_ax_upper += len(c_silhouette_vals) | |
color = cm.jet(float(i) / n_clusters) | |
plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, | |
edgecolor='none', color=color) | |
yticks.append((y_ax_lower + y_ax_upper) / 2.) | |
y_ax_lower += len(c_silhouette_vals) | |
silhouette_avg = np.mean(silhouette_vals) | |
plt.axvline(silhouette_avg, color="red", linestyle="--") | |
plt.yticks(yticks, cluster_labels + 1) | |
plt.ylabel('Cluster') | |
plt.xlabel('Silhouette coefficient') | |
plt.tight_layout() | |
plt.show() | |
# Comparison to "bad" clustering: | |
km = KMeans(n_clusters=2, | |
init='k-means++', | |
n_init=10, | |
max_iter=300, | |
tol=1e-04, | |
random_state=0) | |
y_km = km.fit_predict(X) | |
plt.scatter(X[y_km == 0, 0], | |
X[y_km == 0, 1], | |
s=50, | |
c='lightgreen', | |
edgecolor='black', | |
marker='s', | |
label='cluster 1') | |
plt.scatter(X[y_km == 1, 0], | |
X[y_km == 1, 1], | |
s=50, | |
c='orange', | |
edgecolor='black', | |
marker='o', | |
label='cluster 2') | |
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], | |
s=250, marker='*', c='red', label='centroids') | |
plt.legend() | |
plt.grid() | |
plt.tight_layout() | |
plt.show() | |
cluster_labels = np.unique(y_km) | |
n_clusters = cluster_labels.shape[0] | |
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean') | |
y_ax_lower, y_ax_upper = 0, 0 | |
yticks = [] | |
for i, c in enumerate(cluster_labels): | |
c_silhouette_vals = silhouette_vals[y_km == c] | |
c_silhouette_vals.sort() | |
y_ax_upper += len(c_silhouette_vals) | |
color = cm.jet(float(i) / n_clusters) | |
plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, | |
edgecolor='none', color=color) | |
yticks.append((y_ax_lower + y_ax_upper) / 2.) | |
y_ax_lower += len(c_silhouette_vals) | |
silhouette_avg = np.mean(silhouette_vals) | |
plt.axvline(silhouette_avg, color="red", linestyle="--") | |
plt.yticks(yticks, cluster_labels + 1) | |
plt.ylabel('Cluster') | |
plt.xlabel('Silhouette coefficient') | |
plt.tight_layout() | |
plt.show() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment