Created
April 24, 2021 15:55
-
-
Save creotiv/633a17cf6953e9934a744a2de454f652 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import make_blobs | |
from collections import defaultdict | |
# Build a synthetic 2-D data set: `n_samples` points drawn around
# `n_components` Gaussian blob centres (fixed seed, so runs are repeatable).
n_samples = 4000
n_components = 4
X, y_true = make_blobs(
    n_samples=n_samples,
    centers=n_components,
    cluster_std=0.99,
    random_state=0,
)

# Plot the ground-truth labelling on figure 1, one colour per true blob.
plt.figure(1)
colors = ['#4EACC5', '#FF9C34', '#4E9A06', 'm']
for k, col in enumerate(colors):
    members = X[y_true == k]  # rows whose true label is k
    plt.scatter(members[:, 0], members[:, 1], c=col, marker='.', s=10)
def print_cluster(x, clusters, centroids=None):
    """Scatter-plot every cluster in its own colour and show the figure.

    Args:
        x: 2-D array-like of points; only columns 0 and 1 are plotted.
        clusters: Mapping of cluster label -> list of row indices into ``x``.
            Labels only need to be mutually sortable; they do not have to
            be the integers ``0..n-1``.
        centroids: Optional 2-D array-like of centroid coordinates. When
            given, each centroid is drawn as a black cross.

    Returns:
        None. The plot is drawn on matplotlib figure 1 and displayed with
        ``plt.show()``.
    """
    x = np.asarray(x)
    plt.figure(1)
    # One evenly spaced rainbow colour per cluster.
    palette = plt.cm.rainbow(np.linspace(0, 1, len(clusters)))
    # Pair colours with the sorted labels instead of assuming the keys are
    # exactly 0..n-1; for such keys the colour assignment is unchanged.
    for label, shade in zip(sorted(clusters), palette):
        members = clusters[label]
        plt.scatter(x[members, 0], x[members, 1],
                    color=shade, marker='.', s=10)
    if centroids is not None:
        centroids = np.asarray(centroids)
        plt.scatter(centroids[:, 0], centroids[:, 1],
                    color='black', marker='x', s=10)
    plt.show()
def euclid(x, y):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x: Coordinates of the first point; anything ``np.asarray`` accepts
            (ndarray, list, tuple or scalar).
        y: Coordinates of the second point; must broadcast against ``x``.

    Returns:
        The square root of the sum of squared element-wise differences,
        as a NumPy floating-point scalar.
    """
    # Coerce first so plain lists/tuples work: subtracting two Python
    # sequences directly raises TypeError.
    delta = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.sum(delta ** 2))
def kmeans(X, num_clusters=4, toll=0.1, distance=euclid, max_iter=100):
    """Cluster the rows of ``X`` with an iterative k-means procedure.

    Starting centroids are drawn uniformly at random from the bounding box
    of the data. Each iteration recomputes every centroid as the mean of
    its members, reassigns all points to their nearest centroid, plots the
    result with ``print_cluster`` and prints the sum of squared errors.
    Iteration stops once the SSE changes by less than ``toll`` or after
    ``max_iter`` rounds.

    Args:
        X: 2-D array-like of shape (n_points, n_features).
        num_clusters: Number of clusters to fit.
        toll: Convergence tolerance on the absolute change in SSE between
            two consecutive iterations.
        distance: Callable ``distance(point, centroid)`` returning a
            scalar; used both for assignment and for the SSE.
        max_iter: Upper bound on the number of update iterations.

    Returns:
        A ``(centroids, clusters)`` tuple: ``centroids`` is an array with
        one row per cluster, ``clusters`` maps each cluster index to the
        list of row indices of ``X`` assigned to it.
    """
    X = np.asarray(X)

    def init_centroids(x, count):
        # Sample `count` points uniformly inside the data's bounding box.
        lo = np.min(x, axis=0)
        hi = np.max(x, axis=0)
        _, dims = x.shape
        return np.array([lo + np.random.rand(dims) * (hi - lo)
                         for _ in range(count)])

    def assign_to_clusters(x, centroids):
        # Map every centroid index to the rows of `x` that are nearest to it.
        clusters = {i: [] for i in range(len(centroids))}
        for row, point in enumerate(x):
            nearest, best = -1, np.inf
            for i, centre in enumerate(centroids):
                d = distance(point, centre)
                if d < best:  # strict '<': ties go to the lowest index
                    best, nearest = d, i
            clusters[nearest].append(row)
        return clusters

    def sse(x, clusters, centroids):
        # Sum of squared distances from each point to its own centroid.
        total = 0
        for label, rows in clusters.items():
            centre = centroids[label]
            for point in x[rows]:
                total += distance(point, centre) ** 2
        return total

    centroids = init_centroids(X, num_clusters)
    clusters = assign_to_clusters(X, centroids)
    prev_err = 0
    for _ in range(max_iter):
        # The mean of an empty cluster is NaN, and a NaN centroid can never
        # win a point again (every comparison with NaN is False). Keep the
        # previous position for clusters that currently own no points.
        centroids = np.array([
            np.mean(X[clusters[k]], axis=0) if clusters[k] else centroids[k]
            for k in range(num_clusters)
        ])
        clusters = assign_to_clusters(X, centroids)
        print_cluster(X, clusters, centroids)
        err = sse(X, clusters, centroids)
        if abs(prev_err - err) < toll:
            print('No change. Breaked')
            break
        prev_err = err
        print('SSE: %s' % err)
    return centroids, clusters
# Run the clustering demo on the generated blobs, asking for four clusters.
kmeans(X, num_clusters=4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment