Skip to content

Instantly share code, notes, and snippets.

@creotiv
Created April 24, 2021 15:55
Show Gist options
  • Save creotiv/633a17cf6953e9934a744a2de454f652 to your computer and use it in GitHub Desktop.
Save creotiv/633a17cf6953e9934a744a2de454f652 to your computer and use it in GitHub Desktop.
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
# Generate sample data: isotropic Gaussian blobs with known labels.
n_samples = 4000
n_components = 4
X, y_true = make_blobs(n_samples=n_samples,
                       centers=n_components,
                       cluster_std=0.99,
                       random_state=0)

# Plot the reference blobs, one fixed colour per label in ``y_true``.
plt.figure(1)
colors = ['#4EACC5', '#FF9C34', '#4E9A06', 'm']
for k, col in enumerate(colors):
    cluster_data = y_true == k  # boolean mask selecting the points labelled k
    plt.scatter(X[cluster_data, 0], X[cluster_data, 1],
                c=col, marker='.', s=10)
# Show the reference plot now: print_cluster() also draws on figure 1, so
# without this call the first k-means iteration would be painted on top of
# the reference scatter and the reference plot would never be seen on its own.
plt.show()
def print_cluster(x, clusters, centroids=None):
    """Scatter-plot every cluster in its own colour and show the figure.

    Args:
        x: Array of points; columns 0 and 1 are used as plot coordinates.
        clusters: Mapping from cluster label to the list of row indices of
            ``x`` that belong to that cluster.
        centroids: Optional array of cluster centres. When given, columns 0
            and 1 are drawn as black crosses on top of the points.
    """
    plt.figure(1)
    # One evenly spaced rainbow colour per cluster.
    palette = plt.cm.rainbow(np.linspace(0, 1, len(clusters)))
    # Pair colours with clusters in mapping order, so labels do not have to
    # be the consecutive integers 0..n-1 (indexing by position raised
    # KeyError for any other labelling).
    for members, col in zip(clusters.values(), palette):
        plt.scatter(x[members, 0], x[members, 1],
                    color=col, marker='.', s=10)
    if centroids is not None:
        plt.scatter(centroids[:, 0], centroids[:, 1],
                    color='black', marker='x', s=10)
    plt.show()
def euclid(x, y):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x: Array-like coordinates of the first point.
        y: Array-like coordinates of the second point; must be
            broadcast-compatible with ``x``.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    # Convert to float arrays first so that plain lists/tuples are accepted
    # and unsigned-integer inputs cannot wrap around in the subtraction.
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.sqrt(np.sum(diff ** 2))
def kmeans(X, num_clusters=4, toll=0.1, distance=euclid, max_iter=100):
    """Cluster the rows of ``X`` with k-means, plotting every iteration.

    Centroids start at uniformly random positions inside the bounding box of
    the data. Each iteration moves every centroid to the mean of its assigned
    points, reassigns all points to their nearest centroid, plots the result
    with ``print_cluster`` and computes the sum of squared errors (SSE).
    The loop stops as soon as the SSE changes by less than ``toll`` between
    two consecutive iterations, or after ``max_iter`` iterations; otherwise
    the SSE of the iteration is printed.

    Args:
        X: 2-D array-like with one point per row.
        num_clusters: Number of centroids to fit.
        toll: Convergence tolerance on the absolute SSE change.
        distance: Callable ``distance(point, centroid)`` returning a scalar.
        max_iter: Upper bound on the number of update iterations.
    """
    # Index-list lookups and ``.shape`` below need an ndarray.
    X = np.asarray(X)

    def get_centroids(x, count):
        """Draw ``count`` random points inside the bounding box of ``x``."""
        lo = np.min(x, axis=0)
        hi = np.max(x, axis=0)
        _, dims = x.shape
        return np.array([lo + np.random.rand(dims) * (hi - lo)
                         for _ in range(count)])

    def assign_to_clusters(x, centroids):
        """Map each centroid index to the row indices of its nearest points."""
        clusters = {i: [] for i in range(len(centroids))}
        for row, point in enumerate(x):
            nearest = -1
            best = np.inf
            for i, centre in enumerate(centroids):
                d = distance(point, centre)
                # Strict '<' keeps the lowest-index centroid on ties.
                if d < best:
                    best = d
                    nearest = i
            clusters[nearest].append(row)
        return clusters

    def sse(x, clusters, centroids):
        """Sum the squared distances of all points to their own centroid."""
        return sum(distance(point, centroids[label]) ** 2
                   for label, members in clusters.items()
                   for point in x[members])

    centroids = get_centroids(X, num_clusters)
    clusters = assign_to_clusters(X, centroids)
    prev_err = 0
    for _ in range(max_iter):
        new_centroids = []
        for k in range(num_clusters):
            members = clusters[k]
            if members:
                new_centroids.append(np.mean(X[members], axis=0))
            else:
                # An empty cluster has no mean: np.mean would yield NaN, and
                # a NaN centroid can never win a '<' comparison again. Keep
                # the previous position so the centroid stays usable.
                new_centroids.append(centroids[k])
        centroids = np.array(new_centroids)
        clusters = assign_to_clusters(X, centroids)
        print_cluster(X, clusters, centroids)
        err = sse(X, clusters, centroids)
        if abs(prev_err - err) < toll:
            print('No change. Breaked')
            break
        prev_err = err
        print('SSE: %s' % err)
# Cluster the generated blobs; reuse n_components so the number of fitted
# clusters always matches the number of blobs that were generated.
kmeans(X, n_components)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment