Last active
July 7, 2021 05:00
-
-
Save fmarthoz/7c936a0dfeb51cbc47e8757fc27d5f38 to your computer and use it in GitHub Desktop.
For a Medium article
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.preprocessing import StandardScaler | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import random | |
import seaborn as sns | |
from sklearn.preprocessing import StandardScaler | |
# We define two helper functions: one returning the centroid of a set of points, and one returning the Euclidean distance between two points
def centroid(df):
    """Return the mean of ``df`` as a NumPy array.

    ``df`` is expected to be a pandas object exposing ``.mean()``; for a
    DataFrame with one row per point this is the column-wise mean, i.e. the
    centroid of the points.
    """
    column_means = df.mean()
    return column_means.values
def distance(p1, p2):
    """Return the Euclidean distance between the points ``p1`` and ``p2``.

    Both arguments are converted with ``np.asarray``, so lists, tuples,
    NumPy arrays and pandas Series are all accepted.
    """
    delta = np.asarray(p1) - np.asarray(p2)
    squared_sum = np.sum(delta ** 2)
    return np.sqrt(squared_sum)
# STEP 0: Scale the data
# NOTE(review): X is not defined in this snippet; it is assumed to be an
# all-numeric pandas DataFrame created earlier -- confirm against the caller.
# Every column is standardized (instead of a hard-coded 0:2 slice) so the
# script also works when X has more than two features. X is updated in place.
X[X.columns.tolist()] = StandardScaler().fit_transform(X)

# STEP 1: we choose a number k of clusters
k = 3

# STEP 2: Select k random points from the data as centroids.
# The explicit copy keeps later centroid updates from touching (or warning
# about) the original rows of X.
centroids = X.sample(k).copy()

# Using a very simple stop criterion: a fixed number of iterations
# (named n_iter so the builtin iter() is not shadowed).
n_iter = 10
points = X.to_numpy()
for _ in range(n_iter):
    # For each centroid we calculate the distance to every row:
    # d[i, j] is the distance between centroid i and point j.
    d = np.asarray(
        [[distance(p, c) for p in points] for c in centroids.to_numpy()]
    )

    # We assign each point to the nearest centroid
    clusters = list(np.argmin(d, axis=0))

    # We recalculate the centroids.
    # A positional boolean array is used as the mask: a pd.Series mask would
    # be aligned on index labels and break when X has a non-default index.
    labels = np.asarray(clusters)
    for m in range(k):
        members = X[labels == m]
        # An empty cluster has no mean (it would be NaN and poison every
        # later distance), so its previous centroid is kept.
        if len(members) > 0:
            centroids.iloc[m, :] = centroid(members)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment