fmarthoz · July 7, 2021 05:00
diff --git a/Simple k-means b/Simple k-means
 from sklearn.preprocessing import StandardScaler
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 import random
 import seaborn as sns
 from sklearn.preprocessing import StandardScaler

 # We define two helper functions: one returning the centroid of a set of points, the other the distance between two points

 def centroid(df):
     """Return the mean of ``df`` as a NumPy array.

     For a pandas DataFrame, ``df.mean()`` averages each column, so the
     result holds one value per column: the centroid of the rows.
     """
     column_means = df.mean()
     return column_means.values

 def distance(p1, p2):
     """Return the Euclidean distance between the points ``p1`` and ``p2``.

     Both arguments are converted with ``np.asarray`` before subtracting,
     so lists, tuples, arrays and pandas Series are all accepted.
     """
     delta = np.asarray(p1) - np.asarray(p2)
     squared_total = np.sum(delta ** 2)
     return np.sqrt(squared_total)

 # STEP 0: Scale the data.
 # NOTE(review): X is not defined in this snippet; this line assumes it is a
 # numeric DataFrame whose scaled values fit into its first two columns
 # (i.e. X has exactly two columns) -- confirm against the caller.
 X.iloc[:, 0:2] = StandardScaler().fit_transform(X)

 # STEP 1: choose the number k of clusters.
 k = 3

 # STEP 2: select k random rows of the data as the initial centroids.
 # The explicit copy makes the in-place centroid updates below operate on an
 # independent frame rather than on the result of a selection from X.
 centroids = X.sample(k).copy()

 # Very simple stop criterion: a fixed number of iterations.
 max_iter = 10
 # X is not modified inside the loop, so its values are extracted only once
 # instead of calling iterrows() for every centroid on every iteration.
 points = X.to_numpy()
 for n in range(max_iter):
     # d[i][j] is the distance from the j-th row of X to the i-th centroid.
     d = np.asarray([
         [distance(row, center) for row in points]
         for center in centroids.to_numpy()
     ])

     # Assign each point to its nearest centroid.
     clusters = list(np.argmin(d, axis=0))

     # Recompute each centroid as the mean of the points assigned to it.
     # The mask is a plain NumPy array, so rows are selected by position; a
     # boolean Series would be aligned on index labels and break whenever X
     # does not carry a default 0..n-1 index.
     labels = np.asarray(clusters)
     for m in range(k):
         members = X[labels == m]
         # The mean of an empty cluster is NaN, which would corrupt every
         # later distance; keep the previous centroid in that case.
         if len(members) > 0:
             centroids.iloc[m, :] = centroid(members)
	from sklearn.preprocessing import StandardScaler
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	import random
	import seaborn as sns
	from sklearn.preprocessing import StandardScaler

	# We define two helper functions: one returning the centroid of a set of points, the other the distance between two points

	def centroid(df):
	    """Return the mean of ``df`` as a NumPy array.

	    For a pandas DataFrame, ``df.mean()`` averages each column, so the
	    result holds one value per column: the centroid of the rows.
	    """
	    return df.mean().values

	def distance(p1, p2):
	    """Return the Euclidean distance between the points ``p1`` and ``p2``.

	    Both arguments are converted with ``np.asarray`` before subtracting,
	    so lists, tuples, arrays and pandas Series are all accepted.
	    """
	    p1 = np.asarray(p1)
	    p2 = np.asarray(p2)
	    return np.sqrt(np.sum((p1 - p2) ** 2))

	# STEP 0: Scale the data.
	# NOTE(review): X is not defined in this snippet; this line assumes it is a
	# numeric DataFrame whose scaled values fit into its first two columns
	# (i.e. X has exactly two columns) -- confirm against the caller.
	X.iloc[:, 0:2] = StandardScaler().fit_transform(X)

	# STEP 1: choose the number k of clusters.
	k = 3

	# STEP 2: select k random rows of the data as the initial centroids.
	# The explicit copy makes the in-place centroid updates below operate on an
	# independent frame rather than on the result of a selection from X.
	centroids = X.sample(k).copy()

	# Very simple stop criterion: a fixed number of iterations.
	max_iter = 10
	# X is not modified inside the loop, so its values are extracted only once
	# instead of calling iterrows() for every centroid on every iteration.
	points = X.to_numpy()
	for n in range(max_iter):
	    # d[i][j] is the distance from the j-th row of X to the i-th centroid.
	    d = np.asarray([
	        [distance(row, center) for row in points]
	        for center in centroids.to_numpy()
	    ])

	    # Assign each point to its nearest centroid.
	    clusters = list(np.argmin(d, axis=0))

	    # Recompute each centroid as the mean of the points assigned to it.
	    # The mask is a plain NumPy array, so rows are selected by position; a
	    # boolean Series would be aligned on index labels and break whenever X
	    # does not carry a default 0..n-1 index.
	    labels = np.asarray(clusters)
	    for m in range(k):
	        members = X[labels == m]
	        # The mean of an empty cluster is NaN, which would corrupt every
	        # later distance; keep the previous centroid in that case.
	        if len(members) > 0:
	            centroids.iloc[m, :] = centroid(members)