Skip to content

Instantly share code, notes, and snippets.

@Joelfranklin96
Last active November 19, 2019 01:25
Show Gist options
  • Save Joelfranklin96/e22a36622a094f0789c29b7f068e447e to your computer and use it in GitHub Desktop.
Save Joelfranklin96/e22a36622a094f0789c29b7f068e447e to your computer and use it in GitHub Desktop.
Elbow method of K-means clustering algorithm
import numpy as np # Import the necessary packages.
import pandas as pd
epsilon = list(range(4)) # Initialisation of epsilon which would store cost function for each value of 'k' after final iteration.
for k in range(2,6): # Number of clusters
cluster = pd.read_csv('/kaggle/input/clustering/k-means clustering.csv') # Read data file into 'cluster'
rows = cluster.shape[0] # 'rows' contains the total number of rows in cluster data.
cols = cluster.shape[1] # 'cols' contains the total number of columns in cluster data.
centroids = cluster.loc[np.random.randint(1,rows+1,k)] # Randomly initialises 'k' no. of centroids.
centroids['new'] = list(range(1,k+1)) # New indices 1 to k are set for the dataframe 'centroids'.
centroids.set_index('new',inplace = True)
d = np.random.rand(rows) # Initialization of 'd' which would contain the centroid number closest to data point.
number_of_iterations = 15
temp_epsilon = list(range(number_of_iterations)) # 'temp_epsilon' is the sum of squares of distances between points and centroid of a cluster for each iteration.
for i in range(0,number_of_iterations): # This 'for' loop is for iterations.
for j in range(0,rows): # This 'for' loop finds the centroid number closest to the data point.
d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin()
cluster['centroid number'] = d # A new column 'centroid number' is added to dataframe 'cluster'.
mean_x = list(range(k)) # Initialisation of 'mean_x' which will store mean of 'x' values of each cluster.
mean_y = list(range(k)) # Initialisation of 'mean_y' which will store mean of 'y' values of each cluster.
for m in range(0,k): # This 'for' loop calculates mean of 'x' and 'y' values of each cluster.
mean_x[m] = cluster[cluster['centroid number'] == (m+1)]['x'].mean()
mean_y[m] = cluster[cluster['centroid number'] == (m+1)]['y'].mean()
centroids.replace(list(centroids['x']),mean_x,inplace = True) # The 'centroids' are replaced with the new values.
centroids.replace(list(centroids['y']),mean_y,inplace = True)
z = list(range(k)) # Initialisation of z and centroid of each cluster.
for p in range(0,k): # This 'for' loop calculates square of distances between data points and centroid of each cluster.
z[p] = ((cluster[cluster['centroid number'] == p+1][['x','y']] - centroids.iloc[p])**2).values.sum()
temp_epsilon[i] = sum(z)
epsilon[k-2] = temp_epsilon[i] # The cost function after final iteration for each value of 'k' would be stored in epsilon.
%reset_selective -f centroids # The dataframe 'centroids' is reset.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment