Elbow method of K-means clustering algorithm
import numpy as np                     # Import the necessary packages.
import pandas as pd

epsilon = list(range(4))               # 'epsilon' stores the cost function after the final iteration, one entry per value of 'k' (k = 2 to 5).

for k in range(2, 6):                  # Number of clusters.
    cluster = pd.read_csv('/kaggle/input/clustering/k-means clustering.csv')  # Read the data file into 'cluster'.
    rows = cluster.shape[0]            # 'rows' contains the total number of rows in the cluster data.
    cols = cluster.shape[1]            # 'cols' contains the total number of columns in the cluster data.
    centroids = cluster.loc[np.random.choice(rows, k, replace=False)]  # Randomly pick 'k' distinct data points as the initial centroids.
    centroids['new'] = list(range(1, k + 1))   # New indices 1 to k are set for the dataframe 'centroids'.
    centroids.set_index('new', inplace=True)
    d = np.random.rand(rows)           # Initialisation of 'd', which will hold the centroid number closest to each data point.
    number_of_iterations = 15
    temp_epsilon = list(range(number_of_iterations))  # 'temp_epsilon' holds the sum of squared distances between points and their cluster centroid for each iteration.
    for i in range(number_of_iterations):      # This 'for' loop runs the iterations.
        for j in range(rows):                  # This 'for' loop finds the centroid number closest to each data point.
            d[j] = ((centroids - cluster.loc[j, ['x', 'y']])**2).sum(axis=1).idxmin()
        cluster['centroid number'] = d         # A new column 'centroid number' is added to the dataframe 'cluster'.
        mean_x = list(range(k))                # Initialisation of 'mean_x', which will store the mean of the 'x' values of each cluster.
        mean_y = list(range(k))                # Initialisation of 'mean_y', which will store the mean of the 'y' values of each cluster.
        for m in range(k):                     # This 'for' loop calculates the mean of the 'x' and 'y' values of each cluster.
            mean_x[m] = cluster[cluster['centroid number'] == (m + 1)]['x'].mean()
            mean_y[m] = cluster[cluster['centroid number'] == (m + 1)]['y'].mean()
        centroids['x'] = mean_x                # The centroids are replaced with the new cluster means.
        centroids['y'] = mean_y
        z = list(range(k))                     # Initialisation of 'z', which holds the within-cluster sum of squares for each cluster.
        for p in range(k):                     # This 'for' loop calculates the squared distances between the data points and the centroid of each cluster.
            z[p] = ((cluster[cluster['centroid number'] == p + 1][['x', 'y']] - centroids.iloc[p])**2).values.sum()
        temp_epsilon[i] = sum(z)
    epsilon[k - 2] = temp_epsilon[i]           # The cost function after the final iteration for each value of 'k' is stored in 'epsilon'.
    del centroids                              # The dataframe 'centroids' is discarded before the next value of 'k'.
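With the cost function stored in 'epsilon' for k = 2 to 5, the elbow can be read off a plot of cost against k. A minimal plotting sketch, assuming matplotlib is available in the environment; 'epsilon' is the list produced by the loop above.

import matplotlib.pyplot as plt

plt.plot(range(2, 6), epsilon, marker='o')     # Cost function for k = 2, 3, 4, 5.
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of squared distances (cost)')
plt.title('Elbow method')
plt.show()                                     # The 'elbow' (the k where the curve flattens) is the suggested number of clusters.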