Joelfranklin96 · November 19, 2019 01:25
diff --git a/gistfile1.py b/gistfile1.py
 import numpy as np # Import the necessary packages.
 import pandas as pd 

 epsilon = list(range(4)) # Initialisation of epsilon which would store cost function for each value of 'k' after final iteration.

 for k in range(2,6): # Number of clusters
    
    cluster = pd.read_csv('/kaggle/input/clustering/k-means clustering.csv') # Read data file into 'cluster'
    
    rows = cluster.shape[0] # 'rows' contains the total number of rows in cluster data.
    cols = cluster.shape[1] # 'cols' contains the total number of columns in cluster data.
  
    centroids = cluster.loc[np.random.randint(1,rows+1,k)] # Randomly initialises 'k' no. of centroids.
    centroids['new'] = list(range(1,k+1)) # New indices 1 to k are set for the dataframe 'centroids'.
    centroids.set_index('new',inplace = True) 
    d = np.random.rand(rows) # Initialization of 'd' which would contain the centroid number closest to data point.

    number_of_iterations = 15
    temp_epsilon = list(range(number_of_iterations)) # 'temp_epsilon' is the sum of squares of distances between points and centroid of a cluster for each iteration.

    for i in range(0,number_of_iterations): # This 'for' loop is for iterations.

        for j in range(0,rows): # This 'for' loop finds the centroid number closest to the data point.
            d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin()
        cluster['centroid number'] = d # A new column 'centroid number' is added to dataframe 'cluster'.

        mean_x = list(range(k)) # Initialisation of 'mean_x' which will store mean of 'x' values of each cluster.
        mean_y = list(range(k)) # Initialisation of 'mean_y' which will store mean of 'y' values of each cluster.
        for m in range(0,k): # This 'for' loop calculates mean of 'x' and 'y' values of each cluster.
            mean_x[m] = cluster[cluster['centroid number'] == (m+1)]['x'].mean()
            mean_y[m] = cluster[cluster['centroid number'] == (m+1)]['y'].mean()
        centroids.replace(list(centroids['x']),mean_x,inplace = True) # The 'centroids' are replaced with the new values.
        centroids.replace(list(centroids['y']),mean_y,inplace = True)
    
        z = list(range(k)) # Initialisation of z  and centroid of each cluster.
        for p in range(0,k): # This 'for' loop calculates square of distances between data points and centroid of each cluster.
            z[p] = ((cluster[cluster['centroid number'] == p+1][['x','y']] - centroids.iloc[p])**2).values.sum()
        temp_epsilon[i] = sum(z) 
        
        epsilon[k-2] = temp_epsilon[i] # The cost function after final iteration for each value of 'k' would be stored in epsilon.
    
    %reset_selective -f centroids # The dataframe 'centroids' is reset.
	import numpy as np # Import the necessary packages.
	import pandas as pd

	epsilon = list(range(4)) # Initialisation of epsilon which would store cost function for each value of 'k' after final iteration.

	for k in range(2,6): # Number of clusters

	cluster = pd.read_csv('/kaggle/input/clustering/k-means clustering.csv') # Read data file into 'cluster'

	rows = cluster.shape[0] # 'rows' contains the total number of rows in cluster data.
	cols = cluster.shape[1] # 'cols' contains the total number of columns in cluster data.

	centroids = cluster.loc[np.random.randint(1,rows+1,k)] # Randomly initialises 'k' no. of centroids.
	centroids['new'] = list(range(1,k+1)) # New indices 1 to k are set for the dataframe 'centroids'.
	centroids.set_index('new',inplace = True)
	d = np.random.rand(rows) # Initialization of 'd' which would contain the centroid number closest to data point.

	number_of_iterations = 15
	temp_epsilon = list(range(number_of_iterations)) # 'temp_epsilon' is the sum of squares of distances between points and centroid of a cluster for each iteration.

	for i in range(0,number_of_iterations): # This 'for' loop is for iterations.

	for j in range(0,rows): # This 'for' loop finds the centroid number closest to the data point.
	d[j] = ((centroids - cluster.loc[j])**2).sum(axis = 1).idxmin()
	cluster['centroid number'] = d # A new column 'centroid number' is added to dataframe 'cluster'.

	mean_x = list(range(k)) # Initialisation of 'mean_x' which will store mean of 'x' values of each cluster.
	mean_y = list(range(k)) # Initialisation of 'mean_y' which will store mean of 'y' values of each cluster.
	for m in range(0,k): # This 'for' loop calculates mean of 'x' and 'y' values of each cluster.
	mean_x[m] = cluster[cluster['centroid number'] == (m+1)]['x'].mean()
	mean_y[m] = cluster[cluster['centroid number'] == (m+1)]['y'].mean()
	centroids.replace(list(centroids['x']),mean_x,inplace = True) # The 'centroids' are replaced with the new values.
	centroids.replace(list(centroids['y']),mean_y,inplace = True)

	z = list(range(k)) # Initialisation of z and centroid of each cluster.
	for p in range(0,k): # This 'for' loop calculates square of distances between data points and centroid of each cluster.
	z[p] = ((cluster[cluster['centroid number'] == p+1][['x','y']] - centroids.iloc[p])**2).values.sum()
	temp_epsilon[i] = sum(z)

	epsilon[k-2] = temp_epsilon[i] # The cost function after final iteration for each value of 'k' would be stored in epsilon.

	%reset_selective -f centroids # The dataframe 'centroids' is reset.