Last active
November 23, 2019 09:30
-
-
Save Joelfranklin96/e61e51c5ddf4d3593fed0cd127a378fb to your computer and use it in GitHub Desktop.
K-means clustering
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Necessary packages are imported.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
def assign_to_nearest_centroid(points, centroids):
    """Return the label of the closest centroid for every row of *points*.

    Parameters
    ----------
    points : pandas.DataFrame
        Data points; must contain every column present in *centroids*.
    centroids : pandas.DataFrame
        One row per centroid. Its columns are the feature columns and its
        index holds the centroid labels.

    Returns
    -------
    numpy.ndarray
        One centroid label per row of *points*, in row order. Ties go to the
        centroid that appears first in *centroids*.
    """
    feature_columns = list(centroids.columns)
    point_matrix = points[feature_columns].to_numpy(dtype=float)
    centroid_matrix = centroids.to_numpy(dtype=float)
    # Broadcast to (n_points, n_centroids, n_features) so all squared
    # Euclidean distances are computed in one vectorised step.
    differences = point_matrix[:, None, :] - centroid_matrix[None, :, :]
    squared_distances = (differences ** 2).sum(axis=2)
    return centroids.index.to_numpy()[squared_distances.argmin(axis=1)]


def recompute_centroids(points, labels, centroids):
    """Return new centroids placed at the mean of their assigned points.

    Parameters
    ----------
    points : pandas.DataFrame
        Data points; must contain every column present in *centroids*.
    labels : array-like
        Centroid label of each row of *points*, in row order.
    centroids : pandas.DataFrame
        Current centroids (feature columns, labels in the index).

    Returns
    -------
    pandas.DataFrame
        Same index and columns as *centroids*. A centroid with no assigned
        points keeps its previous position instead of becoming NaN.
    """
    feature_columns = list(centroids.columns)
    cluster_means = points[feature_columns].groupby(np.asarray(labels)).mean()
    # Reindexing introduces NaN rows for empty clusters; those rows are then
    # filled from the previous centroid positions.
    return cluster_means.reindex(centroids.index).fillna(centroids)


def within_cluster_sum_of_squares(points, labels, centroids):
    """Return the total squared distance between points and their centroids.

    Parameters
    ----------
    points : pandas.DataFrame
        Data points; must contain every column present in *centroids*.
    labels : array-like
        Centroid label of each row of *points*, in row order.
    centroids : pandas.DataFrame
        Centroids (feature columns, labels in the index).

    Returns
    -------
    float
        Sum over all points of the squared Euclidean distance to the centroid
        named by the point's label.
    """
    feature_columns = list(centroids.columns)
    assigned_centroids = centroids.loc[np.asarray(labels)].to_numpy(dtype=float)
    residuals = points[feature_columns].to_numpy(dtype=float) - assigned_centroids
    return float((residuals ** 2).sum())


# The driver runs only when the file is executed as a script (or in a
# notebook cell), so the helpers above can be imported without side effects.
if __name__ == "__main__":
    # Loading of data set into 'cluster'.
    cluster = pd.read_csv('k-means clustering.csv')

    k = 3  # Number of clusters
    number_of_iterations = 5
    features = ['x', 'y']  # Columns used for distances, means and plotting.
    rows = cluster.shape[0]  # Total number of rows in the cluster data.
    cols = cluster.shape[1]  # Total number of columns in the cluster data.

    # Pick 'k' distinct data points as the initial centroids. sample() draws
    # without replacement from the rows that actually exist, so there are no
    # out-of-range labels and no duplicate starting centroids.
    centroids = cluster[features].sample(n=k).astype(float)
    centroids.index = pd.RangeIndex(1, k + 1, name='new')  # Labels 1 to k.

    # 'epsilon' collects, per iteration, the sum of squared distances between
    # the points and the (updated) centroid of their cluster.
    epsilon = []

    for i in range(number_of_iterations):
        # Assign every data point to its closest centroid.
        cluster['centroid number'] = assign_to_nearest_centroid(cluster, centroids)

        # One subplot per iteration, coloured by the current assignment.
        plt.subplots_adjust(bottom=0.1, right=2, top=0.9)
        plt.subplot(1, number_of_iterations, i + 1)
        sns.scatterplot(x='x', y='y', data=cluster, hue='centroid number', legend='full')
        plt.legend(bbox_to_anchor=(1, 1), loc=4, borderaxespad=0.5)

        # Move each centroid to the mean of its cluster. Columns are assigned
        # directly rather than through value-based replace(), which could
        # overwrite unrelated cells holding a coincidentally equal value.
        centroids = recompute_centroids(cluster, cluster['centroid number'], centroids)

        epsilon.append(
            within_cluster_sum_of_squares(cluster, cluster['centroid number'], centroids)
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment