Skip to content

Instantly share code, notes, and snippets.

@briverse17
Created December 14, 2019 17:49
Show Gist options
  • Save briverse17/ad19abded1c140dcc1bf280080cd6383 to your computer and use it in GitHub Desktop.
Save briverse17/ad19abded1c140dcc1bf280080cd6383 to your computer and use it in GitHub Desktop.
class k_medoids:
def __init__(self, k = 2, max_iter = 300, has_converged = False):
'''
Class constructor
Parameters
----------
- k: number of clusters.
- max_iter: number of times centroids will move
- has_converged: to check if the algorithm stop or not
'''
self.k = k
self.max_iter = max_iter
self.has_converged = has_converged
self.medoids_cost = []
def initMedoids(self, X):
'''
Parameters
----------
X: input data.
'''
self.medoids = []
#Starting medoids will be random members from data set X
indexes = np.random.randint(0, len(X)-1,self.k)
self.medoids = X[indexes]
for i in range(0,self.k):
self.medoids_cost.append(0)
def isConverged(self, new_medoids):
'''
Parameters
----------
new_medoids: the recently calculated medoids to be compared with the current medoids stored in the class
'''
return set([tuple(x) for x in self.medoids]) == set([tuple(x) for x in new_medoids])
def updateMedoids(self, X, labels):
'''
Parameters
----------
labels: a list contains labels of data points
'''
self.has_converged = True
#Store data points to the current cluster they belong to
clusters = []
for i in range(0,self.k):
cluster = []
for j in range(len(X)):
if (labels[j] == i):
cluster.append(X[j])
clusters.append(cluster)
#Calculate the new medoids
new_medoids = []
for i in range(0, self.k):
new_medoid = self.medoids[i]
old_medoids_cost = self.medoids_cost[i]
for j in range(len(clusters[i])):
#Cost of the current data points to be compared with the current optimal cost
cur_medoids_cost = 0
for dpoint_index in range(len(clusters[i])):
cur_medoids_cost += euclideanDistance(clusters[i][j], clusters[i][dpoint_index])
#If current cost is less than current optimal cost,
#make the current data point new medoid of the cluster
if cur_medoids_cost < old_medoids_cost:
new_medoid = clusters[i][j]
old_medoids_cost = cur_medoids_cost
#Now we have the optimal medoid of the current cluster
new_medoids.append(new_medoid)
#If not converged yet, accept the new medoids
if not self.isConverged(new_medoids):
self.medoids = new_medoids
self.has_converged = False
def fit(self, X):
'''
FIT function, used to find clusters
Parameters
----------
X: input data.
'''
self.initMedoids(X)
for i in range(self.max_iter):
#Labels for this iteration
cur_labels = []
for medoid in range(0,self.k):
#Dissimilarity cost of the current cluster
self.medoids_cost[medoid] = 0
for k in range(len(X)):
#Distances from a data point to each of the medoids
d_list = []
for j in range(0,self.k):
d_list.append(euclideanDistance(self.medoids[j], X[k]))
#Data points' label is the medoid which has minimal distance to it
cur_labels.append(d_list.index(min(d_list)))
self.medoids_cost[medoid] += min(d_list)
self.updateMedoids(X, cur_labels)
if self.has_converged:
break
return np.array(self.medoids)
def predict(self,data):
'''
Parameters
----------
data: input data.
Returns:
----------
pred: list cluster indexes of input data
'''
pred = []
for i in range(len(data)):
#Distances from a data point to each of the medoids
d_list = []
for j in range(len(self.medoids)):
d_list.append(euclideanDistance(self.medoids[j],data[i]))
pred.append(d_list.index(min(d_list)))
return np.array(pred)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment