Skip to content

Instantly share code, notes, and snippets.

@adityajn105
Created July 8, 2018 21:05
Show Gist options
  • Save adityajn105/c675e90fc5e365f9fcac19e543094a10 to your computer and use it in GitHub Desktop.
Save adityajn105/c675e90fc5e365f9fcac19e543094a10 to your computer and use it in GitHub Desktop.
MeanShift algorithm for clustering of data.
"""
Author : Aditya Jain
Contact: [email protected]
"""
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
import random
from sklearn.datasets.samples_generator import make_blobs
#create random sample data
X,y = make_blobs(n_samples = 50,centers=3,n_features=2)
#X = np.array([[1,2],[1.5,1.8],[5,8],[8,8],[1,0.6],[9,11],[8,2],[10,2],[9,3]])
colors = 10*['g','r','c','b','k']
class MeanShift(object):
def __init__(self,radius=None,radius_norm_step=100):
self.radius=radius
self.radius_norm_step=radius_norm_step
def fit(self,data):
#selecting a suitable radius
if self.radius==None:
all_data_centroid = np.average(data,axis=0)
all_data_norm = np.linalg.norm(all_data_centroid)
self.radius = all_data_norm / self.radius_norm_step
centroids = {}
for i in range(len(data)):
centroids[i] = data[i]
while True:
new_centroids = []
for i in centroids:
in_bandwidth = []
centroid = centroids[i]
#weights = [99,98.....0]
weights = [i for i in range(self.radius_norm_step)][::-1]
for featureset in data:
distance = np.linalg.norm(featureset-centroid)
if distance==0:
distance = 0.00000000001
#deciding weight_index for giving weight
weight_index = int(distance/self.radius)
if weight_index>self.radius_norm_step-1:
weight_index = self.radius_norm_step-1
#the data with lower weight_index will have higher entries in to_bandwidth (maybe 99**2 entries in some case)
#this will give more weightage to that feature
to_add = (weights[weight_index]**2)*[featureset]
in_bandwidth += to_add
new_centroid = np.average(in_bandwidth,axis=0)
new_centroids.append(tuple(new_centroid))
uniques = sorted(list(set(new_centroids)))
#remove duplicate or near centroid from list of centroid list entry
to_pop = []
for i in uniques:
for ii in uniques:
if i == ii:
pass
elif np.linalg.norm(np.array(i)-np.array(ii))<=self.radius:
to_pop.append(ii)
break
for i in to_pop:
try:
uniques.remove(i)
except:
pass
#copy to new variable
prev_centroids = dict(centroids)
centroids = {}
for i in range(len(uniques)):
centroids[i] = np.array(uniques[i])
optimized=True
for i in centroids:
if not np.array_equal(centroids[i],prev_centroids[i]):
optimized= False
if not optimized:
break
if optimized:
break
self.centroids = centroids
self.classifications = {}
for i in range(len(self.centroids)):
self.classifications[i] = []
for featureset in data:
distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in centroids]
classification = distances.index(min(distances))
self.classifications[classification].append(featureset)
def predict(self,data):
distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in centroids]
classification = distance.index(min(distances))
return classification
clf = MeanShift()
clf.fit(X)
for centroid in clf.centroids:
plt.scatter(clf.centroids[centroid][0],clf.centroids[centroid][1],
marker="*",color="k",s=150)
for classification in clf.classifications:
color = colors[classification]
for featureset in clf.classifications[classification]:
plt.scatter(featureset[0],featureset[1],marker="x",
color=color,s=150,linewidths=5)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment