Created
July 8, 2018 21:05
-
-
Save adityajn105/c675e90fc5e365f9fcac19e543094a10 to your computer and use it in GitHub Desktop.
MeanShift algorithm for clustering of data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Author : Aditya Jain | |
Contact: [email protected] | |
""" | |
import matplotlib.pyplot as plt | |
from matplotlib import style | |
style.use('ggplot') | |
import numpy as np | |
import random | |
from sklearn.datasets.samples_generator import make_blobs | |
#create random sample data | |
X,y = make_blobs(n_samples = 50,centers=3,n_features=2) | |
#X = np.array([[1,2],[1.5,1.8],[5,8],[8,8],[1,0.6],[9,11],[8,2],[10,2],[9,3]]) | |
colors = 10*['g','r','c','b','k'] | |
class MeanShift(object): | |
def __init__(self,radius=None,radius_norm_step=100): | |
self.radius=radius | |
self.radius_norm_step=radius_norm_step | |
def fit(self,data): | |
#selecting a suitable radius | |
if self.radius==None: | |
all_data_centroid = np.average(data,axis=0) | |
all_data_norm = np.linalg.norm(all_data_centroid) | |
self.radius = all_data_norm / self.radius_norm_step | |
centroids = {} | |
for i in range(len(data)): | |
centroids[i] = data[i] | |
while True: | |
new_centroids = [] | |
for i in centroids: | |
in_bandwidth = [] | |
centroid = centroids[i] | |
#weights = [99,98.....0] | |
weights = [i for i in range(self.radius_norm_step)][::-1] | |
for featureset in data: | |
distance = np.linalg.norm(featureset-centroid) | |
if distance==0: | |
distance = 0.00000000001 | |
#deciding weight_index for giving weight | |
weight_index = int(distance/self.radius) | |
if weight_index>self.radius_norm_step-1: | |
weight_index = self.radius_norm_step-1 | |
#the data with lower weight_index will have higher entries in to_bandwidth (maybe 99**2 entries in some case) | |
#this will give more weightage to that feature | |
to_add = (weights[weight_index]**2)*[featureset] | |
in_bandwidth += to_add | |
new_centroid = np.average(in_bandwidth,axis=0) | |
new_centroids.append(tuple(new_centroid)) | |
uniques = sorted(list(set(new_centroids))) | |
#remove duplicate or near centroid from list of centroid list entry | |
to_pop = [] | |
for i in uniques: | |
for ii in uniques: | |
if i == ii: | |
pass | |
elif np.linalg.norm(np.array(i)-np.array(ii))<=self.radius: | |
to_pop.append(ii) | |
break | |
for i in to_pop: | |
try: | |
uniques.remove(i) | |
except: | |
pass | |
#copy to new variable | |
prev_centroids = dict(centroids) | |
centroids = {} | |
for i in range(len(uniques)): | |
centroids[i] = np.array(uniques[i]) | |
optimized=True | |
for i in centroids: | |
if not np.array_equal(centroids[i],prev_centroids[i]): | |
optimized= False | |
if not optimized: | |
break | |
if optimized: | |
break | |
self.centroids = centroids | |
self.classifications = {} | |
for i in range(len(self.centroids)): | |
self.classifications[i] = [] | |
for featureset in data: | |
distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in centroids] | |
classification = distances.index(min(distances)) | |
self.classifications[classification].append(featureset) | |
def predict(self,data): | |
distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in centroids] | |
classification = distance.index(min(distances)) | |
return classification | |
clf = MeanShift() | |
clf.fit(X) | |
for centroid in clf.centroids: | |
plt.scatter(clf.centroids[centroid][0],clf.centroids[centroid][1], | |
marker="*",color="k",s=150) | |
for classification in clf.classifications: | |
color = colors[classification] | |
for featureset in clf.classifications[classification]: | |
plt.scatter(featureset[0],featureset[1],marker="x", | |
color=color,s=150,linewidths=5) | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment