Created
February 24, 2019 23:05
-
-
Save aerinkim/71ad7571e2224236a7d44460e8840497 to your computer and use it in GitHub Desktop.
K-means Python Implementation from scratch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random

import numpy as np
from sklearn import datasets
def Kmeans(X, K):
    """Cluster the rows of X into K groups with Lloyd's algorithm.

    Parameters
    ----------
    X : array-like of shape (m, d)
        Samples to cluster; each row is one d-dimensional point.
    K : int
        Number of clusters (must satisfy K <= m).

    Returns
    -------
    X_centroid : dict
        Maps sample index i -> index of the cluster assigned to X[i].
    C : dict
        Maps cluster index j -> [centroid coordinates, member count].
    """
    m = len(X)
    X_centroid = {}  # sample index -> assigned cluster index
    C = {}           # cluster index -> [centroid coordinates, member count]
    # NOTE: the original called dict.fromkeys() and discarded the result
    # (a no-op); plain empty dicts are sufficient.

    # 1. Randomly initialize K centroids from K distinct samples.
    rand = random.sample(range(m), K)
    for j in range(K):
        C[j] = [np.array(X[rand[j]], dtype=float), 0]

    # 2. Iterate until convergence, capped at 1000 rounds.
    for _ in range(1000):
        # 3. Assignment step. This dominates the cost: O(m * K * d),
        # where d = len(X[0]).
        for i in range(m):
            dist = [np.linalg.norm(X[i] - C[j][0], 2) for j in range(K)]
            X_centroid[i] = np.argmin(dist)

        # 4. Update step: accumulate coordinate sums and counts in a
        # single pass over the data.
        sums_counts = [[np.zeros(len(X[0])), 0] for _ in range(K)]
        for i in range(m):
            sums_counts[X_centroid[i]][0] += X[i]
            sums_counts[X_centroid[i]][1] += 1

        # Snapshot the current centroids BEFORE overwriting them.
        # (The original did `old_C = C`, which aliases the same dict, so
        # the equality test always succeeded and the loop terminated
        # prematurely after the second iteration.)
        old_centroids = [np.array(C[j][0], dtype=float) for j in range(K)]

        for j in range(K):
            coord_sum, count = sums_counts[j]
            if count == 0:
                # Empty cluster: keep its previous centroid rather than
                # dividing by zero.
                C[j] = [old_centroids[j], 0]
            else:
                C[j] = [coord_sum / count, count]

        # Early termination: stop once no centroid moved.
        if all(np.array_equal(old_centroids[j], C[j][0]) for j in range(K)):
            break

    return X_centroid, C
if __name__ == '__main__':
    np.random.seed(5)

    # Use the iris measurements (150 samples, 4 features) as input.
    iris = datasets.load_iris()
    X = iris.data
    K = 3

    # Run the hand-written implementation above.
    X_centroid, C = Kmeans(X, K)
    print("From Scratch: ", C)

    # Compare against scikit-learn's reference implementation.
    from sklearn.cluster import KMeans
    fitted = KMeans(n_clusters=K).fit(X)
    print("From SKlearn: ", fitted.cluster_centers_)
    # They match!
"""
<Note : Algorithm Properties>
1. K-means results are sensitive to initialization.
2. Complexity is O( m * K * I * d )
– m = number of points, K = number of clusters, I = number of iterations, d = number of features.
3. Guarantee: Monotonically decreases the objective function ("dist" variable in the code) until convergence.
4. Scalability : Every iteration is linear in the size of its input.
5. K-means is a special case of EM clustering. Think about it!
6. There is no accepted method to discover k!!!
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment