Created
March 19, 2010 20:39
-
-
Save talhasyed/338149 to your computer and use it in GitHub Desktop.
Just trying out K-Means Clustering by following a blog post about it
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'pp' | |
def k_means_cluster(clusters=[[]]) | |
# pp clusters | |
clusters_with_centroids = clusters.inject({}) do |centroids, cluster| | |
cluster_centroid = (cluster.inject(0.0) {|item, sum| sum+=item})/cluster.size | |
centroids[cluster_centroid] = cluster | |
centroids | |
end | |
clusters_with_centroids.each do |centroid, cluster| | |
cluster.each do |node| | |
cluster_key_to_assign_to = find_nearest_cluster(node, clusters_with_centroids.keys) | |
nearest_cluster = clusters_with_centroids[cluster_key_to_assign_to] | |
unless nearest_cluster==cluster | |
clusters_with_centroids[centroid].delete(node) | |
clusters_with_centroids[cluster_key_to_assign_to] << node | |
next | |
end | |
end | |
end | |
if clusters_with_centroids.values.sort == clusters | |
return clusters | |
else | |
return k_means_cluster(clusters_with_centroids.values.sort) | |
end | |
end | |
def find_nearest_cluster(node, clusters_with_centroids) | |
nearest = clusters_with_centroids.first | |
clusters_with_centroids[1..clusters_with_centroids.length].each do |centroid| | |
nearest = centroid if ((centroid - node).abs <= (nearest-node).abs) | |
end | |
nearest | |
end | |
def initial_clusters(data=[], num_clusters=5) | |
initial_coords = inital_cluster_coords(data.size, num_clusters) | |
initial_clusters = [] | |
prev_index = 0 | |
initial_coords.each_with_index do |coord, i| | |
cluster = data[prev_index..coord] | |
prev_index=coord+1 | |
initial_clusters << cluster | |
end | |
initial_clusters | |
end | |
def inital_cluster_coords(data_size, num_clusters=5) | |
raise(Exception, "Data Size #{data_size} can't be less than the number of clusters #{num_clusters}") if num_clusters > data_size | |
cluster_coords = [] | |
while (cluster_coords.size+1) < num_clusters | |
coord = (rand(data_size)) | |
cluster_coords << coord unless (cluster_coords.member?(coord) || coord==data_size-1) | |
end | |
cluster_coords << data_size-1 | |
cluster_coords.sort | |
end | |
Data_Size = 10 | |
Clusters = 8 | |
# data = [1, 1, 1, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 100] | |
data = [] | |
(1..Data_Size).each do |i| | |
num = (rand(7)+1) | |
data << num | |
end | |
data.sort! | |
initial_clusters = initial_clusters(data, Clusters) | |
new_clusters = k_means_cluster(initial_clusters) | |
pp initial_clusters | |
pp new_clusters | |
# Pick number of clusters (k) | |
# Make k points (they're going to be the centroids) | |
# Randomize all these points location | |
# Calculate Euclidean distance from each point to all centroids | |
# Assign 'membership' of each point to the nearest centroid | |
# Establish the new centroids by averageing locations of all points belonging to a given cluster | |
# Goto 4 Until convergence is achieved, or changes made are irrelevant. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment