Skip to content

Instantly share code, notes, and snippets.

@talhasyed
Created March 19, 2010 20:39
Show Gist options
  • Save talhasyed/338149 to your computer and use it in GitHub Desktop.
Save talhasyed/338149 to your computer and use it in GitHub Desktop.
Just trying out K-Means Clustering by following a blog post about it
require 'pp'
def k_means_cluster(clusters=[[]])
# pp clusters
clusters_with_centroids = clusters.inject({}) do |centroids, cluster|
cluster_centroid = (cluster.inject(0.0) {|item, sum| sum+=item})/cluster.size
centroids[cluster_centroid] = cluster
centroids
end
clusters_with_centroids.each do |centroid, cluster|
cluster.each do |node|
cluster_key_to_assign_to = find_nearest_cluster(node, clusters_with_centroids.keys)
nearest_cluster = clusters_with_centroids[cluster_key_to_assign_to]
unless nearest_cluster==cluster
clusters_with_centroids[centroid].delete(node)
clusters_with_centroids[cluster_key_to_assign_to] << node
next
end
end
end
if clusters_with_centroids.values.sort == clusters
return clusters
else
return k_means_cluster(clusters_with_centroids.values.sort)
end
end
def find_nearest_cluster(node, clusters_with_centroids)
nearest = clusters_with_centroids.first
clusters_with_centroids[1..clusters_with_centroids.length].each do |centroid|
nearest = centroid if ((centroid - node).abs <= (nearest-node).abs)
end
nearest
end
def initial_clusters(data=[], num_clusters=5)
initial_coords = inital_cluster_coords(data.size, num_clusters)
initial_clusters = []
prev_index = 0
initial_coords.each_with_index do |coord, i|
cluster = data[prev_index..coord]
prev_index=coord+1
initial_clusters << cluster
end
initial_clusters
end
def inital_cluster_coords(data_size, num_clusters=5)
raise(Exception, "Data Size #{data_size} can't be less than the number of clusters #{num_clusters}") if num_clusters > data_size
cluster_coords = []
while (cluster_coords.size+1) < num_clusters
coord = (rand(data_size))
cluster_coords << coord unless (cluster_coords.member?(coord) || coord==data_size-1)
end
cluster_coords << data_size-1
cluster_coords.sort
end
Data_Size = 10
Clusters = 8
# data = [1, 1, 1, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 100]
data = []
(1..Data_Size).each do |i|
num = (rand(7)+1)
data << num
end
data.sort!
initial_clusters = initial_clusters(data, Clusters)
new_clusters = k_means_cluster(initial_clusters)
pp initial_clusters
pp new_clusters
# Pick number of clusters (k)
# Make k points (they're going to be the centroids)
# Randomize all these points location
# Calculate Euclidean distance from each point to all centroids
# Assign 'membership' of each point to the nearest centroid
# Establish the new centroids by averageing locations of all points belonging to a given cluster
# Goto 4 Until convergence is achieved, or changes made are irrelevant.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment