talhasyed · March 19, 2010 20:39
diff --git a/k_means_clustering.rb b/k_means_clustering.rb
 require 'pp'

 def k_means_cluster(clusters=[[]])
  # pp clusters
  clusters_with_centroids = clusters.inject({}) do |centroids, cluster|
    
    cluster_centroid = (cluster.inject(0.0) {|item, sum| sum+=item})/cluster.size
    centroids[cluster_centroid] = cluster
    
    centroids
  end
  
  
  clusters_with_centroids.each do |centroid, cluster|
    cluster.each do |node|
      cluster_key_to_assign_to = find_nearest_cluster(node, clusters_with_centroids.keys)
      nearest_cluster = clusters_with_centroids[cluster_key_to_assign_to]
      unless nearest_cluster==cluster
        clusters_with_centroids[centroid].delete(node)
        clusters_with_centroids[cluster_key_to_assign_to] << node
        next
      end
    end
  end
  
  if clusters_with_centroids.values.sort == clusters
    return clusters
  else
    return k_means_cluster(clusters_with_centroids.values.sort)
  end
 end

 def find_nearest_cluster(node, clusters_with_centroids)
  nearest = clusters_with_centroids.first
  clusters_with_centroids[1..clusters_with_centroids.length].each do |centroid|
    nearest = centroid if ((centroid - node).abs <= (nearest-node).abs)
  end
  nearest
 end
  
 def initial_clusters(data=[], num_clusters=5)
  initial_coords = inital_cluster_coords(data.size, num_clusters)
  initial_clusters = []
  prev_index = 0
  
  initial_coords.each_with_index do |coord, i|
    cluster = data[prev_index..coord]
    prev_index=coord+1
    initial_clusters << cluster
  end
  
  initial_clusters
 end

 def inital_cluster_coords(data_size, num_clusters=5)
  raise(Exception, "Data Size #{data_size} can't be less than the number of clusters #{num_clusters}") if num_clusters > data_size
  cluster_coords = []
  
  while (cluster_coords.size+1) < num_clusters
    coord = (rand(data_size))
    cluster_coords << coord unless (cluster_coords.member?(coord) || coord==data_size-1)
  end
  
  cluster_coords << data_size-1
  cluster_coords.sort
 end

 Data_Size = 10
 Clusters = 8

 # data = [1, 1, 1, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 100]
 data = []
 (1..Data_Size).each do |i|
  num = (rand(7)+1)
  data << num
 end
 data.sort!

 initial_clusters = initial_clusters(data, Clusters)
 new_clusters = k_means_cluster(initial_clusters)

 pp initial_clusters
 pp new_clusters

 # Pick number of clusters (k)
 # Make k points (they're going to be the centroids)
 # Randomize all these points location
 # Calculate Euclidean distance from each point to all centroids
 # Assign 'membership' of each point to the nearest centroid
 # Establish the new centroids by averageing locations of all points belonging to a given cluster
 # Goto 4 Until convergence is achieved, or changes made are irrelevant.
	require 'pp'

	def k_means_cluster(clusters=[[]])
	# pp clusters
	clusters_with_centroids = clusters.inject({}) do \|centroids, cluster\|

	cluster_centroid = (cluster.inject(0.0) {\|item, sum\| sum+=item})/cluster.size
	centroids[cluster_centroid] = cluster

	centroids
	end


	clusters_with_centroids.each do \|centroid, cluster\|
	cluster.each do \|node\|
	cluster_key_to_assign_to = find_nearest_cluster(node, clusters_with_centroids.keys)
	nearest_cluster = clusters_with_centroids[cluster_key_to_assign_to]
	unless nearest_cluster==cluster
	clusters_with_centroids[centroid].delete(node)
	clusters_with_centroids[cluster_key_to_assign_to] << node
	next
	end
	end
	end

	if clusters_with_centroids.values.sort == clusters
	return clusters
	else
	return k_means_cluster(clusters_with_centroids.values.sort)
	end
	end

	def find_nearest_cluster(node, clusters_with_centroids)
	nearest = clusters_with_centroids.first
	clusters_with_centroids[1..clusters_with_centroids.length].each do \|centroid\|
	nearest = centroid if ((centroid - node).abs <= (nearest-node).abs)
	end
	nearest
	end

	def initial_clusters(data=[], num_clusters=5)
	initial_coords = inital_cluster_coords(data.size, num_clusters)
	initial_clusters = []
	prev_index = 0

	initial_coords.each_with_index do \|coord, i\|
	cluster = data[prev_index..coord]
	prev_index=coord+1
	initial_clusters << cluster
	end

	initial_clusters
	end

	def inital_cluster_coords(data_size, num_clusters=5)
	raise(Exception, "Data Size #{data_size} can't be less than the number of clusters #{num_clusters}") if num_clusters > data_size
	cluster_coords = []

	while (cluster_coords.size+1) < num_clusters
	coord = (rand(data_size))
	cluster_coords << coord unless (cluster_coords.member?(coord) \|\| coord==data_size-1)
	end

	cluster_coords << data_size-1
	cluster_coords.sort
	end

	Data_Size = 10
	Clusters = 8

	# data = [1, 1, 1, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 100]
	data = []
	(1..Data_Size).each do \|i\|
	num = (rand(7)+1)
	data << num
	end
	data.sort!

	initial_clusters = initial_clusters(data, Clusters)
	new_clusters = k_means_cluster(initial_clusters)

	pp initial_clusters
	pp new_clusters

	# Pick number of clusters (k)
	# Make k points (they're going to be the centroids)
	# Randomize all these points location
	# Calculate Euclidean distance from each point to all centroids
	# Assign 'membership' of each point to the nearest centroid
	# Establish the new centroids by averageing locations of all points belonging to a given cluster
	# Goto 4 Until convergence is achieved, or changes made are irrelevant.