eminaksehirli · February 13, 2015 23:46
diff --git a/kmeans.m b/kmeans.m
 %    This program is free software: you can redistribute it and/or modify
 %    it under the terms of the GNU Affero General Public License as 
 %    published by the Free Software Foundation, either version 3 of the 
 %    License, or any later version.
 %
 %    This program is distributed in the hope that it will be useful,
 %    but WITHOUT ANY WARRANTY; without even the implied warranty of
 %    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 %    GNU Affero General Public License for more details. It can be found
 %    at <http://www.gnu.org/licenses/>.
 %
 %==========================================================================
 % FUNCTION: [assignments, centers] = kmeans(X, k, centers = 0, maxiter = 200)
 % DESCRIPTION: This function performs k-means clustering algorithm on a given 
 %              dataset. 
 %
 % INPUTS:   X = N*d matrix of dataset, rows of X correspond to N data points; 
 %               columns correspond to attributes.
 %           k = number of clusters
 %     centers = Starting centers of clusters.
 %     maxiter = Maximum iteration count for convergence.
 %
 % OUTPUTS: assignments = Integer vector that holds 
 %==========================================================================
 % copyright (c) 2010 M. Emin Aksehirli
 %==========================================================================
 function  [assignments, centers] = kmeans(X, k, centers = 0, maxiter = 200)
 	if (centers == 0)
 		centerRows = randperm(size(X)(1));
 		centers = X(centerRows(1:k), :);
 	endif
 	numOfRows = length(X(:,1));
 	numOfFeatures = length(X(1,:));
 	assignments = ones(1, numOfRows);

 	for iter = 1:maxiter
 		clusterTotals = zeros(k, numOfFeatures);
 		clusterSizes = zeros(k, 1);
 		for rowIx = 1:numOfRows
 			minDist = realmax;
 			assignTo = 0;
 			for centerIx = 1:k 
 				% Euclidian distance is used.
 				dist = sqrt(sum((X(rowIx, : ) - centers(centerIx, :)).^2));
 				if dist < minDist
 					minDist = dist;
 					assignTo = centerIx;
 				endif
 			endfor
 			assignments(rowIx) = assignTo;

 			% Keep these information to calculate cluster centers.
 			clusterTotals(assignTo, :) += X(rowIx, :);
 			clusterSizes(assignTo)++;
 		endfor

 		% This process is called 'singleton' in terms of Matlab. 
 		% If a cluster is empty choose a random data point as new 
 		% cluster cener.
 		for clusterIx = 1:k
 			if (clusterSizes(clusterIx) == 0)
 				randomRow = round(1 + rand() * (numOfRows - 1) );
 				clusterTotals(clusterIx, :) =  X(randomRow, :);
 				clusterSizes(clusterIx) = 1;
 			endif
 		endfor

 		newCenters = zeros(k, numOfFeatures);
 		for centerIx = 1:k 
 			newCenters(centerIx, :) = clusterTotals(centerIx, : ) / clusterSizes(centerIx);
 		endfor
 	
 		diff = sum(sum(abs(newCenters - centers)));
 	
 		if diff < eps
 			%disp('Centers are same, which means we converged before maxiteration count. This is a good thing!')
 			break;
 		endif
 	
 		centers = newCenters;
 	endfor	
 	assignments = assignments';
 	%printf('iter: %d, diff: %f\n', iter, diff);
 endfunction
	% This program is free software: you can redistribute it and/or modify
	% it under the terms of the GNU Affero General Public License as
	% published by the Free Software Foundation, either version 3 of the
	% License, or any later version.
	%
	% This program is distributed in the hope that it will be useful,
	% but WITHOUT ANY WARRANTY; without even the implied warranty of
	% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	% GNU Affero General Public License for more details. It can be found
	% at <http://www.gnu.org/licenses/>.
	%
	%==========================================================================
	% FUNCTION: [assignments, centers] = kmeans(X, k, centers = 0, maxiter = 200)
	% DESCRIPTION: This function performs k-means clustering algorithm on a given
	% dataset.
	%
	% INPUTS: X = N*d matrix of dataset, rows of X correspond to N data points;
	% columns correspond to attributes.
	% k = number of clusters
	% centers = Starting centers of clusters.
	% maxiter = Maximum iteration count for convergence.
	%
	% OUTPUTS: assignments = Integer vector that holds
	%==========================================================================
	% copyright (c) 2010 M. Emin Aksehirli
	%==========================================================================
	function [assignments, centers] = kmeans(X, k, centers = 0, maxiter = 200)
	if (centers == 0)
	centerRows = randperm(size(X)(1));
	centers = X(centerRows(1:k), :);
	endif
	numOfRows = length(X(:,1));
	numOfFeatures = length(X(1,:));
	assignments = ones(1, numOfRows);

	for iter = 1:maxiter
	clusterTotals = zeros(k, numOfFeatures);
	clusterSizes = zeros(k, 1);
	for rowIx = 1:numOfRows
	minDist = realmax;
	assignTo = 0;
	for centerIx = 1:k
	% Euclidian distance is used.
	dist = sqrt(sum((X(rowIx, : ) - centers(centerIx, :)).^2));
	if dist < minDist
	minDist = dist;
	assignTo = centerIx;
	endif
	endfor
	assignments(rowIx) = assignTo;

	% Keep these information to calculate cluster centers.
	clusterTotals(assignTo, :) += X(rowIx, :);
	clusterSizes(assignTo)++;
	endfor

	% This process is called 'singleton' in terms of Matlab.
	% If a cluster is empty choose a random data point as new
	% cluster cener.
	for clusterIx = 1:k
	if (clusterSizes(clusterIx) == 0)
	randomRow = round(1 + rand() * (numOfRows - 1) );
	clusterTotals(clusterIx, :) = X(randomRow, :);
	clusterSizes(clusterIx) = 1;
	endif
	endfor

	newCenters = zeros(k, numOfFeatures);
	for centerIx = 1:k
	newCenters(centerIx, :) = clusterTotals(centerIx, : ) / clusterSizes(centerIx);
	endfor

	diff = sum(sum(abs(newCenters - centers)));

	if diff < eps
	%disp('Centers are same, which means we converged before maxiteration count. This is a good thing!')
	break;
	endif

	centers = newCenters;
	endfor
	assignments = assignments';
	%printf('iter: %d, diff: %f\n', iter, diff);
	endfunction