tdunning · August 29, 2015 14:12
diff --git a/Clustering is hard b/Clustering is hard
 # Pick 10 corners of the 10-dimensional unit hypercube at random to serve as
 # cluster centers: each row of `centers` is one corner with 0/1 coordinates.
 # Random corners are usually all different, but occasionally two rows
 # coincide, which would give two clusters the same center, so redraw until
 # every row is distinct. anyDuplicated() checks for repeated rows directly
 # instead of relying on an exact floating-point test of det(centers) == 0,
 # which also rejects matrices that are singular for other reasons.
 repeat {
    centers = matrix(runif(10*10)>0.5, ncol=10) + 0
    if (anyDuplicated(centers) == 0) {
        break
    }
 }

 # Start x out by assigning each of the 10000 points to one of the clusters,
 # uniformly at random; column n holds the cluster index (a row of `centers`).
 x = data.frame(n = sample.int(nrow(centers), 10000, replace=TRUE))
 # Then place each point at its cluster's center plus a bit of Gaussian noise
 # (sd 1e-3) on every coordinate. All coordinate columns are built in a single
 # matrix operation rather than growing the data frame with cbind() one column
 # at a time.
 noise = rnorm(nrow(x) * ncol(centers), 0, 1e-3)
 x = cbind(x, centers[x$n, , drop=FALSE] + noise)
 names(x) = c("n", paste0("V", seq_len(ncol(centers))))

 # Run k-means 100 times on the coordinate columns and tally how often it
 # recovers the planted clusters. For each run, count for every true cluster
 # (x$n) how many fitted clusters its points were spread over; a run succeeds
 # only when no true cluster is split, i.e. every such count is at most 1.
 # counts[run] keeps the largest count seen in that run.
 success = 0
 fail = 0
 counts = numeric(100)
 # the coordinate columns never change, so select them once up front
 coords = x[, 2:11]
 for (run in seq_len(100)) {
    fit = kmeans(coords, centers=10, nstart=1)
    # TRUE where a fitted cluster (row) holds at least one point of a true
    # cluster (column)
    overlap = table(fit$cluster, x$n) > 0
    spread = colSums(overlap)
    counts[run] = max(spread)
    if (all(spread <= 1)) {
        success = success + 1
    } else {
        fail = fail + 1
    }
 }
 print(list(fail = fail, success = success))
 print(table(counts))
	# Pick 10 corners of the 10-dimensional unit hypercube at random to serve as
	# cluster centers: each row of `centers` is one corner with 0/1 coordinates.
	# Random corners are usually all different, but occasionally two rows
	# coincide, which would give two clusters the same center, so redraw until
	# every row is distinct. anyDuplicated() checks for repeated rows directly
	# instead of relying on an exact floating-point test of det(centers) == 0,
	# which also rejects matrices that are singular for other reasons.
	repeat {
	    centers = matrix(runif(10*10)>0.5, ncol=10) + 0
	    if (anyDuplicated(centers) == 0) {
	        break
	    }
	}

	# Start x out by assigning each of the 10000 points to one of the clusters,
	# uniformly at random; column n holds the cluster index (a row of `centers`).
	x = data.frame(n = sample.int(nrow(centers), 10000, replace=TRUE))
	# Then place each point at its cluster's center plus a bit of Gaussian noise
	# (sd 1e-3) on every coordinate. All coordinate columns are built in a single
	# matrix operation rather than growing the data frame with cbind() one column
	# at a time.
	noise = rnorm(nrow(x) * ncol(centers), 0, 1e-3)
	x = cbind(x, centers[x$n, , drop=FALSE] + noise)
	names(x) = c("n", paste0("V", seq_len(ncol(centers))))

	# Run k-means 100 times on the coordinate columns and tally how often it
	# recovers the planted clusters. For each run, count for every true cluster
	# (x$n) how many fitted clusters its points were spread over; a run succeeds
	# only when no true cluster is split, i.e. every such count is at most 1.
	# counts[run] keeps the largest count seen in that run.
	success = 0
	fail = 0
	counts = numeric(100)
	# the coordinate columns never change, so select them once up front
	coords = x[, 2:11]
	for (run in seq_len(100)) {
	    fit = kmeans(coords, centers=10, nstart=1)
	    # TRUE where a fitted cluster (row) holds at least one point of a true
	    # cluster (column)
	    overlap = table(fit$cluster, x$n) > 0
	    spread = colSums(overlap)
	    counts[run] = max(spread)
	    if (all(spread <= 1)) {
	        success = success + 1
	    } else {
	        fail = fail + 1
	    }
	}
	print(list(fail = fail, success = success))
	print(table(counts))