Last active
January 22, 2016 15:44
-
-
Save maptracker/07328d13c143ae9b9e53 to your computer and use it in GitHub Desktop.
Exploring #kmeans stability in #R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Code was initially taken From Coursera lecture: | |
## https://www.coursera.org/learn/exploratory-data-analysis/lecture/6hOqi/k-means-clustering-part-2 | |
set.seed(1234) | |
## Generate 30 points randomly assorted around 3 centroids: (1,1), (2,2), (3,1) | |
numPoints <- 30 | |
x <- rnorm(numPoints, mean = rep(1:3, each = numPoints/3), sd = 0.2) | |
y <- rnorm(numPoints, mean = rep(c(1, 2, 1), each = numPoints/3), sd = 0.2) | |
## Organize as DF | |
dataFrame <- data.frame(x = x, y = y, row.names = 1:numPoints) | |
## Look for the clusters. Because we generated three clusters above, | |
## unambiguous results should be found when looking for 3. Trying | |
## other cluster numbers (searchFor) will result in different | |
## outcomes each time you re-run the analysis | |
clusterAndPlot <- function( searchFor ) { | |
kmeansObj <- kmeans(dataFrame, centers = searchFor) | |
## Plot and label the input | |
plot(x, y, col = kmeansObj$cluster, pch = 1, cex = 2) | |
text(x + 0.05, y + 0.05, labels = as.character(1:numPoints), col = "blue") | |
## Add the centroids | |
points(kmeansObj$centers, col = 1:searchFor, pch = 3, cex = 3, lwd = 3) | |
} | |
clusterAndPlot(3) | |
dev.new( width = 12 ) | |
par( mfrow = c(1,2) ) | |
trackInstability <- function( searchFor, iter = 100 ) { | |
## This function tracks how often specific k-means clusters | |
## appear. It generates a scatter plot with all observed | |
## centroids labeled, plus a bar plot showing the frequency of | |
## occurance of each particular cluster (based on its object | |
## membership) | |
## The goal here is to see how consistent clustering is for the | |
## data generated above. Do we always find the same clusters? If | |
## not, how often are given clusters observed? | |
hist <- data.frame() | |
lookup <- list() | |
nr <- nrow(dataFrame) | |
for (i in 1:iter) { | |
## Cluster the data | |
ko <- kmeans(dataFrame, centers = searchFor) | |
## Find object membership for each cluster, stored in a list | |
clust <- list() | |
for (j in 1:nr) { | |
cnum <- as.character(ko$cluster[j]) | |
clust[[ cnum ]] <- c(clust[[ cnum ]], j) | |
} | |
## Now turn each cluster into a unique key, and use that key | |
## to increment our tally list (counts) | |
for (k in 1:searchFor) { | |
cnum <- as.character(k) | |
key <- paste(sort(clust[[cnum]]), collapse = ",") | |
rowNum <- lookup[[ key ]] | |
if (is.null(rowNum)) { | |
# First time seeing this cluster | |
row <- data.frame(Count = 1, Members = key, | |
Size = length(clust[[cnum]]), | |
X = ko$centers[k,1], Y = ko$centers[k,2]) | |
hist <- rbind(hist, row) | |
lookup[[ key ]] <- nrow(hist) | |
} else { | |
hist$Count[ rowNum ] <- hist$Count[ rowNum ] + 1 | |
} | |
} | |
} | |
numc <- nrow(hist) | |
## Sort our clusters by size | |
hist <- hist[order(hist$Size), ] | |
hist$Name <- LETTERS[1:numc] | |
hist$Freq <- hist$Count / iter | |
## Plot the points | |
plot(dataFrame$x, dataFrame$y, col = "blue", pch = 1, cex = 1, | |
ylab = "", xlab = "") | |
text(x + 0.05, y + 0.05, labels = rownames(dataFrame), | |
col = "blue", cex = 0.6) | |
## Add the centroids, labeled by name | |
text(hist$X, hist$Y, labels = hist$Name, col = 1:numc, cex = 1, font = 2) | |
## I also wanted to add an ellipse defining the cluster, but the | |
## solution for that seems to involve the "car" package, which is | |
## pretty heavyweight: https://stackoverflow.com/a/26812165 | |
## Plot the data as a barchart, | |
## https://stackoverflow.com/a/4217305 # Get x-coordinate of each bar | |
bp <- barplot(hist$Freq, col = "white", names.arg = hist$Name, | |
border = 1:numc, ylab = "Frequency cluster is observed", | |
xlab = paste(c("Cluster Membership: centers =",searchFor), | |
collapse = " ")) | |
## Add a list of the members for each cluster | |
text(y = 0, x = bp - 0.3, labels = hist$Members, srt = 90, col = 1:numc, | |
pos = 4) | |
hist | |
} | |
ti <- trackInstability( 4, 300 ) |
Author
maptracker
commented
Jan 22, 2016
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment