@narulkargunjan · Created August 20, 2017
Simple R code for K-means clustering: choosing the right number of clusters K and scoring a new dataset against the fitted clusters.
# Read the data into R
iris <- read.csv("C:/Users/Ashwin/Desktop/segmentation/CSV Fishers Iris Data.csv")
View(iris)
summary(iris)
head(iris)
# Randomise the row order to make the data a little more realistic
iris <- iris[sample(1:nrow(iris)), ]
View(iris)
summary(iris)
head(iris)
# Plot data for general visualisation
library(ggplot2)
ggplot(iris, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()
# Remove the species variable so that it is not used as a clustering variable
iris1 <- iris
iris1$Species <- NULL
View(iris1)
# Standardise the measurement columns (the first column is skipped, assumed to be an ID)
iris1_scale <- scale(iris1[, 2:ncol(iris1)])
View(iris1_scale)
# Run kmeans clustering
km.out <- kmeans(iris1_scale, centers=3, nstart=20)
summary(km.out)
print(km.out)
# Cross-tabulate the known species against the cluster assignments ("confusion matrix")
table(iris$Species, km.out$cluster)
# Other metrics
# Overall R-squared: share of the total variance explained by the clustering
Overall_R_sq <- km.out$betweenss / km.out$totss
Overall_R_sq
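# A hedged sketch of two additional quality checks, assuming the "cluster" and
# "mclust" packages are available (they are not used elsewhere in this gist).
library(cluster)
library(mclust)
# Average silhouette width: values near 1 indicate compact, well-separated clusters
sil <- silhouette(km.out$cluster, dist(iris1_scale))
mean(sil[, "sil_width"])
# Adjusted Rand Index: agreement between the cluster assignments and the known species
adjustedRandIndex(km.out$cluster, iris$Species)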
# Plots
plot(iris[c("Petal_Length", "Petal_Width")],col=km.out$cluster
,main = "k-means with 3 clusters")
plot(iris[c("Petal_Length", "Petal_Width")],col=iris$Species)
plot(iris[c("Sepal_Length", "Sepal_Width")],col=km.out$cluster)
# Use of nstart
# Set up 2 x 3 plotting grid
par(mfrow = c(2, 3))
# Set seed
set.seed(2)
for(i in 1:6) {
  # Run kmeans() on the scaled data with three clusters and a single start
  km.out1 <- kmeans(iris1_scale, centers = 3, nstart = 1)
  # Plot clusters, titled with the total within-cluster sum of squares
  plot(iris1[c("Sepal_Length", "Sepal_Width")], col = km.out1$cluster, main = km.out1$tot.withinss)
}
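# Hedged illustration (not part of the original gist): with nstart = 1 the objective
# can vary between runs, while nstart = 20 keeps the best of 20 random starts.
set.seed(2)
kmeans(iris1_scale, centers = 3, nstart = 1)$tot.withinss
set.seed(2)
kmeans(iris1_scale, centers = 3, nstart = 20)$tot.withinss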
# Finding optimal number of clusters
# Initialize total within sum of squares error: wss
wss <- numeric(15)
# For 1 to 15 cluster centers
for (i in 1:15) {
  km.out2 <- kmeans(iris1_scale, centers = i, nstart = 20)
  # Save total within-cluster sum of squares to wss
  wss[i] <- km.out2$tot.withinss
}
par(mfrow = c(1, 1))
# Plot total within sum of squares vs. number of clusters
plot(1:15, wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
# ------------------------------------------------------------------
# Apply the clustering to new data
iris_test <- read.csv("C:/Users/Ashwin/Desktop/segmentation/Fishers Iris Data -for scoring.csv")
head(iris_test)
# Randomise the row order to make the data a little more realistic
iris_test <- iris_test[sample(1:nrow(iris_test)), ]
View(iris_test)
summary(iris_test)
head(iris_test)
# Plot data for general visualisation
library(ggplot2)
ggplot(iris_test, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()
ggplot(iris, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()
# Remove the species variable so that it is not used as a clustering variable
iris_test1 <- iris_test
iris_test1$Species <- NULL
View(iris_test1)
# Scale the test data using the centering and scaling learned from the training data
iris_test1_scale <- scale(iris_test1[, 2:ncol(iris_test1)],
                          center = attr(iris1_scale, "scaled:center"),
                          scale = attr(iris1_scale, "scaled:scale"))
# Function returning the index of the nearest training cluster centre (Euclidean distance)
closest.cluster <- function(x) {
  cluster.dist <- apply(km.out$centers, 1, function(y) sqrt(sum((x - y)^2)))
  return(which.min(cluster.dist)[1])
}
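# Hedged sanity check (not in the original gist): applying closest.cluster to the
# scaled training data should largely reproduce the assignments found by kmeans().
train_clusters <- apply(iris1_scale, 1, closest.cluster)
table(train_clusters, km.out$cluster)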
clusters2 <- apply(iris_test1_scale, 1, closest.cluster)
clusters2
clusters2.df<-data.frame(clusters2)
View(clusters2.df)
# Attach the cluster assignments to the scored (unlabelled) test data
iris_test2 <- data.frame(iris_test1, cluster = clusters2)
View(iris_test2)
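# Hedged check (not in the original gist): compare the scored clusters against the
# known species in the scoring file, mirroring the cross-tabulation above.
table(iris_test$Species, clusters2)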
# install.packages("NbClust")   # uncomment if NbClust is not yet installed
library(NbClust)
# Cubic Clustering Criterion (CCC) on the same scaled data used for kmeans above
I_ccc <- NbClust(iris1_scale, distance = "euclidean", min.nc = 2, max.nc = 15,
                 method = "kmeans", index = "ccc")
I_ccc
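# Hedged extension: NbClust can also vote across many indices at once; the summary
# reports the number of clusters proposed by the majority of indices.
I_all <- NbClust(iris1_scale, distance = "euclidean", min.nc = 2, max.nc = 15,
                 method = "kmeans", index = "all")
I_all$Best.nc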