Created
August 20, 2017 10:43
-
-
Save narulkargunjan/516637bfa9a5004290c05ed68efd0123 to your computer and use it in GitHub Desktop.
Provides simple code for K-means clustering: it shows how to choose an appropriate K and then scores a new dataset by assigning each row to its nearest cluster.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the training data into R.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
# NOTE(review): the name `iris` masks the built-in datasets::iris for the
# remainder of the script.
iris <- read.csv("C:/Users/Ashwin/Desktop/segmentation/CSV Fishers Iris Data.csv")
# View() opens a data viewer, so only call it in interactive sessions.
if (interactive()) View(iris)
summary(iris)
head(iris)

# Shuffle the row order to make the data a little more realistic.
# seq_len() is used instead of 1:nrow() so an empty data frame stays empty.
iris <- iris[sample(seq_len(nrow(iris))), ]
if (interactive()) View(iris)
summary(iris)
head(iris)

# Plot the data for a general visual impression.
library(ggplot2)
ggplot(iris, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()

# Remove the Species variable so that it is not used as a clustering feature.
iris1 <- iris
iris1$Species <- NULL
if (interactive()) View(iris1)

# Standardise every column from the second one onwards.
# NOTE(review): column 1 is skipped -- presumably a row identifier; verify
# against the CSV.
iris1_scale <- scale(iris1[, 2:ncol(iris1)])
if (interactive()) View(iris1_scale)
# Run k-means clustering on the scaled features with 3 centers; nstart = 20
# tries 20 random initialisations and keeps the best one.
km.out <- kmeans(iris1_scale, centers = 3, nstart = 20)
summary(km.out)
print(km.out)

# Confusion matrix: true species (rows) against assigned cluster (columns).
table(iris$Species, km.out$cluster)

# Other metrics
# Overall R-squared: share of the total sum of squares that lies between
# clusters.
Overall_R_sq <- km.out$betweenss / km.out$totss
Overall_R_sq

# Plots
plot(iris[c("Petal_Length", "Petal_Width")], col = km.out$cluster,
     main = "k-means with 3 clusters")
# as.factor() makes the colouring work whether read.csv() returned Species as
# a factor or as character; raw character labels are not valid colour names.
plot(iris[c("Petal_Length", "Petal_Width")], col = as.factor(iris$Species))
plot(iris[c("Sepal_Length", "Sepal_Width")], col = km.out$cluster)
# Effect of nstart: six single-start k-means runs, drawn side by side.
# Lay the plots out on a 2 x 3 grid.
par(mfrow = c(2, 3))
# Fix the seed so the six runs are repeatable.
set.seed(2)
for (i in seq_len(6)) {
  # One random initialisation only, three clusters, on the scaled features.
  km.out1 <- kmeans(iris1_scale, centers = 3, nstart = 1)
  # Colour points by cluster; the title is the total within-cluster sum of
  # squares of this run.
  plot(
    iris1[c("Sepal_Length", "Sepal_Width")],
    col = km.out1$cluster,
    main = km.out1$tot.withinss
  )
}
# Finding the optimal number of clusters (elbow method).
# Largest number of cluster centers to try.
max_k <- 15
# Preallocate the total within-cluster sum of squares for each k instead of
# growing the vector inside the loop.
wss <- numeric(max_k)
# For 1 to max_k cluster centers
for (i in seq_len(max_k)) {
  km.out2 <- kmeans(iris1_scale, centers = i, nstart = 20)
  # Save the total within sum of squares for this k.
  wss[i] <- km.out2$tot.withinss
}
# Back to a single plot per page.
par(mfrow = c(1, 1))
# Plot total within sum of squares vs. number of clusters.
plot(seq_len(max_k), wss, type = "b",
     xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")
# ------------------------------------------------------------------
# Apply the clustering to new data
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
iris_test <- read.csv("C:/Users/Ashwin/Desktop/segmentation/Fishers Iris Data -for scoring.csv")
head(iris_test)

# Shuffle the row order to make the data a little more realistic.
# The permutation must be sized by the scoring data itself, not by the
# training data, otherwise rows are dropped or NA rows are introduced.
iris_test <- iris_test[sample(seq_len(nrow(iris_test))), ]
# View() opens a data viewer, so only call it in interactive sessions.
if (interactive()) View(iris_test)
summary(iris_test)
head(iris_test)

# Plot the scoring data, then the training data, for visual comparison.
library(ggplot2)
ggplot(iris_test, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()
ggplot(iris, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()

# Remove the Species variable so that it is not used as a clustering feature.
iris_test1 <- iris_test
iris_test1$Species <- NULL
if (interactive()) View(iris_test1)

# Scale the scoring features (same column selection as for training) using the
# training center and scale stored by scale() on iris1_scale, so the new
# points live in the same space as the fitted cluster centers.
iris_test1_scale <- scale(
  iris_test1[, 2:ncol(iris_test1)],
  center = attr(iris1_scale, "scaled:center"),
  scale = attr(iris1_scale, "scaled:scale")
)
# Creating closest cluster function
#' Find the nearest cluster center for one observation.
#'
#' @param x Numeric vector with one value per feature, on the same scale as
#'   `centers`.
#' @param centers Numeric matrix with one row per cluster center. Defaults to
#'   the centers of the fitted model `km.out`, which is looked up only when
#'   the argument is not supplied.
#' @return The row index of the center with the smallest Euclidean distance to
#'   `x` (the first one on ties), named by that row when `centers` has row
#'   names.
closest.cluster <- function(x, centers = km.out$centers) {
  # Guard against silent recycling when the feature counts differ.
  stopifnot(length(x) == ncol(centers))
  # Subtract x from every center row, then take the Euclidean norm per row.
  cluster.dist <- sqrt(rowSums(sweep(centers, 2, x)^2))
  which.min(cluster.dist)
}
# Assign every scaled scoring row to its nearest cluster center.
clusters2 <- apply(iris_test1_scale, 1, closest.cluster)
clusters2
clusters2.df <- data.frame(clusters2)
# View() opens a data viewer, so only call it in interactive sessions.
if (interactive()) View(clusters2.df)

# Attach the cluster labels to the original scoring data.
# The labels are in the same row order as the data they were computed from,
# so bind them column-wise. merge() without a shared key column would return
# the Cartesian product of both inputs instead.
stopifnot(length(clusters2) == nrow(iris_test1))
iris_test2 <- cbind(iris_test1, cluster = unname(clusters2))
if (interactive()) View(iris_test2)
# Cross-check the number of clusters with NbClust's CCC index.
# Install NbClust only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("NbClust", quietly = TRUE)) {
  install.packages("NbClust")
}
library(NbClust)
# Use the same scaled feature matrix that kmeans() was fitted on, so the
# suggested number of clusters refers to the data that was actually clustered.
I_ccc <- NbClust(iris1_scale, distance = "euclidean", min.nc = 2, max.nc = 15,
                 method = "kmeans", index = "ccc")
I_ccc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment