narulkargunjan · August 20, 2017 10:43
diff --git a/K_Means_Clustering.R b/K_Means_Clustering.R
 # Read the training data into R.
 # NOTE(review): the path is absolute and machine-specific; adjust it before
 # running elsewhere.

 iris <- read.csv("C:/Users/Ashwin/Desktop/segmentation/CSV Fishers Iris Data.csv")
 View(iris)
 summary(iris)
 head(iris)

 # Shuffle the rows so that any ordering present in the file is removed.
 # seq_len() is used instead of 1:nrow(iris) because 1:0 evaluates to
 # c(1, 0), which would index a bogus row if the file were empty.

 iris <- iris[sample(seq_len(nrow(iris))), ]
 View(iris)
 summary(iris)
 head(iris)

 # Scatter plot of the petal measurements, coloured by species

 library(ggplot2)
 ggplot(data = iris, mapping = aes(x = Petal_Length, y = Petal_Width, color = Species)) +
   geom_point()

 # Build the clustering input: drop the Species label so it cannot take part
 # in the clustering, then centre and scale the columns from the second one
 # onwards.
 # NOTE(review): column 1 is left out of the scaled matrix -- presumably an
 # ID/index column; confirm against the CSV.

 iris1 <- iris
 iris1[["Species"]] <- NULL
 View(iris1)
 iris1_scale <- scale(iris1[, 2:ncol(iris1)])
 View(iris1_scale)

 # Fit k-means with three clusters using 20 random starts

 km.out <- kmeans(x = iris1_scale, centers = 3, nstart = 20)
 summary(km.out)
 print(km.out)

 # Cross-tabulate the true species against the assigned cluster
 table(iris[["Species"]], km.out[["cluster"]])


 # Other metrics

 # Ratio of the between-cluster sum of squares to the total sum of squares
 Overall_R_sq <- km.out[["betweenss"]] / km.out[["totss"]]
 Overall_R_sq
  
  

 # Plots

 # Petal measurements coloured by assigned cluster.
 plot(iris[c("Petal_Length", "Petal_Width")], col = km.out$cluster,
      main = "k-means with 3 clusters")
 # Same axes coloured by true species. The label is converted to integer
 # codes explicitly: since R 4.0 read.csv() returns character columns by
 # default, and species names are not valid colour names, so passing the
 # column directly would fail.
 plot(iris[c("Petal_Length", "Petal_Width")],
      col = as.integer(as.factor(iris$Species)))
 # Sepal measurements coloured by assigned cluster.
 plot(iris[c("Sepal_Length", "Sepal_Width")], col = km.out$cluster)

 # Use of nstart

 # Draw six single-start fits side by side on a 2 x 3 grid

 par(mfrow = c(2, 3))
 # Fix the RNG state so the six runs below are repeatable
 set.seed(2)
 for (i in seq_len(6)) {
   # Three clusters from a single random start
   km.out1 <- kmeans(iris1_scale, centers = 3, nstart = 1)
   # Sepal scatter coloured by cluster; the title shows this run's total
   # within-cluster sum of squares
   plot(
     iris1[c("Sepal_Length", "Sepal_Width")],
     col = km.out1$cluster,
     main = km.out1$tot.withinss
   )
 }


 # Finding optimal number of clusters

 # Largest number of clusters to try (used for both the loop and the plot)
 max_clusters <- 15

 # Total within-cluster sum of squares for each candidate k. Preallocated so
 # the vector is not regrown on every iteration.
 wss <- numeric(max_clusters)

 for (i in seq_len(max_clusters)) {
   km.out2 <- kmeans(iris1_scale, centers = i, nstart = 20)
   # Save total within sum of squares for this k
   wss[i] <- km.out2$tot.withinss
 }

 # Return to a single plotting panel
 par(mfrow = c(1, 1))

 # Plot total within sum of squares vs. number of clusters
 plot(seq_len(max_clusters), wss, type = "b",
      xlab = "Number of Clusters",
      ylab = "Within groups sum of squares")



 # ------------------------------------------------------------------


 # Apply the clustering to new data
 # NOTE(review): the path is absolute and machine-specific; adjust it before
 # running elsewhere.

 iris_test <- read.csv("C:/Users/Ashwin/Desktop/segmentation/Fishers Iris Data -for scoring.csv")
 head(iris_test)

 # Shuffle the scoring rows. The permutation is sized from iris_test itself:
 # sizing it from the training frame would drop rows or create all-NA rows
 # whenever the two files differ in length.

 iris_test <- iris_test[sample(seq_len(nrow(iris_test))), ]
 View(iris_test)
 summary(iris_test)
 head(iris_test)

 # Plot data for general visualisation (scoring data, then training data)

 library(ggplot2)
 ggplot(iris_test, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()
 ggplot(iris, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()

 # Remove the species variable so that it is not used as a clustering feature

 iris_test1 <- iris_test
 iris_test1$Species <- NULL
 View(iris_test1)
 # Scale the *scoring* features, reusing the centring and scaling values
 # stored on the training matrix so that the new rows live in the same space
 # as the fitted cluster centres.
 # NOTE(review): assumes the scoring file has the same column layout as the
 # training file -- confirm.
 iris_test1_scale <- scale(
   iris_test1[, 2:ncol(iris_test1)],
   center = attr(iris1_scale, "scaled:center"),
   scale = attr(iris1_scale, "scaled:scale")
 )

 # Creating closest cluster function
 #
 # Assigns one observation to its nearest cluster centre.
 #
 # x        numeric vector of feature values, one per column of `centers`.
 # centers  matrix with one row per cluster centre. Defaults to the centres
 #          of the fitted model `km.out`; the default is only looked up when
 #          the function is called without `centers`.
 #
 # Returns the index of the row of `centers` with the smallest Euclidean
 # distance to `x` (the first such row on ties).

 closest.cluster <- function(x, centers = km.out$centers) {
   cluster.dist <- apply(centers, 1, function(y) sqrt(sum((x - y)^2)))
   which.min(cluster.dist)[1]
 }


 # Assign every row of the scaled scoring matrix to its nearest cluster
 clusters2 <- apply(iris_test1_scale, 1, closest.cluster)
 clusters2
 clusters2.df <- data.frame(clusters2)
 View(clusters2.df)
 # Attaching clusters information to original without clusters data.
 # The two objects share no key column, so merge() would pair every data row
 # with every label (a Cartesian product). A column bind keeps exactly one
 # label per row; the row counts are checked first so a mismatch fails loudly.
 stopifnot(nrow(clusters2.df) == nrow(iris_test1))
 iris_test2 <- cbind(iris_test1, clusters2.df)
 View(iris_test2)


 # Install NbClust only when it is missing, so re-running the script does not
 # re-download the package every time.
 if (!requireNamespace("NbClust", quietly = TRUE)) {
   install.packages("NbClust")
 }
 library(NbClust)
 # Evaluate the "ccc" index for 2 to 15 k-means clusters.
 # NOTE(review): this runs on iris1 (unscaled, all remaining columns) while
 # the k-means fits above use iris1_scale -- confirm this is intended.
 I_ccc <- NbClust(iris1, distance = "euclidean", min.nc = 2, max.nc = 15,
                  method = "kmeans", index = "ccc")
 I_ccc
	# Read the training data into R.
	# NOTE(review): the path is absolute and machine-specific; adjust it before
	# running elsewhere.

	iris <- read.csv("C:/Users/Ashwin/Desktop/segmentation/CSV Fishers Iris Data.csv")
	View(iris)
	summary(iris)
	head(iris)

	# Shuffle the rows so that any ordering present in the file is removed.
	# seq_len() is used instead of 1:nrow(iris) because 1:0 evaluates to
	# c(1, 0), which would index a bogus row if the file were empty.

	iris <- iris[sample(seq_len(nrow(iris))), ]
	View(iris)
	summary(iris)
	head(iris)

	# Scatter plot of the petal measurements, coloured by species

	library(ggplot2)
	ggplot(data = iris, mapping = aes(x = Petal_Length, y = Petal_Width, color = Species)) +
	  geom_point()

	# Build the clustering input: drop the Species label so it cannot take part
	# in the clustering, then centre and scale the columns from the second one
	# onwards.
	# NOTE(review): column 1 is left out of the scaled matrix -- presumably an
	# ID/index column; confirm against the CSV.

	iris1 <- iris
	iris1[["Species"]] <- NULL
	View(iris1)
	iris1_scale <- scale(iris1[, 2:ncol(iris1)])
	View(iris1_scale)

	# Fit k-means with three clusters using 20 random starts

	km.out <- kmeans(x = iris1_scale, centers = 3, nstart = 20)
	summary(km.out)
	print(km.out)

	# Cross-tabulate the true species against the assigned cluster
	table(iris[["Species"]], km.out[["cluster"]])


	# Other metrics

	# Ratio of the between-cluster sum of squares to the total sum of squares
	Overall_R_sq <- km.out[["betweenss"]] / km.out[["totss"]]
	Overall_R_sq



	# Plots

	# Petal measurements coloured by assigned cluster.
	plot(iris[c("Petal_Length", "Petal_Width")], col = km.out$cluster,
	     main = "k-means with 3 clusters")
	# Same axes coloured by true species. The label is converted to integer
	# codes explicitly: since R 4.0 read.csv() returns character columns by
	# default, and species names are not valid colour names, so passing the
	# column directly would fail.
	plot(iris[c("Petal_Length", "Petal_Width")],
	     col = as.integer(as.factor(iris$Species)))
	# Sepal measurements coloured by assigned cluster.
	plot(iris[c("Sepal_Length", "Sepal_Width")], col = km.out$cluster)

	# Use of nstart

	# Draw six single-start fits side by side on a 2 x 3 grid

	par(mfrow = c(2, 3))
	# Fix the RNG state so the six runs below are repeatable
	set.seed(2)
	for (i in seq_len(6)) {
	  # Three clusters from a single random start
	  km.out1 <- kmeans(iris1_scale, centers = 3, nstart = 1)
	  # Sepal scatter coloured by cluster; the title shows this run's total
	  # within-cluster sum of squares
	  plot(
	    iris1[c("Sepal_Length", "Sepal_Width")],
	    col = km.out1$cluster,
	    main = km.out1$tot.withinss
	  )
	}


	# Finding optimal number of clusters

	# Largest number of clusters to try (used for both the loop and the plot)
	max_clusters <- 15

	# Total within-cluster sum of squares for each candidate k. Preallocated so
	# the vector is not regrown on every iteration.
	wss <- numeric(max_clusters)

	for (i in seq_len(max_clusters)) {
	  km.out2 <- kmeans(iris1_scale, centers = i, nstart = 20)
	  # Save total within sum of squares for this k
	  wss[i] <- km.out2$tot.withinss
	}

	# Return to a single plotting panel
	par(mfrow = c(1, 1))

	# Plot total within sum of squares vs. number of clusters
	plot(seq_len(max_clusters), wss, type = "b",
	     xlab = "Number of Clusters",
	     ylab = "Within groups sum of squares")



	# ------------------------------------------------------------------


	# Apply the clustering to new data
	# NOTE(review): the path is absolute and machine-specific; adjust it before
	# running elsewhere.

	iris_test <- read.csv("C:/Users/Ashwin/Desktop/segmentation/Fishers Iris Data -for scoring.csv")
	head(iris_test)

	# Shuffle the scoring rows. The permutation is sized from iris_test itself:
	# sizing it from the training frame would drop rows or create all-NA rows
	# whenever the two files differ in length.

	iris_test <- iris_test[sample(seq_len(nrow(iris_test))), ]
	View(iris_test)
	summary(iris_test)
	head(iris_test)

	# Plot data for general visualisation (scoring data, then training data)

	library(ggplot2)
	ggplot(iris_test, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()
	ggplot(iris, aes(Petal_Length, Petal_Width, color = Species)) + geom_point()

	# Remove the species variable so that it is not used as a clustering feature

	iris_test1 <- iris_test
	iris_test1$Species <- NULL
	View(iris_test1)
	# Scale the *scoring* features, reusing the centring and scaling values
	# stored on the training matrix so that the new rows live in the same space
	# as the fitted cluster centres.
	# NOTE(review): assumes the scoring file has the same column layout as the
	# training file -- confirm.
	iris_test1_scale <- scale(
	  iris_test1[, 2:ncol(iris_test1)],
	  center = attr(iris1_scale, "scaled:center"),
	  scale = attr(iris1_scale, "scaled:scale")
	)

	# Creating closest cluster function
	#
	# Assigns one observation to its nearest cluster centre.
	#
	# x        numeric vector of feature values, one per column of `centers`.
	# centers  matrix with one row per cluster centre. Defaults to the centres
	#          of the fitted model `km.out`; the default is only looked up when
	#          the function is called without `centers`.
	#
	# Returns the index of the row of `centers` with the smallest Euclidean
	# distance to `x` (the first such row on ties).

	closest.cluster <- function(x, centers = km.out$centers) {
	  cluster.dist <- apply(centers, 1, function(y) sqrt(sum((x - y)^2)))
	  which.min(cluster.dist)[1]
	}


	# Assign every row of the scaled scoring matrix to its nearest cluster
	clusters2 <- apply(iris_test1_scale, 1, closest.cluster)
	clusters2
	clusters2.df <- data.frame(clusters2)
	View(clusters2.df)
	# Attaching clusters information to original without clusters data.
	# The two objects share no key column, so merge() would pair every data row
	# with every label (a Cartesian product). A column bind keeps exactly one
	# label per row; the row counts are checked first so a mismatch fails loudly.
	stopifnot(nrow(clusters2.df) == nrow(iris_test1))
	iris_test2 <- cbind(iris_test1, clusters2.df)
	View(iris_test2)


	# Install NbClust only when it is missing, so re-running the script does not
	# re-download the package every time.
	if (!requireNamespace("NbClust", quietly = TRUE)) {
	  install.packages("NbClust")
	}
	library(NbClust)
	# Evaluate the "ccc" index for 2 to 15 k-means clusters.
	# NOTE(review): this runs on iris1 (unscaled, all remaining columns) while
	# the k-means fits above use iris1_scale -- confirm this is intended.
	I_ccc <- NbClust(iris1, distance = "euclidean", min.nc = 2, max.nc = 15,
	                 method = "kmeans", index = "ccc")
	I_ccc