BioSciEconomist · March 11, 2020 23:08
diff --git a/examples of cluster analysis in R b/examples of cluster analysis in R
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: ex cluster analysis.R
 # | DATE: 3/6/20 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:        
 # *----------------------------------------------------------------
 # | PURPOSE: basic mechanics of cluster analysis
 # *----------------------------------------------------------------


 ### motivated by quick R https://www.statmethods.net/advstats/cluster.html

 # rm(list=ls()) # get rid of any existing data 

 # Session options: print numbers in fixed notation (scipen) with 4
 # significant digits instead of R's default scientific notation.
 options(scipen = 100, digits = 4)

 library(psych) # use data from psych library (bfi)

 # Prepare Data
 # Keep the first 24 columns of bfi as clustering variables. The earlier
 # `df = bfi` was overwritten immediately and has been dropped.
 # NOTE(review): bfi presumably holds 25 item columns, so 1:24 leaves out the
 # last item -- confirm this is intended.
 df <- bfi[, 1:24]
 df <- na.omit(df) # listwise deletion of rows with missing values

 #---------------------------------
 # kmeans clustering
 #--------------------------------

 # Standardize variables so each one contributes on the same scale to the
 # Euclidean distances used by the clustering below.
 df <- scale(df)

 # kmeans() starts from randomly chosen centers; fix the seed so the elbow
 # plot and the cluster labels are reproducible from run to run.
 set.seed(123)

 # Determine number of clusters: total within-cluster sum of squares for
 # k = 1..max_k. For k = 1 this is the total sum of squares of the data.
 max_k <- 15
 wss <- numeric(max_k)
 wss[1] <- (nrow(df) - 1) * sum(apply(df, 2, var))
 for (i in 2:max_k) {
   wss[i] <- kmeans(df, centers = i, iter.max = 50)$tot.withinss
 }
 plot(seq_len(max_k), wss, type = "b", xlab = "Number of Clusters",
      ylab = "Within groups sum of squares")

 # Based on the plot, determine the number of clusters (look for the elbow).
 # NOTE(review): the original note suggested ~6, but the script fits 5 here
 # and in the hierarchical section -- confirm which is intended.

 # K-Means Cluster Analysis (same iteration cap as the elbow loop above)
 fit <- kmeans(df, centers = 5, iter.max = 50) # 5 cluster solution

 # get cluster means
 aggregate(df, by = list(fit$cluster), FUN = mean)

 # Cluster Plot against 1st 2 principal components
 # vary parameters for most readable graph
 library(cluster)
 clusplot(df, fit$cluster, color = TRUE, shade = TRUE,
          labels = 2, lines = 0)

 # Append cluster assignment in a separate data frame. Writing it back into
 # df would turn the labels into an input feature for the plot above and for
 # any later clustering run on df.
 df_clustered <- data.frame(df, fit.cluster = fit$cluster)

 #----------------------------------------------
 # hierarchical clustering
 #----------------------------------------------

 # Ward Hierarchical Clustering
 # Leave out a k-means label column (fit.cluster) if df carries one, so
 # cluster labels are not treated as a feature in the distances.
 features <- df[, colnames(df) != "fit.cluster", drop = FALSE]
 d <- dist(features, method = "euclidean") # distance matrix
 # hclust() renamed "ward" to "ward.D"; "ward.D2" is the variant that
 # implements Ward's criterion for unsquared Euclidean distances such as d.
 fit <- hclust(d, method = "ward.D2")
 plot(fit) # display dendrogram
 groups <- cutree(fit, k = 5) # cut tree into 5 clusters
 # draw dendrogram with red borders around the 5 clusters
 rect.hclust(fit, k = 5, border = "red")

 #--------------------------------------
 # Model Based Clustering
 #-------------------------------------

 library(mclust)
 # Leave out a k-means label column (fit.cluster) if df carries one, so the
 # mixture model is fitted to the measured variables only.
 features <- df[, colnames(df) != "fit.cluster", drop = FALSE]
 fit <- Mclust(features) # fit mixture models and keep the selected one
 plot(fit) # plot results
 summary(fit) # display the best model
	# *-----------------------------------------------------------------
	# | PROGRAM NAME: ex cluster analysis.R
	# | DATE: 3/6/20
	# | CREATED BY: MATT BOGARD
	# | PROJECT FILE:
	# *----------------------------------------------------------------
	# | PURPOSE: basic mechanics of cluster analysis
	# *----------------------------------------------------------------


	### motivated by quick R https://www.statmethods.net/advstats/cluster.html

	# rm(list=ls()) # get rid of any existing data

	# Session options: print numbers in fixed notation (scipen) with 4
	# significant digits instead of R's default scientific notation.
	options(scipen = 100, digits = 4)

	library(psych) # use data from psych library (bfi)

	# Prepare Data
	# Keep the first 24 columns of bfi as clustering variables. The earlier
	# `df = bfi` was overwritten immediately and has been dropped.
	# NOTE(review): bfi presumably holds 25 item columns, so 1:24 leaves out the
	# last item -- confirm this is intended.
	df <- bfi[, 1:24]
	df <- na.omit(df) # listwise deletion of rows with missing values

	#---------------------------------
	# kmeans clustering
	#--------------------------------

	# Standardize variables so each one contributes on the same scale to the
	# Euclidean distances used by the clustering below.
	df <- scale(df)

	# kmeans() starts from randomly chosen centers; fix the seed so the elbow
	# plot and the cluster labels are reproducible from run to run.
	set.seed(123)

	# Determine number of clusters: total within-cluster sum of squares for
	# k = 1..max_k. For k = 1 this is the total sum of squares of the data.
	max_k <- 15
	wss <- numeric(max_k)
	wss[1] <- (nrow(df) - 1) * sum(apply(df, 2, var))
	for (i in 2:max_k) {
	  wss[i] <- kmeans(df, centers = i, iter.max = 50)$tot.withinss
	}
	plot(seq_len(max_k), wss, type = "b", xlab = "Number of Clusters",
	     ylab = "Within groups sum of squares")

	# Based on the plot, determine the number of clusters (look for the elbow).
	# NOTE(review): the original note suggested ~6, but the script fits 5 here
	# and in the hierarchical section -- confirm which is intended.

	# K-Means Cluster Analysis (same iteration cap as the elbow loop above)
	fit <- kmeans(df, centers = 5, iter.max = 50) # 5 cluster solution

	# get cluster means
	aggregate(df, by = list(fit$cluster), FUN = mean)

	# Cluster Plot against 1st 2 principal components
	# vary parameters for most readable graph
	library(cluster)
	clusplot(df, fit$cluster, color = TRUE, shade = TRUE,
	         labels = 2, lines = 0)

	# Append cluster assignment in a separate data frame. Writing it back into
	# df would turn the labels into an input feature for the plot above and for
	# any later clustering run on df.
	df_clustered <- data.frame(df, fit.cluster = fit$cluster)

	#----------------------------------------------
	# hierarchical clustering
	#----------------------------------------------

	# Ward Hierarchical Clustering
	# Leave out a k-means label column (fit.cluster) if df carries one, so
	# cluster labels are not treated as a feature in the distances.
	features <- df[, colnames(df) != "fit.cluster", drop = FALSE]
	d <- dist(features, method = "euclidean") # distance matrix
	# hclust() renamed "ward" to "ward.D"; "ward.D2" is the variant that
	# implements Ward's criterion for unsquared Euclidean distances such as d.
	fit <- hclust(d, method = "ward.D2")
	plot(fit) # display dendrogram
	groups <- cutree(fit, k = 5) # cut tree into 5 clusters
	# draw dendrogram with red borders around the 5 clusters
	rect.hclust(fit, k = 5, border = "red")

	#--------------------------------------
	# Model Based Clustering
	#-------------------------------------

	library(mclust)
	# Leave out a k-means label column (fit.cluster) if df carries one, so the
	# mixture model is fitted to the measured variables only.
	features <- df[, colnames(df) != "fit.cluster", drop = FALSE]
	fit <- Mclust(features) # fit mixture models and keep the selected one
	plot(fit) # plot results
	summary(fit) # display the best model