Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active March 11, 2020 23:08
Show Gist options
  • Save BioSciEconomist/97dcadf99803fbfdcc69dc44c8e8ef24 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/97dcadf99803fbfdcc69dc44c8e8ef24 to your computer and use it in GitHub Desktop.
ex cluster analysis.R
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex cluster analysis.R
# | DATE: 3/6/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: basic mechanics of cluster analysis
# *----------------------------------------------------------------
### motivated by quick R https://www.statmethods.net/advstats/cluster.html
# rm(list=ls()) # get rid of any existing data
# Show plain decimals instead of scientific notation and limit printed digits
options(scipen = 100, digits = 4)

library(psych) # provides the bfi example data set

# Prepare data
# Per the psych documentation, bfi stores the 25 personality items
# (A1-A5, C1-C5, E1-E5, N1-N5, O1-O5) in its first 25 columns, followed by
# gender, education and age. The earlier selection 1:24 silently dropped O5.
item_cols <- 1:25
df <- bfi[, item_cols] # keep only the personality items for clustering
df <- na.omit(df)      # listwise deletion of rows with any missing item
#---------------------------------
# kmeans clustering
#--------------------------------
library(cluster) # provides clusplot()

# kmeans() starts from randomly chosen centers; fix the seed so the elbow
# plot and the cluster assignments are reproducible between runs.
set.seed(1234)

df <- scale(df) # standardize variables (result is a numeric matrix)

# Determine number of clusters: within-groups sum of squares for k = 1..max_k
max_k <- 15
wss <- numeric(max_k)
wss[1] <- (nrow(df) - 1) * sum(apply(df, 2, var)) # k = 1: total sum of squares
for (k in 2:max_k) {
  wss[k] <- sum(kmeans(df, centers = k, iter.max = 50)$withinss)
}
plot(seq_len(max_k), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Inspect the elbow in the plot to choose the number of clusters
# (the author's note suggested ~6; this script proceeds with 5).
n_clusters <- 5

# K-Means Cluster Analysis
# iter.max matches the elbow loop above so the final fit is given the same
# chance to converge as the fits used to pick k.
fit <- kmeans(df, centers = n_clusters, iter.max = 50)

# get cluster means (printed explicitly so they also show under source())
cluster_means <- aggregate(df, by = list(fit$cluster), FUN = mean)
print(cluster_means)

# Append cluster assignment in a separate data frame. `df` itself stays the
# pure standardized feature matrix so the label is never mistaken for a
# feature by the plot below or by later clustering steps.
df_clustered <- data.frame(df, fit.cluster = fit$cluster)

# Cluster Plot against 1st 2 principal components
# vary parameters for most readable graph
clusplot(df, fit$cluster, color = TRUE, shade = TRUE,
         labels = 2, lines = 0)
#----------------------------------------------
# hierarchical clustering
#----------------------------------------------
# Ward Hierarchical Clustering

# Use only the feature columns: if a k-means label column named
# "fit.cluster" has been appended to df, it must not enter the distances.
hc_vars <- setdiff(colnames(df), "fit.cluster")
d <- dist(df[, hc_vars, drop = FALSE], method = "euclidean") # distance matrix

# "ward" is a deprecated alias of "ward.D". On unsquared Euclidean distances
# the hclust documentation identifies "ward.D2" as the method that implements
# Ward's criterion, so it is used here.
fit <- hclust(d, method = "ward.D2")
plot(fit) # display dendrogram

n_groups <- 5
groups <- cutree(fit, k = n_groups) # cut tree into 5 clusters

# draw dendrogram with red borders around the 5 clusters
rect.hclust(fit, k = n_groups, border = "red")
#--------------------------------------
# Model Based Clustering
#-------------------------------------
library(mclust)

# Fit on the feature columns only: a previously appended k-means label
# column ("fit.cluster"), if present, is excluded so it cannot drive the
# mixture model.
mb_vars <- setdiff(colnames(df), "fit.cluster")
fit <- Mclust(df[, mb_vars, drop = FALSE])

plot(fit)           # plot results
print(summary(fit)) # display the best model (explicit print works under source())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment