Last active
March 11, 2020 23:08
-
-
Save BioSciEconomist/97dcadf99803fbfdcc69dc44c8e8ef24 to your computer and use it in GitHub Desktop.
ex cluster analysis.R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex cluster analysis.R
# | DATE: 3/6/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: basic mechanics of cluster analysis
# *----------------------------------------------------------------
### motivated by quick R https://www.statmethods.net/advstats/cluster.html
# rm(list=ls()) # get rid of any existing data | |
# Print numbers in fixed notation (large scipen penalty) with 4 significant
# digits, overriding R's tendency to switch to scientific notation.
options(scipen = 100, digits = 4)

library(psych) # provides the bfi data set used below

# Prepare data ----
# Keep the first 24 columns of bfi as the clustering variables.
# NOTE(review): bfi is documented as holding 25 personality items followed by
# demographics; 1:24 leaves the last item out -- confirm whether 1:25 was meant.
df <- bfi[, 1:24]

# Listwise deletion: drop every row that has at least one missing value.
df <- na.omit(df)
#---------------------------------
# kmeans clustering
#--------------------------------
# Standardize every variable to mean 0 / sd 1 so no single item dominates the
# Euclidean distances. scale() returns a numeric matrix.
df <- scale(df)

# kmeans() starts from randomly chosen centers; fix the seed so the elbow plot
# and the cluster labels are reproducible from run to run.
set.seed(1234)

# Determine number of clusters: total within-cluster sum of squares for
# k = 1..max_k. For k = 1 this is the total sum of squares about the column
# means, computed directly from the column variances.
max_k <- 15
wss_single <- (nrow(df) - 1) * sum(apply(df, 2, var))
wss_multi <- vapply(
  2:max_k,
  # nstart > 1 keeps the best of several random starts, which smooths the curve
  function(k) kmeans(df, centers = k, iter.max = 50, nstart = 10)$tot.withinss,
  numeric(1)
)
wss <- c(wss_single, wss_multi)
plot(seq_len(max_k), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Based on the plot, determine the number of clusters. This script fits 5.
# NOTE(review): the original note here read "~ 6" while the code fit 5 --
# confirm which value the elbow plot actually supports.
n_clusters <- 5

# K-Means Cluster Analysis
fit <- kmeans(df, centers = n_clusters, iter.max = 50, nstart = 10)

# Cluster means of each (standardized) variable; print() so the table is also
# shown when the script is run through source().
print(aggregate(df, by = list(fit$cluster), FUN = mean))

# Append the cluster assignment in a separate data frame. df itself stays the
# scaled feature matrix, so the label is never mistaken for an input variable
# by any later analysis that reads df.
df_clustered <- data.frame(df, fit.cluster = fit$cluster)

# Cluster plot against the first 2 principal components
# vary parameters for most readable graph
library(cluster)
clusplot(df, fit$cluster, color = TRUE, shade = TRUE,
         labels = 2, lines = 0)
#----------------------------------------------
# hierarchical clustering
#----------------------------------------------
# Ward Hierarchical Clustering

# If a k-means label column named "fit.cluster" has been appended to df, leave
# it out: a cluster label must not enter the distance computation as a feature.
features <- df[, setdiff(colnames(df), "fit.cluster"), drop = FALSE]

d <- dist(features, method = "euclidean") # distance matrix

# "ward" is only the legacy alias of "ward.D". On unsquared Euclidean
# distances, Ward's minimum-variance criterion is implemented by "ward.D2".
fit <- hclust(d, method = "ward.D2")
plot(fit) # display dendrogram

# Cut the tree into 5 clusters and outline them on the dendrogram in red.
n_groups <- 5
groups <- cutree(fit, k = n_groups)
rect.hclust(fit, k = n_groups, border = "red")
#--------------------------------------
# Model Based Clustering
#-------------------------------------
library(mclust)

# If a k-means label column named "fit.cluster" has been appended to df, leave
# it out so the mixture model is fitted to the measured variables only.
features <- df[, setdiff(colnames(df), "fit.cluster"), drop = FALSE]

fit <- Mclust(features)
plot(fit) # plot results

# Display the best model; print() so the summary also appears under source().
print(summary(fit))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment