Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active March 11, 2020 15:36
Show Gist options
  • Save BioSciEconomist/1fdcda1fbabca9100a5f0ed8a08826a1 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/1fdcda1fbabca9100a5f0ed8a08826a1 to your computer and use it in GitHub Desktop.
Basic factor analysis examples
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex factor analysis.R
# | DATE: 3/6/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: basic mechanics of factor analysis
# *----------------------------------------------------------------
# rm(list=ls()) # get rid of any existing data
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
library(psych)
#---------------------------------
# example 1 basic cluster analysis
#---------------------------------
#Loading the dataset
bfi_data = bfi
# Remove rows with missing values and keep only complete cases
bfi_data=bfi_data[complete.cases(bfi_data),]
# write.csv(bfi_data,"bfi_data.csv")
# Create the correlation matrix from bfi_data
bfi_cor <- cor(bfi_data)
#Factor analysis of the data
factors_data <- fa(r = bfi_cor, nfactors = 6, scores = TRUE)
print(factors_data)
#---------------------------------
# example 2 adding factor scores to data set
#---------------------------------
df <- bfi[,1:24] #select variables to include in fa
fit <- fa(df, 2) #estimate model with 2 factors
fs <- factor.scores(df, fit) #obtain factor scores
fs <- fs$scores #get the columns of factor scores for each case
df <- cbind(df,fs) #append factor scores to dataset
#---------------------------------
# example 3 adding factor scores to data set
#---------------------------------
df <- bfi[,1:24] #select variables to include in fa
df <- df[complete.cases(df),] #get complete cases
df_cor <- cor(df) # compute correlation matrix
fit <- fa(r = df_cor, nfactors = 6, scores = TRUE) # factor analysis
fs <- factor.scores(df, fit) #obtain factor scores
fs <- fs$scores #get the columns of factor scores for each case
df <- cbind(df,fs) #append factor scores to dataset
#----------------------------------
# example 4 rotation and plots
#--------------------------------
# Maximum Likelihood Factor Analysis
# with varimax rotation
df <- bfi[,1:24] #select variables to include in fa
df <- df[complete.cases(df),] #get complete cases
fit <- factanal(df, 3, rotation="varimax") # perform fa
print(fit, digits=2, cutoff=.3, sort=TRUE) # print 'easy to read' loadings
# plot factor 1 by factor 2
load <- fit$loadings[,1:2]
plot(load,type="n") # set up plot
text(load,labels=names(df),cex=.7) # add variable names
#--------------------------------
# example 5 determining # of factors
#------------------------------
library(nFactors)
df <- bfi[,1:24] #select variables to include in fa
df <- df[complete.cases(df),] #get complete cases
ev <- eigen(cor(df)) # get eigenvalues
ap <- parallel(subject=nrow(df),var=ncol(df),
rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)
#---------------------------------
# example 6 end to end analysis with clustering
#--------------------------------
# rm(list=ls()) # get rid of any existing data
# step 1 determine number of factors in solution
library(nFactors)
df <- bfi[,1:24] #select variables to include in fa
df <- df[complete.cases(df),] #get complete cases
ev <- eigen(cor(df)) # get eigenvalues
ap <- parallel(subject=nrow(df),var=ncol(df),
rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)
# from this result ~ 6 factors
# step 2 apply fa and add to dataset
df <- bfi[,1:24] #select variables to include in fa
df <- df[complete.cases(df),] #get complete cases
df_cor <- cor(df) # compute correlation matrix
fit <- fa(r = df_cor, nfactors = 6, scores = TRUE) # factor analysis
print(fit, digits=2, cutoff=.3, sort=TRUE) # print 'easy to read' loadings
fs <- factor.scores(df, fit) #obtain factor scores
fs <- fs$scores #get the columns of factor scores for each case
df <- cbind(df,fs) #append factor scores to dataset
# step 3 apply cluster analysis to the factors
tmp <- df[c("MR1","MR2","MR3","MR4","MR5","MR6")]
tmp <- scale(tmp) # standardize variables
# Determine number of clusters
wss <- (nrow(df)-1)*sum(apply(tmp,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(tmp,
centers=i,iter.max = 50)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
# based on plot determine # of clusters (~ 6)
# K-Means Cluster Analysis
fit <- kmeans(tmp, 5) # 5 cluster solution
# get cluster means
aggregate(tmp,by=list(fit$cluster),FUN=mean)
# append cluster assignment
df <- data.frame(df, fit$cluster)
#-----------------------------------
# calculate principal component loadings by hand
#----------------------------------
library(psych)
#Loading the dataset
bfi_data = bfi
# Remove rows with missing values and keep only complete cases
bfi_data=bfi_data[complete.cases(bfi_data),]
# write.csv(bfi_data,"bfi_data.csv")
# Create the correlation matrix from bfi_data
bfi_cor <- cor(bfi_data)
# perform PCA
pc_bfi <- principal(bfi_cor, nfactors = 28, rotate = "none")
pc_bfi$loadings
# calculate eigenvectors and values 'by hand'
ev <- eigen(bfi_cor) # calculate eigenvalues and vectors
values <- ev$values # get eigenvalues
vectors <- ev$vectors # get eigenvectors
# loadings = vector values * sqrt(eigenvalues)
#ex: calculate loadings for 1st PC
(vectors[,1]*sqrt(values[1])) # 1st loading
pc_bfi$loadings # compare to results above from pca routine
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment