BioSciEconomist · March 11, 2020 15:36
diff --git a/ex factor analysis.R b/ex factor analysis.R
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: ex factor analysis.R
 # | DATE: 3/6/20 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:        
 # *----------------------------------------------------------------
 # | PURPOSE: basic mechanics of factor analysis
 # *----------------------------------------------------------------


 # rm(list=ls()) # get rid of any existing data 

 options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

 library(psych)

 #---------------------------------
 # example 1 basic cluster analysis
 #---------------------------------

 #Loading the dataset
 bfi_data = bfi

 # Remove rows with missing values and keep only complete cases

 bfi_data=bfi_data[complete.cases(bfi_data),]

 # write.csv(bfi_data,"bfi_data.csv")

 # Create the correlation matrix from bfi_data
 bfi_cor <- cor(bfi_data)

 #Factor analysis of the data
 factors_data <- fa(r = bfi_cor, nfactors = 6, scores = TRUE)
 print(factors_data)


 #---------------------------------
 # example 2 adding factor scores to data set
 #---------------------------------

 df <- bfi[,1:24]                 #select variables to include in fa
 fit <- fa(df, 2)                 #estimate model with 2 factors
 fs <- factor.scores(df, fit)     #obtain factor scores
 fs <- fs$scores                   #get the columns of factor scores for each case
 df <- cbind(df,fs)              #append factor scores to dataset 


 #---------------------------------
 # example 3 adding factor scores to data set
 #---------------------------------

 df <- bfi[,1:24]                                    #select variables to include in fa
 df <- df[complete.cases(df),]                       #get complete cases
 df_cor <- cor(df)                                   # compute correlation matrix
 fit <- fa(r = df_cor, nfactors = 6, scores = TRUE)  # factor analysis
 fs <- factor.scores(df, fit)                        #obtain factor scores
 fs <- fs$scores                                     #get the columns of factor scores for each case
 df <- cbind(df,fs)                                  #append factor scores to dataset 


 #----------------------------------
 # example 4 rotation and plots
 #--------------------------------


 # Maximum Likelihood Factor Analysis
 # with varimax rotation

 df <- bfi[,1:24]                                    #select variables to include in fa
 df <- df[complete.cases(df),]                       #get complete cases
 fit <- factanal(df, 3, rotation="varimax")          # perform fa
 print(fit, digits=2, cutoff=.3, sort=TRUE)          # print 'easy to read'  loadings
 # plot factor 1 by factor 2
 load <- fit$loadings[,1:2]
 plot(load,type="n") # set up plot
 text(load,labels=names(df),cex=.7) # add variable names


 #--------------------------------
 # example 5 determining # of factors
 #------------------------------


 library(nFactors)

 df <- bfi[,1:24]  #select variables to include in fa
 df <- df[complete.cases(df),]  #get complete cases
 ev <- eigen(cor(df)) # get eigenvalues
 ap <- parallel(subject=nrow(df),var=ncol(df),
               rep=100,cent=.05)
 nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
 plotnScree(nS)



 #---------------------------------
 # example 6 end to end analysis with clustering
 #--------------------------------

 # rm(list=ls()) # get rid of any existing data 

 # step 1 determine number of factors in solution

 library(nFactors)

 df <- bfi[,1:24]  #select variables to include in fa
 df <- df[complete.cases(df),]  #get complete cases
 ev <- eigen(cor(df)) # get eigenvalues
 ap <- parallel(subject=nrow(df),var=ncol(df),
               rep=100,cent=.05)
 nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
 plotnScree(nS)

 # from this result ~ 6 factors

 # step 2 apply fa and add to dataset

 df <- bfi[,1:24]                                    #select variables to include in fa
 df <- df[complete.cases(df),]                       #get complete cases
 df_cor <- cor(df)                                   # compute correlation matrix
 fit <- fa(r = df_cor, nfactors = 6, scores = TRUE)  # factor analysis
 print(fit, digits=2, cutoff=.3, sort=TRUE)          # print 'easy to read'  loadings
 fs <- factor.scores(df, fit)                        #obtain factor scores
 fs <- fs$scores                                     #get the columns of factor scores for each case
 df <- cbind(df,fs)                                  #append factor scores to dataset 


 # step 3  apply cluster analysis to the factors

 tmp <- df[c("MR1","MR2","MR3","MR4","MR5","MR6")]
 tmp <- scale(tmp) # standardize variables

 # Determine number of clusters
 wss <- (nrow(df)-1)*sum(apply(tmp,2,var))
 for (i in 2:15) wss[i] <- sum(kmeans(tmp,
                                     centers=i,iter.max = 50)$withinss)
 plot(1:15, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares")

 # based on plot determine # of clusters (~ 6)

 # K-Means Cluster Analysis
 fit <- kmeans(tmp, 5) # 5 cluster solution

 # get cluster means
 aggregate(tmp,by=list(fit$cluster),FUN=mean)

 # append cluster assignment
 df <- data.frame(df, fit$cluster)

 #-----------------------------------
 # calculate principal component loadings by hand
 #----------------------------------

 library(psych)

 #Loading the dataset
 bfi_data = bfi

 # Remove rows with missing values and keep only complete cases

 bfi_data=bfi_data[complete.cases(bfi_data),]

 # write.csv(bfi_data,"bfi_data.csv")

 # Create the correlation matrix from bfi_data
 bfi_cor <- cor(bfi_data)

 # perform PCA
 pc_bfi <- principal(bfi_cor, nfactors  = 28, rotate = "none")
 pc_bfi$loadings


 # calculate eigenvectors and values 'by hand'
 ev <- eigen(bfi_cor) # calculate eigenvalues and vectors
 values <- ev$values # get eigenvalues
 vectors <- ev$vectors # get eigenvectors

 # loadings = vector values * sqrt(eigenvalues)
 #ex: calculate loadings for 1st PC

 (vectors[,1]*sqrt(values[1])) # 1st loading

 pc_bfi$loadings # compare to results above from pca routine
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: ex factor analysis.R
	# \| DATE: 3/6/20
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: basic mechanics of factor analysis
	# *----------------------------------------------------------------


	# rm(list=ls()) # get rid of any existing data

	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

	library(psych)

	#---------------------------------
	# example 1 basic cluster analysis
	#---------------------------------

	#Loading the dataset
	bfi_data = bfi

	# Remove rows with missing values and keep only complete cases

	bfi_data=bfi_data[complete.cases(bfi_data),]

	# write.csv(bfi_data,"bfi_data.csv")

	# Create the correlation matrix from bfi_data
	bfi_cor <- cor(bfi_data)

	#Factor analysis of the data
	factors_data <- fa(r = bfi_cor, nfactors = 6, scores = TRUE)
	print(factors_data)


	#---------------------------------
	# example 2 adding factor scores to data set
	#---------------------------------

	df <- bfi[,1:24] #select variables to include in fa
	fit <- fa(df, 2) #estimate model with 2 factors
	fs <- factor.scores(df, fit) #obtain factor scores
	fs <- fs$scores #get the columns of factor scores for each case
	df <- cbind(df,fs) #append factor scores to dataset


	#---------------------------------
	# example 3 adding factor scores to data set
	#---------------------------------

	df <- bfi[,1:24] #select variables to include in fa
	df <- df[complete.cases(df),] #get complete cases
	df_cor <- cor(df) # compute correlation matrix
	fit <- fa(r = df_cor, nfactors = 6, scores = TRUE) # factor analysis
	fs <- factor.scores(df, fit) #obtain factor scores
	fs <- fs$scores #get the columns of factor scores for each case
	df <- cbind(df,fs) #append factor scores to dataset


	#----------------------------------
	# example 4 rotation and plots
	#--------------------------------


	# Maximum Likelihood Factor Analysis
	# with varimax rotation

	df <- bfi[,1:24] #select variables to include in fa
	df <- df[complete.cases(df),] #get complete cases
	fit <- factanal(df, 3, rotation="varimax") # perform fa
	print(fit, digits=2, cutoff=.3, sort=TRUE) # print 'easy to read' loadings
	# plot factor 1 by factor 2
	load <- fit$loadings[,1:2]
	plot(load,type="n") # set up plot
	text(load,labels=names(df),cex=.7) # add variable names


	#--------------------------------
	# example 5 determining # of factors
	#------------------------------


	library(nFactors)

	df <- bfi[,1:24] #select variables to include in fa
	df <- df[complete.cases(df),] #get complete cases
	ev <- eigen(cor(df)) # get eigenvalues
	ap <- parallel(subject=nrow(df),var=ncol(df),
	rep=100,cent=.05)
	nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
	plotnScree(nS)



	#---------------------------------
	# example 6 end to end analysis with clustering
	#--------------------------------

	# rm(list=ls()) # get rid of any existing data

	# step 1 determine number of factors in solution

	library(nFactors)

	df <- bfi[,1:24] #select variables to include in fa
	df <- df[complete.cases(df),] #get complete cases
	ev <- eigen(cor(df)) # get eigenvalues
	ap <- parallel(subject=nrow(df),var=ncol(df),
	rep=100,cent=.05)
	nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
	plotnScree(nS)

	# from this result ~ 6 factors

	# step 2 apply fa and add to dataset

	df <- bfi[,1:24] #select variables to include in fa
	df <- df[complete.cases(df),] #get complete cases
	df_cor <- cor(df) # compute correlation matrix
	fit <- fa(r = df_cor, nfactors = 6, scores = TRUE) # factor analysis
	print(fit, digits=2, cutoff=.3, sort=TRUE) # print 'easy to read' loadings
	fs <- factor.scores(df, fit) #obtain factor scores
	fs <- fs$scores #get the columns of factor scores for each case
	df <- cbind(df,fs) #append factor scores to dataset


	# step 3 apply cluster analysis to the factors

	tmp <- df[c("MR1","MR2","MR3","MR4","MR5","MR6")]
	tmp <- scale(tmp) # standardize variables

	# Determine number of clusters
	wss <- (nrow(df)-1)*sum(apply(tmp,2,var))
	for (i in 2:15) wss[i] <- sum(kmeans(tmp,
	centers=i,iter.max = 50)$withinss)
	plot(1:15, wss, type="b", xlab="Number of Clusters",
	ylab="Within groups sum of squares")

	# based on plot determine # of clusters (~ 6)

	# K-Means Cluster Analysis
	fit <- kmeans(tmp, 5) # 5 cluster solution

	# get cluster means
	aggregate(tmp,by=list(fit$cluster),FUN=mean)

	# append cluster assignment
	df <- data.frame(df, fit$cluster)

	#-----------------------------------
	# calculate principal component loadings by hand
	#----------------------------------

	library(psych)

	#Loading the dataset
	bfi_data = bfi

	# Remove rows with missing values and keep only complete cases

	bfi_data=bfi_data[complete.cases(bfi_data),]

	# write.csv(bfi_data,"bfi_data.csv")

	# Create the correlation matrix from bfi_data
	bfi_cor <- cor(bfi_data)

	# perform PCA
	pc_bfi <- principal(bfi_cor, nfactors = 28, rotate = "none")
	pc_bfi$loadings


	# calculate eigenvectors and values 'by hand'
	ev <- eigen(bfi_cor) # calculate eigenvalues and vectors
	values <- ev$values # get eigenvalues
	vectors <- ev$vectors # get eigenvectors

	# loadings = vector values * sqrt(eigenvalues)
	#ex: calculate loadings for 1st PC

	(vectors[,1]*sqrt(values[1])) # 1st loading

	pc_bfi$loadings # compare to results above from pca routine