padpadpadpad · July 24, 2024 16:59
diff --git a/vegan_PCA_ggplot_example.R b/vegan_PCA_ggplot_example.R
 # creating pretty plots from prcomp and vegan in R

 # clear workspace
 mise::mise(vars = TRUE, pkgs = TRUE, console = TRUE, figs = TRUE)

 # can do multiple comparisons of adonis factors
 # devtools::install_github('leffj/mctoolsr')

 # load in packages
 library(vegan)
 library(ggplot2)
 library(dplyr)
 library(tidyr)
 library(magrittr)
 library(mctoolsr)
 library(broom)

 # load in example data from vegan
 data(varespec)

 # First 16 sites grazed, remaining 8 sites ungrazed
 # create a metadata dataset
 # creating a column to identify each row of the varespec data with
 d_vars <- data.frame(groups = as.character(c(rep('grazed',16), rep('ungrazed',8))),
                     row = row.names(varespec))

 # plotting a PCA in ggplot2
 PCA <- prcomp(varespec)

 # view summary stats of PCA
 plot(PCA, type = "l")
 summary(PCA)

 # view biplot
 biplot(PCA)

 # These are not very good!!!!

 # can extract all the data using broom
 # get PCA summary
 d_PCA_sum <- broom::tidy(PCA, matrix = 'pcs')

 # get PCA loadings
 d_PCA_load <- broom::tidy(PCA, matrix = 'v') %>%
  mutate(., PC = paste('PC_', PC, sep = '')) %>%
  spread(., PC, value)

 # subset for just top 15 loadings
 # if you have a lot of genes this is advisable otherwise you will have so much text!
 # I calculate the distance of each point in the PC1 and PC2 from 0,0 and subset those

 # function to get distance from 00
 dist_from_00 <- function(x, y){
  return(sqrt((0 - x)^2+(0-y)^2))
 }

 top_15_loadings <- mutate(d_PCA_load, distance = dist_from_00(PC_1, PC_2)) %>%
  top_n(., 15, distance)

 # get PCA points
 d_PCA_points <- broom::tidy(PCA, matrix = 'samples') %>%
  mutate(., row = as.character(row),
         PC = paste('PC_', PC, sep = '')) %>%
  merge(., d_vars, by = 'row') %>%
  spread(., PC, value)

 # fancy ggplot of PCA
 ggplot() +
  geom_text(aes(PC_1, PC_2, label = column, hjust = 0.5*(1 - sign(PC_1)), vjust = 0.5*(1-sign(PC_2))), d_PCA_load, col = 'red4') +
  geom_segment(aes(x = 0, y = 0, yend = PC_2, xend = PC_1, group = column), d_PCA_load, arrow = arrow(length = unit(0.01, "npc")), col = 'red4') +
  geom_point(aes(PC_1, PC_2, col = groups), d_PCA_points) +
  theme_bw(base_size = 12, base_family = 'Helvetica') +
  coord_fixed(sqrt(d_PCA_sum$percent[2]/d_PCA_sum$percent[1])) +
  stat_ellipse(aes(PC_1, PC_2, col = groups, group = groups), d_PCA_points, type = "t") +
  ylab('PCA 2 [25.4%]') +
  xlab('PCA 1 [53.8%]') +
  scale_fill_manual('', values = c('black', 'grey'), labels = c('Grazed', 'Ungrazed')) +
  scale_color_manual('', values = c('black', 'grey'), labels = c('Grazed', 'Ungrazed')) +
  theme(legend.position = c(0.85, 0.85))

 # this does not work too well for this example dataset but will work better for others!

 # do adonis and multiple contrasts
 # distance matrices
 Euclid_matrix <- dist(varespec)

 # make some character vectors factors
 d_vars <- mutate_at(d_vars, c('groups'), as.factor)
 row.names(d_vars) <- d_vars$row

 # adonis
 mod1_euclid <- adonis(Euclid_matrix ~ groups, data = d_vars)

 # do pairwise contrasts - look at different ways of doing this! ####
 # using mctoolsr from GitHub - this looks good
 # NOTE - need to make sure that variable dataset has rownames equivalent to the distance matrix
 euclid_contrasts <- mctoolsr::calc_pairwise_permanovas(Euclid_matrix, d_vars, 'groups')
 # obviously a bit redundant when there are only two levels
 # make a column of all unique combinations of factors and insert instead of "groups"

 # betadisper() analysis and plotting in ggplot2 ####

 # Looking at variance across groups ####

 # can only do one factor would use every combination of id
 mod1_dispers <- betadisper(Euclid_matrix, d_vars$groups)

 # plot of model
 plot(mod1_dispers)
 boxplot(mod1_dispers)

 # anova
 anova(mod1_dispers)

 # Permutation test for F
 pmod <- permutest(mod1_dispers, pairwise = TRUE)

 # Tukey's Honest Significant Differences
 T_HSD <- TukeyHSD(mod1_dispers)

 # get betadisper dataframes ####
 # have written functions to grab the necessary data from the betadisper object

 # functions ####
 # getting distances from betadisper() object
 betadisper_distances <- function(model){
  temp <- data.frame(group = model$group)
  temp2 <- data.frame(distances = unlist(model$distances))
  temp2$sample <- row.names(temp2)
  temp <- cbind(temp, temp2)
  temp <- dplyr::select(temp, group, sample, dplyr::everything())
  row.names(temp) <- NULL
  return(temp)
 }

 # getting eigenvalues out of betadisper() object
 betadisper_eigenvalue <- function(model){
  temp <- data.frame(eig = unlist(model$eig))
  temp$PCoA <- row.names(temp)
  row.names(temp) <- NULL
  return(temp)
 }

 # getting the eigenvectors out of a betadisper() object
 betadisper_eigenvector <- function(model){
  temp <- data.frame(group = model$group)
  temp2 <- data.frame(unlist(model$vectors))
  temp2$sample <- row.names(temp2)
  temp <- cbind(temp, temp2)
  temp <- dplyr::select(temp, group, sample, dplyr::everything())
  row.names(temp) <- NULL
  return(temp)
 }

 # get centroids
 betadisper_centroids <- function(model){
  temp <- data.frame(unlist(model$centroids))
  temp$group <- row.names(temp)
  temp <- dplyr::select(temp, group, dplyr::everything())
  row.names(temp) <- NULL
  return(temp)
 }

 # betadisper data
 get_betadisper_data <- function(model){
  temp <- list(distances = betadisper_distances(model),
               eigenvalue = betadisper_eigenvalue(model),
               eigenvector = betadisper_eigenvector(model),
               centroids = betadisper_centroids(model))
  return(temp)
 }

 # get betadisper data ####
 betadisper_dat <- get_betadisper_data(mod1_dispers)

 # do some transformations on the data
 betadisper_dat$eigenvalue <- mutate(betadisper_dat$eigenvalue, percent = eig/sum(eig))

 # add convex hull points ####
 # this could be put in a function
 betadisper_dat$chull <- group_by(betadisper_dat$eigenvector, group) %>%
  do(data.frame(PCoA1 = .$PCoA1[c(chull(.$PCoA1, .$PCoA2), chull(.$PCoA1, .$PCoA2)[1])],
                PCoA2 = .$PCoA2[c(chull(.$PCoA1, .$PCoA2), chull(.$PCoA1, .$PCoA2)[1])])) %>%
  data.frame()

 # combine centroid and eigenvector dataframes for plotting
 betadisper_lines <- merge(select(betadisper_dat$centroids, group, PCoA1, PCoA2), select(betadisper_dat$eigenvector, group, PCoA1, PCoA2), by = c('group'))

 # Now the dataframes are all ready to be completely customisable in ggplot
 # plot betadispersion plot
 ggplot() +
  geom_point(aes(PCoA1, PCoA2, col = group), betadisper_dat$centroids, size = 4) +
  geom_point(aes(PCoA1, PCoA2, col = group), betadisper_dat$eigenvector) +
  geom_path(aes(PCoA1, PCoA2, col = group, group = group), betadisper_dat$chull ) +
  geom_segment(aes(x = PCoA1.x, y = PCoA2.x, yend = PCoA2.y, xend = PCoA1.y, group = row.names(betadisper_lines), col = group), betadisper_lines) +
  theme_bw(base_size = 12, base_family = 'Helvetica') +
  ylab('PCoA Axis 2 [25.4%]') +
  xlab('PCoA Axis 2 [53.8%]') +
  scale_color_manual('', values = c('black', 'grey'), labels = c("Grazed", 'Ungrazed')) +
  theme(legend.position = c(0.9, 0.3)) +
  coord_fixed(sqrt(d_PCA_sum$percent[2]/d_PCA_sum$percent[1]))

 # plot distances from centroid
 ggplot(betadisper_dat$distances, aes(group, distances, fill = group, col = group)) +
  geom_boxplot(aes(fill = group, col = group), outlier.shape = NA, width = 0.5, position = position_dodge(width = 0.55)) +
  stat_summary(position = position_dodge(width = 0.55), geom = 'crossbar', fatten = 0, color = 'white', width = 0.4, fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x)))}) +
  geom_point(aes(group, distances, col = group), shape = 21, fill ='white', position = position_jitterdodge(dodge.width = 0.55, jitter.width = 0.2)) +
  theme_bw(base_size = 12, base_family = 'Helvetica') +
  scale_color_manual('', values = c('black', 'grey'), labels = c("Grazed", 'Ungrazed')) +
  scale_fill_manual('', values = c('black', 'grey'), labels = c("Grazed", 'Ungrazed')) +
  ylab('Distance to centroid') +
  theme(legend.position = 'none') +
  xlab('') +
  scale_x_discrete(labels = c('Grazed', 'Ungrazed'))
	# creating pretty plots from prcomp and vegan in R

	# clear workspace
	mise::mise(vars = TRUE, pkgs = TRUE, console = TRUE, figs = TRUE)

	# can do multiple comparisons of adonis factors
	# devtools::install_github('leffj/mctoolsr')

	# load in packages
	library(vegan)
	library(ggplot2)
	library(dplyr)
	library(tidyr)
	library(magrittr)
	library(mctoolsr)
	library(broom)

	# load in example data from vegan
	data(varespec)

	# First 16 sites grazed, remaining 8 sites ungrazed
	# create a metadata dataset
	# creating a column to identify each row of the varespec data with
	d_vars <- data.frame(groups = as.character(c(rep('grazed',16), rep('ungrazed',8))),
	row = row.names(varespec))

	# plotting a PCA in ggplot2
	PCA <- prcomp(varespec)

	# view summary stats of PCA
	plot(PCA, type = "l")
	summary(PCA)

	# view biplot
	biplot(PCA)

	# These are not very good!!!!

	# can extract all the data using broom
	# get PCA summary
	d_PCA_sum <- broom::tidy(PCA, matrix = 'pcs')

	# get PCA loadings
	d_PCA_load <- broom::tidy(PCA, matrix = 'v') %>%
	mutate(., PC = paste('PC_', PC, sep = '')) %>%
	spread(., PC, value)

	# subset for just top 15 loadings
	# if you have a lot of genes this is advisable otherwise you will have so much text!
	# I calculate the distance of each point in the PC1 and PC2 from 0,0 and subset those

	# function to get distance from 00
	dist_from_00 <- function(x, y){
	return(sqrt((0 - x)^2+(0-y)^2))
	}

	top_15_loadings <- mutate(d_PCA_load, distance = dist_from_00(PC_1, PC_2)) %>%
	top_n(., 15, distance)

	# get PCA points
	d_PCA_points <- broom::tidy(PCA, matrix = 'samples') %>%
	mutate(., row = as.character(row),
	PC = paste('PC_', PC, sep = '')) %>%
	merge(., d_vars, by = 'row') %>%
	spread(., PC, value)

	# fancy ggplot of PCA
	ggplot() +
	geom_text(aes(PC_1, PC_2, label = column, hjust = 0.5(1 - sign(PC_1)), vjust = 0.5(1-sign(PC_2))), d_PCA_load, col = 'red4') +
	geom_segment(aes(x = 0, y = 0, yend = PC_2, xend = PC_1, group = column), d_PCA_load, arrow = arrow(length = unit(0.01, "npc")), col = 'red4') +
	geom_point(aes(PC_1, PC_2, col = groups), d_PCA_points) +
	theme_bw(base_size = 12, base_family = 'Helvetica') +
	coord_fixed(sqrt(d_PCA_sum$percent[2]/d_PCA_sum$percent[1])) +
	stat_ellipse(aes(PC_1, PC_2, col = groups, group = groups), d_PCA_points, type = "t") +
	ylab('PCA 2 [25.4%]') +
	xlab('PCA 1 [53.8%]') +
	scale_fill_manual('', values = c('black', 'grey'), labels = c('Grazed', 'Ungrazed')) +
	scale_color_manual('', values = c('black', 'grey'), labels = c('Grazed', 'Ungrazed')) +
	theme(legend.position = c(0.85, 0.85))

	# this does not work too well for this example dataset but will work better for others!

	# do adonis and multiple contrasts
	# distance matrices
	Euclid_matrix <- dist(varespec)

	# make some character vectors factors
	d_vars <- mutate_at(d_vars, c('groups'), as.factor)
	row.names(d_vars) <- d_vars$row

	# adonis
	mod1_euclid <- adonis(Euclid_matrix ~ groups, data = d_vars)

	# do pairwise contrasts - look at different ways of doing this! ####
	# using mctoolsr from GitHub - this looks good
	# NOTE - need to make sure that variable dataset has rownames equivalent to the distance matrix
	euclid_contrasts <- mctoolsr::calc_pairwise_permanovas(Euclid_matrix, d_vars, 'groups')
	# obviously a bit redundant when there are only two levels
	# make a column of all unique combinations of factors and insert instead of "groups"

	# betadisper() analysis and plotting in ggplot2 ####

	# Looking at variance across groups ####

	# can only do one factor would use every combination of id
	mod1_dispers <- betadisper(Euclid_matrix, d_vars$groups)

	# plot of model
	plot(mod1_dispers)
	boxplot(mod1_dispers)

	# anova
	anova(mod1_dispers)

	# Permutation test for F
	pmod <- permutest(mod1_dispers, pairwise = TRUE)

	# Tukey's Honest Significant Differences
	T_HSD <- TukeyHSD(mod1_dispers)

	# get betadisper dataframes ####
	# have written functions to grab the necessary data from the betadisper object

	# functions ####
	# getting distances from betadisper() object
	betadisper_distances <- function(model){
	temp <- data.frame(group = model$group)
	temp2 <- data.frame(distances = unlist(model$distances))
	temp2$sample <- row.names(temp2)
	temp <- cbind(temp, temp2)
	temp <- dplyr::select(temp, group, sample, dplyr::everything())
	row.names(temp) <- NULL
	return(temp)
	}

	# getting eigenvalues out of betadisper() object
	betadisper_eigenvalue <- function(model){
	temp <- data.frame(eig = unlist(model$eig))
	temp$PCoA <- row.names(temp)
	row.names(temp) <- NULL
	return(temp)
	}

	# getting the eigenvectors out of a betadisper() object
	betadisper_eigenvector <- function(model){
	temp <- data.frame(group = model$group)
	temp2 <- data.frame(unlist(model$vectors))
	temp2$sample <- row.names(temp2)
	temp <- cbind(temp, temp2)
	temp <- dplyr::select(temp, group, sample, dplyr::everything())
	row.names(temp) <- NULL
	return(temp)
	}

	# get centroids
	betadisper_centroids <- function(model){
	temp <- data.frame(unlist(model$centroids))
	temp$group <- row.names(temp)
	temp <- dplyr::select(temp, group, dplyr::everything())
	row.names(temp) <- NULL
	return(temp)
	}

	# betadisper data
	get_betadisper_data <- function(model){
	temp <- list(distances = betadisper_distances(model),
	eigenvalue = betadisper_eigenvalue(model),
	eigenvector = betadisper_eigenvector(model),
	centroids = betadisper_centroids(model))
	return(temp)
	}

	# get betadisper data ####
	betadisper_dat <- get_betadisper_data(mod1_dispers)

	# do some transformations on the data
	betadisper_dat$eigenvalue <- mutate(betadisper_dat$eigenvalue, percent = eig/sum(eig))

	# add convex hull points ####
	# this could be put in a function
	betadisper_dat$chull <- group_by(betadisper_dat$eigenvector, group) %>%
	do(data.frame(PCoA1 = .$PCoA1[c(chull(.$PCoA1, .$PCoA2), chull(.$PCoA1, .$PCoA2)[1])],
	PCoA2 = .$PCoA2[c(chull(.$PCoA1, .$PCoA2), chull(.$PCoA1, .$PCoA2)[1])])) %>%
	data.frame()

	# combine centroid and eigenvector dataframes for plotting
	betadisper_lines <- merge(select(betadisper_dat$centroids, group, PCoA1, PCoA2), select(betadisper_dat$eigenvector, group, PCoA1, PCoA2), by = c('group'))

	# Now the dataframes are all ready to be completely customisable in ggplot
	# plot betadispersion plot
	ggplot() +
	geom_point(aes(PCoA1, PCoA2, col = group), betadisper_dat$centroids, size = 4) +
	geom_point(aes(PCoA1, PCoA2, col = group), betadisper_dat$eigenvector) +
	geom_path(aes(PCoA1, PCoA2, col = group, group = group), betadisper_dat$chull ) +
	geom_segment(aes(x = PCoA1.x, y = PCoA2.x, yend = PCoA2.y, xend = PCoA1.y, group = row.names(betadisper_lines), col = group), betadisper_lines) +
	theme_bw(base_size = 12, base_family = 'Helvetica') +
	ylab('PCoA Axis 2 [25.4%]') +
	xlab('PCoA Axis 2 [53.8%]') +
	scale_color_manual('', values = c('black', 'grey'), labels = c("Grazed", 'Ungrazed')) +
	theme(legend.position = c(0.9, 0.3)) +
	coord_fixed(sqrt(d_PCA_sum$percent[2]/d_PCA_sum$percent[1]))

	# plot distances from centroid
	ggplot(betadisper_dat$distances, aes(group, distances, fill = group, col = group)) +
	geom_boxplot(aes(fill = group, col = group), outlier.shape = NA, width = 0.5, position = position_dodge(width = 0.55)) +
	stat_summary(position = position_dodge(width = 0.55), geom = 'crossbar', fatten = 0, color = 'white', width = 0.4, fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x)))}) +
	geom_point(aes(group, distances, col = group), shape = 21, fill ='white', position = position_jitterdodge(dodge.width = 0.55, jitter.width = 0.2)) +
	theme_bw(base_size = 12, base_family = 'Helvetica') +
	scale_color_manual('', values = c('black', 'grey'), labels = c("Grazed", 'Ungrazed')) +
	scale_fill_manual('', values = c('black', 'grey'), labels = c("Grazed", 'Ungrazed')) +
	ylab('Distance to centroid') +
	theme(legend.position = 'none') +
	xlab('') +
	scale_x_discrete(labels = c('Grazed', 'Ungrazed'))