Perform Principal Components Analysis (PCA) in R
#' Get the variance explained by principal components.
#' @param plist A list with class "prcomp" containing: sdev, rotation, x.
variance_explained <- function(plist) {
  rotation <- as.data.frame(plist$rotation)
  variance <- plist$sdev ^ 2
  data.frame(
    Component = factor(1:ncol(rotation), levels = 1:ncol(rotation)),
    Variance = variance / sum(variance),
    CumulativeVariance = cumsum(variance) / sum(variance)
  )
}
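# Example (an illustrative sketch, not part of the original file): `m` below is
# a made-up 20 x 5 matrix; any prcomp() result can be passed in the same way.
#   m <- matrix(rnorm(100), nrow = 20)
#   variance_explained(prcomp(m, center = TRUE))
#   # Returns a data.frame with columns Component, Variance, CumulativeVariance.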
#' Plot the variance explained by the first few principal components.
#'
#' @param plist A list with class "prcomp" containing: sdev, rotation, x,
#'   center, scale.
#' @param n The number of principal components to plot.
#' @param cumulative If TRUE, draw a cumulative line plot instead of a bar plot.
#' @return A ggplot2 plot.
plot_variance_explained <- function(plist, n = 10, cumulative = FALSE) {
  # Get the variance explained as a data.frame.
  dat <- head(variance_explained(plist), n)
  if (cumulative) {
    # Cumulative line plot, with a reference line showing what we would expect
    # if every component explained an equal share of the variance.
    ggplot(data = dat) +
      geom_line(alpha = 0.8,
                aes(x = Component, y = CumulativeVariance, group = 1)) +
      geom_point(size = 3, aes(x = Component, y = CumulativeVariance)) +
      xlab("PC") +
      ylab("Cumulative Fraction of Variance Explained") +
      geom_abline(intercept = 0, slope = 1 / ncol(plist$rotation),
                  alpha = 0.25) +
      scale_y_continuous(limits = c(0, max(dat$CumulativeVariance)))
  } else {
    # Bar plot.
    ggplot(data = dat) +
      geom_col(alpha = 0.8, aes(x = Component, y = Variance)) +
      ylab("Fraction of Variance Explained") +
      xlab("")
  }
}
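# Example (illustrative): with any "prcomp" object `p` and ggplot2 loaded, the
# default call draws a bar plot; cumulative = TRUE draws the cumulative line.
#   plot_variance_explained(p, n = 5)
#   plot_variance_explained(p, n = 5, cumulative = TRUE)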
#' Correlate principal components with factors in another data.frame.
#'
#' @param pca A "prcomp" object returned by prcomp().
#' @param df A data.frame (or matrix) with one row per sample. Each column is
#'   tested for correlation with the PCs.
#' @param npcs The number of principal components to test.
#' @param min.cor Correlations weaker than this threshold are dropped.
#' @return A named list with, for each column of df, its correlations sorted by
#'   absolute value.
correlate_pcs <- function(pca, df, npcs = 5, min.cor = 0.5) {
  pca.r <- as.data.frame(pca$rotation)[, 1:npcs]
  df <- as.data.frame(df)
  # Make all of the columns numeric, so we can run cor().
  for (col in colnames(df)) {
    if (!is.numeric(df[[col]])) {
      df[[col]] <- factorToNumeric(df[[col]])
    }
  }
  pca.cor <- cor(cbind(pca.r, df))
  result <- list()
  for (col in colnames(df)) {
    # Exclude weak correlations and each column's correlation with itself.
    idx <- abs(pca.cor[col, ]) > min.cor & abs(pca.cor[col, ]) < 1
    res <- pca.cor[col, ][idx]
    res <- res[!is.na(res)]
    if (length(res) > 1) {
      result[[col]] <- res[order(abs(res), decreasing = TRUE)]
    }
  }
  result
}
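# Example (illustrative): `covariates` is a made-up data.frame with one row per
# sample, i.e. one row per column of the matrix that was passed to prcomp().
#   covariates <- data.frame(
#     Time  = factor(rep(c("0h", "6h"), length.out = nrow(p$rotation))),
#     Batch = factor(rep(c("A", "B"),  length.out = nrow(p$rotation)))
#   )
#   correlate_pcs(p, covariates, npcs = 2, min.cor = 0.3)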
#' List the genes with the greatest absolute values on each principal component.
#' @param pca A "prcomp" object returned by prcomp().
#' @param n_pcs Integer number of principal components to list.
#' @param n_items Integer number of items listed for each component.
#' @return A named list with, for each component, the most extreme values,
#'   named by gene.
loading_values <- function(pca, n_pcs = 2, n_items = 30) {
  # With genes as rows of the input matrix, pca$x has one row per gene.
  pc_scores <- data.frame(pca$x)
  result <- list()
  for (i in 1:n_pcs) {
    code <- paste0("PC", i)
    # Sort the genes by their value on this component, keeping the gene names.
    sx <- pc_scores[order(pc_scores[[code]]), , drop = FALSE]
    vals <- setNames(sx[[code]], rownames(sx))
    # Keep the most negative and most positive halves.
    result[[code]] <- c(head(vals, n_items / 2), tail(vals, n_items / 2))
  }
  result
}
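# Example (illustrative): the 5 most negative and 5 most positive genes on each
# of the first two components (assumes the input matrix had gene rownames).
#   loading_values(p, n_pcs = 2, n_items = 10)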
#' Return a numeric vector encoding the values of a factor.
#'
#' The purpose of this function is to convert a non-numeric vector to a numeric
#' vector, where each distinct value is mapped to an integer in order of first
#' appearance. This is useful when you want to compute correlation with
#' non-numeric vectors.
#'
#' @param xs A factor or character vector.
#' @return A numeric vector.
factorToNumeric <- function(xs) {
  uniqs <- unique(xs)
  values <- 1:length(uniqs)
  as.numeric(values[match(xs, uniqs)])
}
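# Example (illustrative): each distinct value becomes an integer, in order of
# first appearance.
#   factorToNumeric(factor(c("low", "high", "low", "mid")))
#   # 1 2 1 3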
# Load convenience functions for PCA.
source("pca.R")
# Dependencies.
library(ggplot2)
# Let's start with two dataframes:
# 1. A dataframe called "dat" with rows = genes and columns = samples.
# 2. A dataframe called "meta" with covariates. It has one row for each sample.
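# The objects below are an illustrative stand-in (not the original data), so
# that the commands in this walkthrough can be run end-to-end: 200 made-up
# "genes" measured in 6 "samples", plus a matching covariate table. The Time
# and Index columns are assumptions used by the plotting code further down.
set.seed(1)
dat <- as.data.frame(matrix(
  rnorm(200 * 6), nrow = 200,
  dimnames = list(paste0("gene", 1:200), paste0("sample", 1:6))
))
meta <- data.frame(
  Time  = factor(rep(c("0h", "6h", "24h"), each = 2)),
  Index = paste0("sample", 1:6)
)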
# Typically, one of my first steps is to look at the distribution of per-gene standard deviations.
row_sd <- apply(dat, 1, sd)
plot(density(row_sd))
# It is useful to look at PCA restricted to the genes with high variability.
idx_sd <- row_sd > 0.5
sum(idx_sd)
# I find that centering is necessary for expression data, but scaling is not always desired.
pca1 <- prcomp(dat[idx_sd, ], scale. = FALSE, center = TRUE)
# Always check to see if the principal components explain a large amount of variation.
# Remember that PCA with fewer genes will always explain more variation than PCA with more genes.
plot_variance_explained(pca1) + theme_bw(base_size = 18)
# We'll use this to annotate the scatter plots with variance explained.
pca1_var_labs <- sprintf(
  "PC%s, %.2g%% of variance",
  variance_explained(pca1)$Component,
  100 * variance_explained(pca1)$Variance
)
# We'll use this to annotate the plot with covariates.
pca1_r <- cbind(pca1$rotation, meta)
# For exploratory analysis, we should correlate covariates with principal components.
# This will help us decide which components to plot, and how to color points.
correlate_pcs(pca1, meta)
# I find that exploratory plotting is easiest with ggplot2.
ggplot(data = pca1_r) +
  geom_point(size = 5, aes(PC1, PC2, color = Time)) +
  geom_text(size = 4, hjust = 0, vjust = 0, aes(PC1, PC2, label = Index)) +
  theme_bw(base_size = 18) +
  labs(x = pca1_var_labs[1],
       y = pca1_var_labs[2])
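# loading_values() is defined in pca.R but not used above. A sketch of how to
# call it: list the genes with the most extreme values on PC1 and PC2.
loading_values(pca1, n_pcs = 2, n_items = 10)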