Perform Principal Components Analysis (PCA) in R
#' Get the variance explained by principal components.
#' @param plist A list with class "prcomp" containing: sdev, rotation, x.
variance_explained <- function(plist) {
  rotation <- as.data.frame(plist$rotation)
  variance <- plist$sdev ^ 2
  data.frame(
    Component = factor(1:ncol(rotation), levels = 1:ncol(rotation)),
    Variance = variance / sum(variance),
    CumulativeVariance = cumsum(variance) / sum(variance)
  )
}
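# Example (an illustrative sketch, not part of the original file): `m` below is
# a made-up 20 x 5 matrix; any prcomp() result can be passed in the same way.
#   m <- matrix(rnorm(100), nrow = 20)
#   variance_explained(prcomp(m, center = TRUE))
#   # Returns a data.frame with columns Component, Variance, CumulativeVariance.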
#' Plot the variance explained by the first few principal components.
#'
#' @param plist A list with class "prcomp" containing: sdev, rotation, x,
#'   center, scale.
#' @param n The number of principal components to plot.
#' @param cumulative If TRUE, draw a cumulative line plot instead of a bar plot.
#' @return A ggplot2 plot.
plot_variance_explained <- function(plist, n = 10, cumulative = FALSE) {
  # Get the variance explained as a data.frame.
  dat <- head(variance_explained(plist), n)
  if (cumulative) {
    # Cumulative line plot, with a reference line showing what we would expect
    # if every component explained an equal share of the variance.
    ggplot(data = dat) +
      geom_line(alpha = 0.8,
                aes(x = Component, y = CumulativeVariance, group = 1)) +
      geom_point(size = 3, aes(x = Component, y = CumulativeVariance)) +
      xlab("PC") +
      ylab("Cumulative Fraction of Variance Explained") +
      geom_abline(intercept = 0, slope = 1 / ncol(plist$rotation),
                  alpha = 0.25) +
      scale_y_continuous(limits = c(0, max(dat$CumulativeVariance)))
  } else {
    # Bar plot.
    ggplot(data = dat) +
      geom_col(alpha = 0.8, aes(x = Component, y = Variance)) +
      ylab("Fraction of Variance Explained") +
      xlab("")
  }
}
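# Example (illustrative): with any "prcomp" object `p` and ggplot2 loaded, the
# default call draws a bar plot; cumulative = TRUE draws the cumulative line.
#   plot_variance_explained(p, n = 5)
#   plot_variance_explained(p, n = 5, cumulative = TRUE)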
#' Correlate principal components with factors in another data.frame.
#'
#' @param pca A "prcomp" object returned by prcomp().
#' @param df A data.frame (or matrix) with one row per sample. Each column is
#'   tested for correlation with the PCs.
#' @param npcs The number of principal components to test.
#' @param min.cor Correlations weaker than this threshold are dropped.
#' @return A named list with, for each column of df, its correlations sorted by
#'   absolute value.
correlate_pcs <- function(pca, df, npcs = 5, min.cor = 0.5) {
  pca.r <- as.data.frame(pca$rotation)[, 1:npcs]
  df <- as.data.frame(df)
  # Make all of the columns numeric, so we can run cor().
  for (col in colnames(df)) {
    if (!is.numeric(df[[col]])) {
      df[[col]] <- factorToNumeric(df[[col]])
    }
  }
  pca.cor <- cor(cbind(pca.r, df))
  result <- list()
  for (col in colnames(df)) {
    # Exclude weak correlations and each column's correlation with itself.
    idx <- abs(pca.cor[col, ]) > min.cor & abs(pca.cor[col, ]) < 1
    res <- pca.cor[col, ][idx]
    res <- res[!is.na(res)]
    if (length(res) > 1) {
      result[[col]] <- res[order(abs(res), decreasing = TRUE)]
    }
  }
  result
}
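# Example (illustrative): `covariates` is a made-up data.frame with one row per
# sample, i.e. one row per column of the matrix that was passed to prcomp().
#   covariates <- data.frame(
#     Time  = factor(rep(c("0h", "6h"), length.out = nrow(p$rotation))),
#     Batch = factor(rep(c("A", "B"),  length.out = nrow(p$rotation)))
#   )
#   correlate_pcs(p, covariates, npcs = 2, min.cor = 0.3)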
#' List the genes with the greatest absolute values on each principal component.
#' @param pca A "prcomp" object returned by prcomp().
#' @param n_pcs Integer number of principal components to list.
#' @param n_items Integer number of items listed for each component.
#' @return A named list with, for each component, the most extreme values,
#'   named by gene.
loading_values <- function(pca, n_pcs = 2, n_items = 30) {
  # With genes as rows of the input matrix, pca$x has one row per gene.
  pc_scores <- data.frame(pca$x)
  result <- list()
  for (i in 1:n_pcs) {
    code <- paste0("PC", i)
    # Sort the genes by their value on this component, keeping the gene names.
    sx <- pc_scores[order(pc_scores[[code]]), , drop = FALSE]
    vals <- setNames(sx[[code]], rownames(sx))
    # Keep the most negative and most positive halves.
    result[[code]] <- c(head(vals, n_items / 2), tail(vals, n_items / 2))
  }
  result
}
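# Example (illustrative): the 5 most negative and 5 most positive genes on each
# of the first two components (assumes the input matrix had gene rownames).
#   loading_values(p, n_pcs = 2, n_items = 10)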
#' Return a numeric vector encoding the values of a factor.
#'
#' The purpose of this function is to convert a non-numeric vector to a numeric
#' vector, where each distinct value is mapped to an integer in order of first
#' appearance. This is useful when you want to compute correlation with
#' non-numeric vectors.
#'
#' @param xs A factor or character vector.
#' @return A numeric vector.
factorToNumeric <- function(xs) {
  uniqs <- unique(xs)
  values <- 1:length(uniqs)
  as.numeric(values[match(xs, uniqs)])
}
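# Example (illustrative): each distinct value becomes an integer, in order of
# first appearance.
#   factorToNumeric(factor(c("low", "high", "low", "mid")))
#   # 1 2 1 3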
# Load convenience functions for PCA.
source("pca.R")
# Dependencies.
library(ggplot2)
# Let's start with two dataframes:
# 1. A dataframe called "dat" with rows = genes and columns = samples.
# 2. A dataframe called "meta" with covariates. It has one row for each sample.
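# The objects below are an illustrative stand-in (not the original data), so
# that the commands in this walkthrough can be run end-to-end: 200 made-up
# "genes" measured in 6 "samples", plus a matching covariate table. The Time
# and Index columns are assumptions used by the plotting code further down.
set.seed(1)
dat <- as.data.frame(matrix(
  rnorm(200 * 6), nrow = 200,
  dimnames = list(paste0("gene", 1:200), paste0("sample", 1:6))
))
meta <- data.frame(
  Time  = factor(rep(c("0h", "6h", "24h"), each = 2)),
  Index = paste0("sample", 1:6)
)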
# Typically, one of my first steps is to look at the distribution of per-gene standard deviations.
row_sd <- apply(dat, 1, sd)
plot(density(row_sd))
# It is useful to look at PCA restricted to the genes with high variability.
idx_sd <- row_sd > 0.5
sum(idx_sd)
# I find that centering is necessary for expression data, but scaling is not always desired.
pca1 <- prcomp(dat[idx_sd, ], scale. = FALSE, center = TRUE)
# Always check to see if the principal components explain a large amount of variation.
# Remember that PCA with fewer genes will always explain more variation than PCA with more genes.
plot_variance_explained(pca1) + theme_bw(base_size = 18)
# We'll use this to annotate the scatter plots with variance explained.
pca1_var_labs <- sprintf(
  "PC%s, %.2g%% of variance",
  variance_explained(pca1)$Component,
  100 * variance_explained(pca1)$Variance
)
# We'll use this to annotate the plot with covariates.
pca1_r <- cbind(pca1$rotation, meta)
# For exploratory analysis, we should correlate covariates with principal components.
# This will help us decide which components to plot, and how to color points.
correlate_pcs(pca1, meta)
# I find that exploratory plotting is easiest with ggplot2.
ggplot(data = pca1_r) +
  geom_point(size = 5, aes(PC1, PC2, color = Time)) +
  geom_text(size = 4, hjust = 0, vjust = 0, aes(PC1, PC2, label = Index)) +
  theme_bw(base_size = 18) +
  labs(x = pca1_var_labs[1],
       y = pca1_var_labs[2])
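# loading_values() is defined in pca.R but not used above. A sketch of how to
# call it: list the genes with the most extreme values on PC1 and PC2.
loading_values(pca1, n_pcs = 2, n_items = 10)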