Last active
June 30, 2024 09:54
-
-
Save slowkow/c1e4cdc80e7b0e43bfca to your computer and use it in GitHub Desktop.
Perform Principal Components Analysis (PCA) in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Get the variance explained by principal components | |
#' @param plist A list with class "prcomp" containing: sdev, rotation, x. | |
variance_explained <- function(plist) { | |
rotation <- as.data.frame(plist$rotation) | |
variance <- plist$sdev ^ 2 | |
data.frame( | |
Component = factor(1:ncol(rotation), levels = 1:ncol(rotation)), | |
Variance = variance / sum(variance), | |
CumulativeVariance = ( cumsum(variance) / sum(variance) ) | |
) | |
} | |
#' Plot the variance explained by the first few principal components. | |
#' | |
#' @param plist A list with class "prcomp" containing: sdev, rotation, x, | |
#' center, scale. | |
#' @param n The number of principal components to plot. | |
#' @return A ggplot2 plot. | |
plot_variance_explained <- function(plist, n = 10, cumulative = FALSE) { | |
# Get the variance explained as a dataframe. | |
dat <- head(variance_explained(plist), n) | |
if (cumulative) { | |
# Cumulative line plot. | |
ggplot(data = dat) + | |
geom_line(alpha = 0.8, | |
aes(x = Component, y = CumulativeVariance, group = 1)) + | |
geom_point(size = 3, aes(x = Component, y = CumulativeVariance)) + | |
xlab("PC") + | |
ylab("Cumulative Fraction of Variance Explained") + | |
geom_abline(intercept = 0, slope = 100 * 1 / ncol(rotation), | |
alpha = 0.25) + | |
scale_y_continuous(limits = c(0, max(dat$CumulativeVariance))) | |
} else { | |
# Bar plot. | |
ggplot(data = dat) + | |
geom_bar(alpha = 0.8, aes(x = Component, weight = Variance)) + | |
ylab("Fraction of Variance Explained") + | |
xlab("") | |
} | |
} | |
#' Correlate principal components with factors in another data.frame. | |
#' | |
#' @param pca A "prcomp" object returned by prcomp(). | |
#' @param df A matrix. Each column is tested for correlation with the PCs. | |
correlate_pcs = function(pca, df, npcs = 5, min.cor = 0.5) { | |
pca.r = as.data.frame(pca$rotation)[ , 1:npcs] | |
df = as.data.frame(df) | |
# Make all of the columns numeric, so we can run cor(). | |
for (col in colnames(df)) { | |
if (!class(df[ , col]) == "numeric") { | |
df[ , col] = factorToNumeric(df[ , col]) | |
} | |
} | |
pca.cor = cor(cbind(pca.r, df)) | |
result = list() | |
for (col in colnames(df)) { | |
# Exclude weak correlations. | |
idx = abs(pca.cor[col, ]) > min.cor & abs(pca.cor[col, ]) < 1 | |
res = pca.cor[col, ][idx] | |
res = res[!is.na(res)] | |
if (length(res) > 1) { | |
result[[col]] = res[order(abs(res), decreasing = TRUE)] | |
} | |
} | |
result | |
} | |
#' List the genes with the greatest absolute values of loading factors. | |
#' @param pca A "prcomp" object returned by prcomp(). | |
#' @param n_pcs Integer number of principal components to list. | |
#' @param n_items Integer number of items listed for each componenet. | |
#' @return A list of items with loading values. | |
loading_values = function(pca, n_pcs = 2, n_items = 30) { | |
pc_scores = data.frame(pca$x) | |
result = list() | |
for (i in 1:n_pcs) { | |
code = paste0("PC", i) | |
sx = pc_scores[order(pc_scores[, code]), ] | |
result[[code]] = c( | |
head(sx[, code, drop = TRUE], n_items / 2), | |
tail(sx[, code, drop = TRUE], n_items / 2) | |
) | |
} | |
result | |
} | |
#' Return a numeric vector with the levels of a factor. | |
#' | |
#' The purpose of this function is to convert a non-numeric factor to a numeric | |
#' factor. This is useful when you want to compute correlation with non-numeric | |
#' vectors. | |
#' | |
#' @param xs A factor | |
#' @return A numeric vector. | |
factorToNumeric = function(xs) { | |
uniqs = unique(xs) | |
values = 1:length(uniqs) | |
as.numeric(values[match(xs, uniqs)]) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load convenience functions for PCA. | |
source("pca.R") | |
# Dependencies. | |
library(ggplot2) | |
# Let's start with two dataframes: | |
# 1. A dataframe called "dat" with rows = genes and columns = samples. | |
# 2. A dataframe called "meta" with covariates. It has one row for each sample. | |
# Typically, one of my first steps is to look at the distribution of standard deviation. | |
row_sd <- apply(dat, 1, sd) | |
plot(density(row_sd)) | |
# It is useful look at PCA for genes with high variability. | |
idx_sd <- row_sd > 0.5 | |
sum(idx_sd) | |
# I find that centering is necessary for expression data, but scaling is not always desired. | |
pca1 <- prcomp(dat[idx_sd, ], scale = FALSE, center = TRUE) | |
# Always check to see if the principal components explain a large amount of variation. | |
# Remember that PCA with fewer genes will always explain more variation than PCA with more genes. | |
plot_variance_explained(pca1) + theme_bw(base_size = 18) | |
# We'll use this to annotate the scatter plots with variance explained. | |
pca3_var_labs <- sprintf( | |
"PC%s, %.02g%% of variance", | |
variance_explained(pca1)$Component, | |
100 * variance_explained(pca1)$Variance | |
) | |
# We'll use this to annotate the plot with covariates. | |
pca1_r <- cbind(pca1$rotation, meta) | |
# For exploratory analysis, we should correlate covariates with principal components. | |
# This will help us decide which components to plot, and how to color points. | |
correlate_pcs(pca1, meta) | |
# I find that exploratory plotting is easiest with ggplot2. | |
ggplot(data = pca1_r) + | |
geom_point(size = 5, aes(PC1, PC2, color = Time)) + | |
geom_text(size = 4, hjust = 0, vjust = 0, aes(PC1, PC2, label = Index)) + | |
theme_bw(base_size = 18) + | |
labs(x = pca1_var_labs[1], | |
y = pca1_var_labs[2]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
THANKS