Skip to content

Instantly share code, notes, and snippets.

@hermidalc
Created May 16, 2021 22:01
Show Gist options
  • Save hermidalc/583bc7de81a72bb3c4f3b13b97641b11 to your computer and use it in GitHub Desktop.
Save hermidalc/583bc7de81a72bb3c4f3b13b97641b11 to your computer and use it in GitHub Desktop.
RNA-seq differential gene expression GSEA preranked analysis
suppressPackageStartupMessages({
library(Biobase)
library(data.table)
library(edgeR)
library(fgsea)
library(msigdbr)
library(ggplot2)
})
set.seed(777)
fc <- 1.1
lfc <- log2(fc)
# Benjamini-Hochberg FDR multiple testing correction (standard)
padj_meth <- "BH"
gsea_padj <- 0.01
# data types to have ready or read in from csv/tsv files
# counts (feature x sample data matrix of integer counts)
# pdata (sample x phenotypic data dataframe)
# fdata (feature x feature annotation dataframe)
# Riaz et al. pretreatment anti-PD1 response dataset example
eset <- readRDS("srp094781_pre_ensg_htseq_counts.rds")
counts <- exprs(eset)
pdata <- pData(eset)
fdata <- fData(eset)
pdata$Group <- pdata$Class
# if you have batches and you want to model them
# pdata$Batch <- factor(pdata$Batch)
# pdata column with differential compararison of interest
# (e.g. subtype, metastatic)
pdata$Group <- factor(pdata$Group)
# if you have batches you want to include in model
# formula <- ~Batch + Group
formula <- ~Group
design <- model.matrix(formula, data=pdata)
# edgeR
dge <- DGEList(counts=counts, genes=fdata)
dge <- dge[filterByExpr(dge, design), , keep.lib.sizes=FALSE]
# normalization and differential expression
dge <- calcNormFactors(dge, method="TMM")
dge <- estimateDisp(dge, design, robust=TRUE)
fit <- glmQLFit(dge, design, robust=TRUE)
if (lfc > 0) {
glt <- glmTreat(fit, coef=ncol(design), lfc=lfc)
} else {
glt <- glmQLFTest(fit, coef=ncol(design))
}
results <- as.data.frame(
topTags(glt, n=Inf, adjust.method=padj_meth, sort.by="PValue")
)
# create DGE rank vector for GSEA
ranks <- results$logFC * -log10(results$PValue)
names(ranks) <- results$Symbol
# exclude ENSGs that mapped to same symbol, keeping ones with best
# absolute rank
ranks <- ranks[order(abs(ranks), decreasing=TRUE)]
ranks <- ranks[!duplicated(names(ranks))]
# important: the official gene symbols you used in your dataset must be of the
# same Ensembl/GENCODE version used to build the MSigDB version you are using
msigdbr_df <- msigdbr(species="Homo sapiens")
msigdbr_list <- split(x=msigdbr_df$gene_symbol, f=msigdbr_df$gs_name)
# even though not set by default always good to only analyze gene sets with
# sizes between ~15 and ~500 genes
fgsea_res <- fgsea(
pathways=msigdbr_list, stats=ranks, eps=0.0, minSize=15, maxSize=500
)
head(fgsea_res[order(pval), ], n=50)
# enrichment plots
plotEnrichment(
msigdbr_list[["GO_T_CELL_ACTIVATION"]], ranks
) + labs(title="GO T-cell Activation")
# table plot of top pathways
top_pathways_up <- fgsea_res[ES > 0][head(order(pval), n=20), pathway]
top_pathways_dn <- fgsea_res[ES < 0][head(order(pval), n=20), pathway]
top_pathways <- c(top_pathways_up, rev(top_pathways_dn))
plotGseaTable(msigdbr_list[top_pathways], ranks, fgsea_res, gseaParam=0.5)
# table plot of collapsed main pathways
collapsed_pathways <- collapsePathways(
fgsea_res[order(pval)][padj < gsea_padj], msigdbr_list, ranks
)
main_pathways <- fgsea_res[pathway %in% collapsed_pathways$mainPathways]
main_pathways <- main_pathways[order(-NES), pathway]
plotGseaTable(msigdbr_list[main_pathways], ranks, fgsea_res, gseaParam=0.5)
# write results
fwrite(fgsea_res, file="fgsea_results.tsv", sep="\t", sep2=c("", " ", ""))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment