Created
May 16, 2021 22:01
-
-
Save hermidalc/583bc7de81a72bb3c4f3b13b97641b11 to your computer and use it in GitHub Desktop.
RNA-seq differential gene expression GSEA preranked analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
suppressPackageStartupMessages({ | |
library(Biobase) | |
library(data.table) | |
library(edgeR) | |
library(fgsea) | |
library(msigdbr) | |
library(ggplot2) | |
}) | |
set.seed(777) | |
fc <- 1.1 | |
lfc <- log2(fc) | |
# Benjamini-Hochberg FDR multiple testing correction (standard) | |
padj_meth <- "BH" | |
gsea_padj <- 0.01 | |
# data types to have ready or read in from csv/tsv files | |
# counts (feature x sample data matrix of integer counts) | |
# pdata (sample x phenotypic data dataframe) | |
# fdata (feature x feature annotation dataframe) | |
# Riaz et al. pretreatment anti-PD1 response dataset example | |
eset <- readRDS("srp094781_pre_ensg_htseq_counts.rds") | |
counts <- exprs(eset) | |
pdata <- pData(eset) | |
fdata <- fData(eset) | |
pdata$Group <- pdata$Class | |
# if you have batches and you want to model them | |
# pdata$Batch <- factor(pdata$Batch) | |
# pdata column with differential compararison of interest | |
# (e.g. subtype, metastatic) | |
pdata$Group <- factor(pdata$Group) | |
# if you have batches you want to include in model | |
# formula <- ~Batch + Group | |
formula <- ~Group | |
design <- model.matrix(formula, data=pdata) | |
# edgeR | |
dge <- DGEList(counts=counts, genes=fdata) | |
dge <- dge[filterByExpr(dge, design), , keep.lib.sizes=FALSE] | |
# normalization and differential expression | |
dge <- calcNormFactors(dge, method="TMM") | |
dge <- estimateDisp(dge, design, robust=TRUE) | |
fit <- glmQLFit(dge, design, robust=TRUE) | |
if (lfc > 0) { | |
glt <- glmTreat(fit, coef=ncol(design), lfc=lfc) | |
} else { | |
glt <- glmQLFTest(fit, coef=ncol(design)) | |
} | |
results <- as.data.frame( | |
topTags(glt, n=Inf, adjust.method=padj_meth, sort.by="PValue") | |
) | |
# create DGE rank vector for GSEA | |
ranks <- results$logFC * -log10(results$PValue) | |
names(ranks) <- results$Symbol | |
# exclude ENSGs that mapped to same symbol, keeping ones with best | |
# absolute rank | |
ranks <- ranks[order(abs(ranks), decreasing=TRUE)] | |
ranks <- ranks[!duplicated(names(ranks))] | |
# important: the official gene symbols you used in your dataset must be of the | |
# same Ensembl/GENCODE version used to build the MSigDB version you are using | |
msigdbr_df <- msigdbr(species="Homo sapiens") | |
msigdbr_list <- split(x=msigdbr_df$gene_symbol, f=msigdbr_df$gs_name) | |
# even though not set by default always good to only analyze gene sets with | |
# sizes between ~15 and ~500 genes | |
fgsea_res <- fgsea( | |
pathways=msigdbr_list, stats=ranks, eps=0.0, minSize=15, maxSize=500 | |
) | |
head(fgsea_res[order(pval), ], n=50) | |
# enrichment plots | |
plotEnrichment( | |
msigdbr_list[["GO_T_CELL_ACTIVATION"]], ranks | |
) + labs(title="GO T-cell Activation") | |
# table plot of top pathways | |
top_pathways_up <- fgsea_res[ES > 0][head(order(pval), n=20), pathway] | |
top_pathways_dn <- fgsea_res[ES < 0][head(order(pval), n=20), pathway] | |
top_pathways <- c(top_pathways_up, rev(top_pathways_dn)) | |
plotGseaTable(msigdbr_list[top_pathways], ranks, fgsea_res, gseaParam=0.5) | |
# table plot of collapsed main pathways | |
collapsed_pathways <- collapsePathways( | |
fgsea_res[order(pval)][padj < gsea_padj], msigdbr_list, ranks | |
) | |
main_pathways <- fgsea_res[pathway %in% collapsed_pathways$mainPathways] | |
main_pathways <- main_pathways[order(-NES), pathway] | |
plotGseaTable(msigdbr_list[main_pathways], ranks, fgsea_res, gseaParam=0.5) | |
# write results | |
fwrite(fgsea_res, file="fgsea_results.tsv", sep="\t", sep2=c("", " ", "")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment