hermidalc · May 16, 2021 22:01
diff --git a/analyze_rna_seq_gsea_preranked.R b/analyze_rna_seq_gsea_preranked.R
 suppressPackageStartupMessages({
    library(Biobase)
    library(data.table)
    library(edgeR)
    library(fgsea)
    library(msigdbr)
    library(ggplot2)
 })

 set.seed(777)

 fc <- 1.1
 lfc <- log2(fc)
 # Benjamini-Hochberg FDR multiple testing correction (standard)
 padj_meth <- "BH"
 gsea_padj <- 0.01

 # data types to have ready or read in from csv/tsv files
 # counts (feature x sample data matrix of integer counts)
 # pdata (sample x phenotypic data dataframe)
 # fdata (feature x feature annotation dataframe)

 # Riaz et al. pretreatment anti-PD1 response dataset example
 eset <- readRDS("srp094781_pre_ensg_htseq_counts.rds")
 counts <- exprs(eset)
 pdata <- pData(eset)
 fdata <- fData(eset)
 pdata$Group <- pdata$Class

 # if you have batches and you want to model them
 # pdata$Batch <- factor(pdata$Batch)

 # pdata column with differential compararison of interest
 # (e.g. subtype, metastatic)
 pdata$Group <- factor(pdata$Group)

 # if you have batches you want to include in model
 # formula <- ~Batch + Group

 formula <- ~Group
 design <- model.matrix(formula, data=pdata)

 # edgeR
 dge <- DGEList(counts=counts, genes=fdata)
 dge <- dge[filterByExpr(dge, design), , keep.lib.sizes=FALSE]

 # normalization and differential expression
 dge <- calcNormFactors(dge, method="TMM")
 dge <- estimateDisp(dge, design, robust=TRUE)
 fit <- glmQLFit(dge, design, robust=TRUE)
 if (lfc > 0) {
    glt <- glmTreat(fit, coef=ncol(design), lfc=lfc)
 } else {
    glt <- glmQLFTest(fit, coef=ncol(design))
 }
 results <- as.data.frame(
    topTags(glt, n=Inf, adjust.method=padj_meth, sort.by="PValue")
 )

 # create DGE rank vector for GSEA
 ranks <- results$logFC * -log10(results$PValue)
 names(ranks) <- results$Symbol

 # exclude ENSGs that mapped to same symbol, keeping ones with best
 # absolute rank
 ranks <- ranks[order(abs(ranks), decreasing=TRUE)]
 ranks <- ranks[!duplicated(names(ranks))]

 # important: the official gene symbols you used in your dataset must be of the
 # same Ensembl/GENCODE version used to build the MSigDB version you are using
 msigdbr_df <- msigdbr(species="Homo sapiens")
 msigdbr_list <- split(x=msigdbr_df$gene_symbol, f=msigdbr_df$gs_name)

 # even though not set by default always good to only analyze gene sets with
 # sizes between ~15 and ~500 genes
 fgsea_res <- fgsea(
    pathways=msigdbr_list, stats=ranks, eps=0.0, minSize=15, maxSize=500
 )

 head(fgsea_res[order(pval), ], n=50)

 # enrichment plots
 plotEnrichment(
    msigdbr_list[["GO_T_CELL_ACTIVATION"]], ranks
 ) + labs(title="GO T-cell Activation")

 # table plot of top pathways
 top_pathways_up <- fgsea_res[ES > 0][head(order(pval), n=20), pathway]
 top_pathways_dn <- fgsea_res[ES < 0][head(order(pval), n=20), pathway]
 top_pathways <- c(top_pathways_up, rev(top_pathways_dn))
 plotGseaTable(msigdbr_list[top_pathways], ranks, fgsea_res, gseaParam=0.5)

 # table plot of collapsed main pathways
 collapsed_pathways <- collapsePathways(
    fgsea_res[order(pval)][padj < gsea_padj], msigdbr_list, ranks
 )
 main_pathways <- fgsea_res[pathway %in% collapsed_pathways$mainPathways]
 main_pathways <- main_pathways[order(-NES), pathway]
 plotGseaTable(msigdbr_list[main_pathways], ranks, fgsea_res, gseaParam=0.5)

 # write results
 fwrite(fgsea_res, file="fgsea_results.tsv", sep="\t", sep2=c("", " ", ""))
	suppressPackageStartupMessages({
	library(Biobase)
	library(data.table)
	library(edgeR)
	library(fgsea)
	library(msigdbr)
	library(ggplot2)
	})

	set.seed(777)

	fc <- 1.1
	lfc <- log2(fc)
	# Benjamini-Hochberg FDR multiple testing correction (standard)
	padj_meth <- "BH"
	gsea_padj <- 0.01

	# data types to have ready or read in from csv/tsv files
	# counts (feature x sample data matrix of integer counts)
	# pdata (sample x phenotypic data dataframe)
	# fdata (feature x feature annotation dataframe)

	# Riaz et al. pretreatment anti-PD1 response dataset example
	eset <- readRDS("srp094781_pre_ensg_htseq_counts.rds")
	counts <- exprs(eset)
	pdata <- pData(eset)
	fdata <- fData(eset)
	pdata$Group <- pdata$Class

	# if you have batches and you want to model them
	# pdata$Batch <- factor(pdata$Batch)

	# pdata column with differential compararison of interest
	# (e.g. subtype, metastatic)
	pdata$Group <- factor(pdata$Group)

	# if you have batches you want to include in model
	# formula <- ~Batch + Group

	formula <- ~Group
	design <- model.matrix(formula, data=pdata)

	# edgeR
	dge <- DGEList(counts=counts, genes=fdata)
	dge <- dge[filterByExpr(dge, design), , keep.lib.sizes=FALSE]

	# normalization and differential expression
	dge <- calcNormFactors(dge, method="TMM")
	dge <- estimateDisp(dge, design, robust=TRUE)
	fit <- glmQLFit(dge, design, robust=TRUE)
	if (lfc > 0) {
	glt <- glmTreat(fit, coef=ncol(design), lfc=lfc)
	} else {
	glt <- glmQLFTest(fit, coef=ncol(design))
	}
	results <- as.data.frame(
	topTags(glt, n=Inf, adjust.method=padj_meth, sort.by="PValue")
	)

	# create DGE rank vector for GSEA
	ranks <- results$logFC * -log10(results$PValue)
	names(ranks) <- results$Symbol

	# exclude ENSGs that mapped to same symbol, keeping ones with best
	# absolute rank
	ranks <- ranks[order(abs(ranks), decreasing=TRUE)]
	ranks <- ranks[!duplicated(names(ranks))]

	# important: the official gene symbols you used in your dataset must be of the
	# same Ensembl/GENCODE version used to build the MSigDB version you are using
	msigdbr_df <- msigdbr(species="Homo sapiens")
	msigdbr_list <- split(x=msigdbr_df$gene_symbol, f=msigdbr_df$gs_name)

	# even though not set by default always good to only analyze gene sets with
	# sizes between ~15 and ~500 genes
	fgsea_res <- fgsea(
	pathways=msigdbr_list, stats=ranks, eps=0.0, minSize=15, maxSize=500
	)

	head(fgsea_res[order(pval), ], n=50)

	# enrichment plots
	plotEnrichment(
	msigdbr_list[["GO_T_CELL_ACTIVATION"]], ranks
	) + labs(title="GO T-cell Activation")

	# table plot of top pathways
	top_pathways_up <- fgsea_res[ES > 0][head(order(pval), n=20), pathway]
	top_pathways_dn <- fgsea_res[ES < 0][head(order(pval), n=20), pathway]
	top_pathways <- c(top_pathways_up, rev(top_pathways_dn))
	plotGseaTable(msigdbr_list[top_pathways], ranks, fgsea_res, gseaParam=0.5)

	# table plot of collapsed main pathways
	collapsed_pathways <- collapsePathways(
	fgsea_res[order(pval)][padj < gsea_padj], msigdbr_list, ranks
	)
	main_pathways <- fgsea_res[pathway %in% collapsed_pathways$mainPathways]
	main_pathways <- main_pathways[order(-NES), pathway]
	plotGseaTable(msigdbr_list[main_pathways], ranks, fgsea_res, gseaParam=0.5)

	# write results
	fwrite(fgsea_res, file="fgsea_results.tsv", sep="\t", sep2=c("", " ", ""))