hermidalc · September 3, 2024 11:49
diff --git a/analyze_rna_seq_diff_expr.R b/analyze_rna_seq_diff_expr.R
 suppressPackageStartupMessages({
    library(Biobase)
    library(DESeq2)
    library(edgeR)
    library(EnhancedVolcano)
    library(limma)
 })

 fc <- 1.0
 lfc <- log2(fc)
 padj <- 0.1
 # Benjamini-Hochberg FDR multiple testing correction (standard)
 padj_meth <- "BH"
 fig_dim <- 8
 fig_res <- 300
 xlim <- c(-4, 4)
 ylim <- c(0, 4)

 # data types to have ready or read in from csv/tsv files
 # counts (feature x sample data matrix of integer counts)
 # pdata (sample x phenotypic data dataframe)
 # fdata (feature x feature annotation dataframe)

 # Hugo et al. pretreatment anti-PD1 response dataset example
 eset <- readRDS("srp070710_pre_ensg_htseq_counts.rds")
 counts <- exprs(eset)
 pdata <- pData(eset)
 fdata <- fData(eset)
 pdata$Group <- pdata$Class

 # if you have batches and you want to model them
 # pdata$Batch <- factor(pdata$Batch)

 # pdata column with differential compararison of interest
 # (e.g. subtype, metastatic)
 pdata$Group <- factor(pdata$Group)

 # if you have batches you want to include in model
 # formula <- ~Batch + Group

 formula <- ~Group
 design <- model.matrix(formula, data=pdata)


 #################################### DESeq2 ####################################

 # setup and filtering
 dds <- DESeqDataSetFromMatrix(counts, pdata, formula)
 mcols(dds) <- DataFrame(mcols(dds), fdata)
 dds <- dds[filterByExpr(counts, design), ]

 # normalization and differential expression
 dds <- DESeq(dds, quiet=TRUE)
 results <- as.data.frame(lfcShrink(
    dds, coef=length(resultsNames(dds)), type="apeglm", lfcThreshold=lfc,
    quiet=TRUE
 ))
 # alternate older DESeq2 DE test (often useful)
 # alpha <- ifelse(padj == 1, 0.99999, padj)
 # results <- as.data.frame(results(
 #     dds, name=tail(resultsNames(dds), n=1), alpha=alpha, lfcThreshold=lfc,
 #     altHypothesis="greaterAbs", pAdjustMethod=padj_meth
 # ))

 # with DESeq2 you have to add fdata annotation columns to results manually
 # for example to add fdata column "Symbol"
 # always add columns before sorting below
 if ("Symbol" %in% colnames(fdata)) {
    results$Symbol <- mcols(dds)$Symbol
    results <- results[, c(
        "Symbol", colnames(results)[colnames(results) != "Symbol"]
    )]
 }

 # lfcShrink apeglm s-values equivalent to adjusted p-values (or q-values)
 if (lfc > 0 && ("svalue" %in% colnames(results))) {
    results <- results[order(results$svalue), ]
 } else {
    results <- results[order(results$pvalue), ]
 }

 write.table(
    data.frame("ID"=row.names(results), results), file="deseq2_results.tsv",
    sep="\t", quote=FALSE, row.names=FALSE
 )

 # transform normalized data for downstream analysis
 vst <- varianceStabilizingTransformation(dds, blind=FALSE)
 vsd <- assay(vst)

 # volcano plot
 if ("Symbol" %in% colnames(results)) {
    labels <- results$Symbol
 } else {
    labels <- row.names(results)
 }
 png(
    file="deseq2_volcano.png", width=fig_dim, height=fig_dim, units="in",
    res=fig_res
 )
 EnhancedVolcano(
    results,
    lab=labels,
    x="log2FoldChange",
    y=ifelse(lfc > 0 && ("svalue" %in% colnames(results)), "svalue", "padj"),
    ylab=bquote(~-Log[10]~adjusted~italic(P)),
    # xlim=xlim,
    # ylim=ylim,
    pCutoff=padj,
    FCcutoff=lfc,
    pointSize=3.0,
    labSize=3.0,
    drawConnectors=FALSE,
    colConnectors="grey50",
    title="Differential expression",
    subtitle=paste(
        "DESeq2: RLE normalization, Wald test",
        ifelse(lfc > 0, ", lfcThreshold", "")
    ),
    caption=paste("FC cutoff = ", fc, "; FDR cutoff = ", padj, sep="")
 )
 invisible(dev.off())


 ##################################### edgeR ####################################

 # setup and filtering
 dge <- DGEList(counts=counts, genes=fdata)
 dge <- dge[filterByExpr(dge, design), , keep.lib.sizes=FALSE]

 # normalization and differential expression
 dge <- calcNormFactors(dge, method="TMM")
 dge <- estimateDisp(dge, design, robust=TRUE)
 fit <- glmQLFit(dge, design, robust=TRUE)
 if (lfc > 0) {
    glt <- glmTreat(fit, coef=ncol(design), lfc=lfc)
 } else {
    glt <- glmQLFTest(fit, coef=ncol(design))
 }
 results <- as.data.frame(
    topTags(glt, n=Inf, adjust.method=padj_meth, sort.by="PValue")
 )

 write.table(
    data.frame("ID"=row.names(results), results), file="edger_results.tsv",
    sep="\t", quote=FALSE, row.names=FALSE
 )

 # transform normalized data for downstream analysis
 cpms <- cpm(dge, log=TRUE)

 # volcano plot
 if ("Symbol" %in% colnames(results)) {
    labels <- results$Symbol
 } else {
    labels <- row.names(results)
 }
 png(
    file="edger_volcano.png", width=fig_dim, height=fig_dim, units="in",
    res=fig_res
 )
 EnhancedVolcano(
    results,
    lab=labels,
    x="logFC",
    y="FDR",
    ylab=bquote(~-Log[10]~adjusted~italic(P)),
    # xlim=xlim,
    # ylim=ylim,
    pCutoff=padj,
    FCcutoff=lfc,
    pointSize=3.0,
    labSize=3.0,
    drawConnectors=FALSE,
    colConnectors="grey50",
    title="Differential expression",
    subtitle=paste(
        "edgeR: TMM normalization, QLF test", ifelse(lfc > 0, ", TREAT", "")
    ),
    caption=paste("FC cutoff = ", fc, "; FDR cutoff = ", padj, sep="")
 )
 invisible(dev.off())


 ################################## limma-voom ##################################

 # setup and filtering
 dge <- DGEList(counts=counts, genes=fdata)
 dge <- dge[filterByExpr(dge, design), , keep.lib.sizes=FALSE]

 # normalization and differential expression
 dge <- calcNormFactors(dge, method="TMM")
 v <- voom(dge, design)
 fit <- lmFit(v, design)
 if (lfc > 0) {
    fit <- treat(fit, lfc=lfc, robust=TRUE)
    results <- topTreat(
        fit, coef=ncol(design), number=Inf, adjust.method=padj_meth, sort.by="P"
    )
 } else {
    fit <- eBayes(fit, robust=TRUE)
    results <- topTable(
        fit, coef=ncol(design), number=Inf, adjust.method=padj_meth, sort.by="P"
    )
 }

 write.table(
    data.frame("ID"=row.names(results), results), file="voom_results.tsv",
    sep="\t", quote=FALSE, row.names=FALSE
 )

 # transform normalized data for downstream analysis
 cpms <- cpm(dge, log=TRUE)

 # volcano plot
 if ("Symbol" %in% colnames(results)) {
    labels <- results$Symbol
 } else {
    labels <- row.names(results)
 }
 png(
    file="voom_volcano.png", width=fig_dim, height=fig_dim, units="in",
    res=fig_res
 )
 EnhancedVolcano(
    results,
    lab=labels,
    x="logFC",
    y="adj.P.Val",
    ylab=bquote(~-Log[10]~adjusted~italic(P)),
    # xlim=xlim,
    # ylim=ylim,
    pCutoff=padj,
    FCcutoff=lfc,
    pointSize=3.0,
    labSize=3.0,
    drawConnectors=FALSE,
    colConnectors="grey50",
    title="Differential expression",
    subtitle=paste(
        "limma-voom: TMM normalization", ifelse(lfc > 0, ", TREAT", "")
    ),
    caption=paste("FC cutoff = ", fc, "; FDR cutoff = ", padj, sep="")
 )
 invisible(dev.off())
	suppressPackageStartupMessages({
	library(Biobase)
	library(DESeq2)
	library(edgeR)
	library(EnhancedVolcano)
	library(limma)
	})

	fc <- 1.0
	lfc <- log2(fc)
	padj <- 0.1
	# Benjamini-Hochberg FDR multiple testing correction (standard)
	padj_meth <- "BH"
	fig_dim <- 8
	fig_res <- 300
	xlim <- c(-4, 4)
	ylim <- c(0, 4)

	# data types to have ready or read in from csv/tsv files
	# counts (feature x sample data matrix of integer counts)
	# pdata (sample x phenotypic data dataframe)
	# fdata (feature x feature annotation dataframe)

	# Hugo et al. pretreatment anti-PD1 response dataset example
	eset <- readRDS("srp070710_pre_ensg_htseq_counts.rds")
	counts <- exprs(eset)
	pdata <- pData(eset)
	fdata <- fData(eset)
	pdata$Group <- pdata$Class

	# if you have batches and you want to model them
	# pdata$Batch <- factor(pdata$Batch)

	# pdata column with differential compararison of interest
	# (e.g. subtype, metastatic)
	pdata$Group <- factor(pdata$Group)

	# if you have batches you want to include in model
	# formula <- ~Batch + Group

	formula <- ~Group
	design <- model.matrix(formula, data=pdata)


	#################################### DESeq2 ####################################

	# setup and filtering
	dds <- DESeqDataSetFromMatrix(counts, pdata, formula)
	mcols(dds) <- DataFrame(mcols(dds), fdata)
	dds <- dds[filterByExpr(counts, design), ]

	# normalization and differential expression
	dds <- DESeq(dds, quiet=TRUE)
	results <- as.data.frame(lfcShrink(
	dds, coef=length(resultsNames(dds)), type="apeglm", lfcThreshold=lfc,
	quiet=TRUE
	))
	# alternate older DESeq2 DE test (often useful)
	# alpha <- ifelse(padj == 1, 0.99999, padj)
	# results <- as.data.frame(results(
	# dds, name=tail(resultsNames(dds), n=1), alpha=alpha, lfcThreshold=lfc,
	# altHypothesis="greaterAbs", pAdjustMethod=padj_meth
	# ))

	# with DESeq2 you have to add fdata annotation columns to results manually
	# for example to add fdata column "Symbol"
	# always add columns before sorting below
	if ("Symbol" %in% colnames(fdata)) {
	results$Symbol <- mcols(dds)$Symbol
	results <- results[, c(
	"Symbol", colnames(results)[colnames(results) != "Symbol"]
	)]
	}

	# lfcShrink apeglm s-values equivalent to adjusted p-values (or q-values)
	if (lfc > 0 && ("svalue" %in% colnames(results))) {
	results <- results[order(results$svalue), ]
	} else {
	results <- results[order(results$pvalue), ]
	}

	write.table(
	data.frame("ID"=row.names(results), results), file="deseq2_results.tsv",
	sep="\t", quote=FALSE, row.names=FALSE
	)

	# transform normalized data for downstream analysis
	vst <- varianceStabilizingTransformation(dds, blind=FALSE)
	vsd <- assay(vst)

	# volcano plot
	if ("Symbol" %in% colnames(results)) {
	labels <- results$Symbol
	} else {
	labels <- row.names(results)
	}
	png(
	file="deseq2_volcano.png", width=fig_dim, height=fig_dim, units="in",
	res=fig_res
	)
	EnhancedVolcano(
	results,
	lab=labels,
	x="log2FoldChange",
	y=ifelse(lfc > 0 && ("svalue" %in% colnames(results)), "svalue", "padj"),
	ylab=bquote(~-Log[10]~adjusted~italic(P)),
	# xlim=xlim,
	# ylim=ylim,
	pCutoff=padj,
	FCcutoff=lfc,
	pointSize=3.0,
	labSize=3.0,
	drawConnectors=FALSE,
	colConnectors="grey50",
	title="Differential expression",
	subtitle=paste(
	"DESeq2: RLE normalization, Wald test",
	ifelse(lfc > 0, ", lfcThreshold", "")
	),
	caption=paste("FC cutoff = ", fc, "; FDR cutoff = ", padj, sep="")
	)
	invisible(dev.off())


	##################################### edgeR ####################################

	# setup and filtering
	dge <- DGEList(counts=counts, genes=fdata)
	dge <- dge[filterByExpr(dge, design), , keep.lib.sizes=FALSE]

	# normalization and differential expression
	dge <- calcNormFactors(dge, method="TMM")
	dge <- estimateDisp(dge, design, robust=TRUE)
	fit <- glmQLFit(dge, design, robust=TRUE)
	if (lfc > 0) {
	glt <- glmTreat(fit, coef=ncol(design), lfc=lfc)
	} else {
	glt <- glmQLFTest(fit, coef=ncol(design))
	}
	results <- as.data.frame(
	topTags(glt, n=Inf, adjust.method=padj_meth, sort.by="PValue")
	)

	write.table(
	data.frame("ID"=row.names(results), results), file="edger_results.tsv",
	sep="\t", quote=FALSE, row.names=FALSE
	)

	# transform normalized data for downstream analysis
	cpms <- cpm(dge, log=TRUE)

	# volcano plot
	if ("Symbol" %in% colnames(results)) {
	labels <- results$Symbol
	} else {
	labels <- row.names(results)
	}
	png(
	file="edger_volcano.png", width=fig_dim, height=fig_dim, units="in",
	res=fig_res
	)
	EnhancedVolcano(
	results,
	lab=labels,
	x="logFC",
	y="FDR",
	ylab=bquote(~-Log[10]~adjusted~italic(P)),
	# xlim=xlim,
	# ylim=ylim,
	pCutoff=padj,
	FCcutoff=lfc,
	pointSize=3.0,
	labSize=3.0,
	drawConnectors=FALSE,
	colConnectors="grey50",
	title="Differential expression",
	subtitle=paste(
	"edgeR: TMM normalization, QLF test", ifelse(lfc > 0, ", TREAT", "")
	),
	caption=paste("FC cutoff = ", fc, "; FDR cutoff = ", padj, sep="")
	)
	invisible(dev.off())


	################################## limma-voom ##################################

	# setup and filtering
	dge <- DGEList(counts=counts, genes=fdata)
	dge <- dge[filterByExpr(dge, design), , keep.lib.sizes=FALSE]

	# normalization and differential expression
	dge <- calcNormFactors(dge, method="TMM")
	v <- voom(dge, design)
	fit <- lmFit(v, design)
	if (lfc > 0) {
	fit <- treat(fit, lfc=lfc, robust=TRUE)
	results <- topTreat(
	fit, coef=ncol(design), number=Inf, adjust.method=padj_meth, sort.by="P"
	)
	} else {
	fit <- eBayes(fit, robust=TRUE)
	results <- topTable(
	fit, coef=ncol(design), number=Inf, adjust.method=padj_meth, sort.by="P"
	)
	}

	write.table(
	data.frame("ID"=row.names(results), results), file="voom_results.tsv",
	sep="\t", quote=FALSE, row.names=FALSE
	)

	# transform normalized data for downstream analysis
	cpms <- cpm(dge, log=TRUE)

	# volcano plot
	if ("Symbol" %in% colnames(results)) {
	labels <- results$Symbol
	} else {
	labels <- row.names(results)
	}
	png(
	file="voom_volcano.png", width=fig_dim, height=fig_dim, units="in",
	res=fig_res
	)
	EnhancedVolcano(
	results,
	lab=labels,
	x="logFC",
	y="adj.P.Val",
	ylab=bquote(~-Log[10]~adjusted~italic(P)),
	# xlim=xlim,
	# ylim=ylim,
	pCutoff=padj,
	FCcutoff=lfc,
	pointSize=3.0,
	labSize=3.0,
	drawConnectors=FALSE,
	colConnectors="grey50",
	title="Differential expression",
	subtitle=paste(
	"limma-voom: TMM normalization", ifelse(lfc > 0, ", TREAT", "")
	),
	caption=paste("FC cutoff = ", fc, "; FDR cutoff = ", padj, sep="")
	)
	invisible(dev.off())