Forked from stephenturner/deseq2-analysis-template.R
Created
December 11, 2018 13:51
-
-
Save nilesh-tawari/7aa2601186d8d2c4e9e8ae8822e052fa to your computer and use it in GitHub Desktop.
Template for analysis with DESeq2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## RNA-seq analysis with DESeq2 | |
## Stephen Turner, @genetics_blog | |
# RNA-seq data from GSE52202 | |
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with | |
# ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl") | |
# Import & pre-process ---------------------------------------------------- | |
# Import data from featureCounts | |
## Previously ran at command line something like this: | |
## featureCounts -a genes.gtf -o counts.txt -T 12 -t exon -g gene_id GSM*.sam | |
countdata <- read.table("counts.txt", header=TRUE, row.names=1) | |
# Remove first five columns (chr, start, end, strand, length) | |
countdata <- countdata[ ,6:ncol(countdata)] | |
# Remove .bam or .sam from filenames | |
colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata)) | |
# Convert to matrix | |
countdata <- as.matrix(countdata) | |
head(countdata) | |
# Assign condition (first four are controls, second four contain the expansion) | |
(condition <- factor(c(rep("ctl", 4), rep("exp", 4)))) | |
# Analysis with DESeq2 ---------------------------------------------------- | |
library(DESeq2) | |
# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix | |
(coldata <- data.frame(row.names=colnames(countdata), condition)) | |
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition) | |
dds | |
# Run the DESeq pipeline | |
dds <- DESeq(dds) | |
# Plot dispersions | |
png("qc-dispersions.png", 1000, 1000, pointsize=20) | |
plotDispEsts(dds, main="Dispersion plot") | |
dev.off() | |
# Regularized log transformation for clustering/heatmaps, etc | |
rld <- rlogTransformation(dds) | |
head(assay(rld)) | |
hist(assay(rld)) | |
# Colors for plots below | |
## Ugly: | |
## (mycols <- 1:length(unique(condition))) | |
## Use RColorBrewer, better | |
library(RColorBrewer) | |
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))]) | |
# Sample distance heatmap | |
sampleDists <- as.matrix(dist(t(assay(rld)))) | |
library(gplots) | |
png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20) | |
heatmap.2(as.matrix(sampleDists), key=F, trace="none", | |
col=colorpanel(100, "black", "white"), | |
ColSideColors=mycols[condition], RowSideColors=mycols[condition], | |
margin=c(10, 10), main="Sample Distance Matrix") | |
dev.off() | |
# Principal components analysis | |
## Could do with built-in DESeq2 function: | |
## DESeq2::plotPCA(rld, intgroup="condition") | |
## I like mine better: | |
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) { | |
require(genefilter) | |
require(calibrate) | |
require(RColorBrewer) | |
rv = rowVars(assay(rld)) | |
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] | |
pca = prcomp(t(assay(rld)[select, ])) | |
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : ")) | |
if (is.null(colors)) { | |
if (nlevels(fac) >= 3) { | |
colors = brewer.pal(nlevels(fac), "Paired") | |
} else { | |
colors = c("black", "red") | |
} | |
} | |
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1) | |
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1) | |
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)") | |
pc2lab <- paste0("PC1 (",as.character(pc2var),"%)") | |
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...) | |
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx)) | |
legend(legendpos, legend=levels(fac), col=colors, pch=20) | |
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld), | |
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours), | |
# terldt = list(levels(fac)), rep = FALSE))) | |
} | |
png("qc-pca.png", 1000, 1000, pointsize=20) | |
rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35)) | |
dev.off() | |
# Get differential expression results | |
res <- results(dds) | |
table(res$padj<0.05) | |
## Order by adjusted p-value | |
res <- res[order(res$padj), ] | |
## Merge with normalized count data | |
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) | |
names(resdata)[1] <- "Gene" | |
head(resdata) | |
## Write results | |
write.csv(resdata, file="diffexpr-results.csv") | |
## Examine plot of p-values | |
hist(res$pvalue, breaks=50, col="grey") | |
## Examine independent filtering | |
attr(res, "filterThreshold") | |
plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections") | |
## MA plot | |
## Could do with built-in DESeq2 function: | |
## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1) | |
## I like mine better: | |
maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) { | |
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...)) | |
with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5)) | |
if (labelsig) { | |
require(calibrate) | |
with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2)) | |
} | |
} | |
png("diffexpr-maplot.png", 1500, 1000, pointsize=20) | |
maplot(resdata, main="MA Plot") | |
dev.off() | |
## Volcano plot with "significant" genes labeled | |
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) { | |
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...)) | |
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...)) | |
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...)) | |
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...)) | |
if (labelsig) { | |
require(calibrate) | |
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...)) | |
} | |
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green")) | |
} | |
png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20) | |
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2)) | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment