-
-
Save stephenturner/f60c1934405c127f09a6 to your computer and use it in GitHub Desktop.
## RNA-seq analysis with DESeq2 | |
## Stephen Turner, @genetics_blog | |
# RNA-seq data from GSE52202 | |
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with | |
# ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl") | |
# Import & pre-process ---------------------------------------------------- | |
# Import data from featureCounts | |
## Previously ran at command line something like this: | |
## featureCounts -a genes.gtf -o counts.txt -T 12 -t exon -g gene_id GSM*.sam | |
countdata <- read.table("counts.txt", header=TRUE, row.names=1) | |
# Remove first five columns (chr, start, end, strand, length) | |
countdata <- countdata[ ,6:ncol(countdata)] | |
# Remove .bam or .sam from filenames | |
colnames(countdata) <- gsub("\\.[sb]am$", "", colnames(countdata)) | |
# Convert to matrix | |
countdata <- as.matrix(countdata) | |
head(countdata) | |
# Assign condition (first four are controls, second four contain the expansion) | |
(condition <- factor(c(rep("ctl", 4), rep("exp", 4)))) | |
# Analysis with DESeq2 ---------------------------------------------------- | |
library(DESeq2) | |
# Create a coldata frame and instantiate the DESeqDataSet. See ?DESeqDataSetFromMatrix | |
(coldata <- data.frame(row.names=colnames(countdata), condition)) | |
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition) | |
dds | |
# Run the DESeq pipeline | |
dds <- DESeq(dds) | |
# Plot dispersions | |
png("qc-dispersions.png", 1000, 1000, pointsize=20) | |
plotDispEsts(dds, main="Dispersion plot") | |
dev.off() | |
# Regularized log transformation for clustering/heatmaps, etc | |
rld <- rlogTransformation(dds) | |
head(assay(rld)) | |
hist(assay(rld)) | |
# Colors for plots below | |
## Ugly: | |
## (mycols <- 1:length(unique(condition))) | |
## Use RColorBrewer, better | |
library(RColorBrewer) | |
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(condition))]) | |
# Sample distance heatmap | |
sampleDists <- as.matrix(dist(t(assay(rld)))) | |
library(gplots) | |
png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20) | |
heatmap.2(as.matrix(sampleDists), key=F, trace="none", | |
col=colorpanel(100, "black", "white"), | |
ColSideColors=mycols[condition], RowSideColors=mycols[condition], | |
margin=c(10, 10), main="Sample Distance Matrix") | |
dev.off() | |
# Principal components analysis | |
## Could do with built-in DESeq2 function: | |
## DESeq2::plotPCA(rld, intgroup="condition") | |
## I like mine better: | |
rld_pca <- function (rld, intgroup = "condition", ntop = 500, colors=NULL, legendpos="bottomleft", main="PCA Biplot", textcx=1, ...) { | |
require(genefilter) | |
require(calibrate) | |
require(RColorBrewer) | |
rv = rowVars(assay(rld)) | |
select = order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] | |
pca = prcomp(t(assay(rld)[select, ])) | |
fac = factor(apply(as.data.frame(colData(rld)[, intgroup, drop = FALSE]), 1, paste, collapse = " : ")) | |
if (is.null(colors)) { | |
if (nlevels(fac) >= 3) { | |
colors = brewer.pal(nlevels(fac), "Paired") | |
} else { | |
colors = c("black", "red") | |
} | |
} | |
pc1var <- round(summary(pca)$importance[2,1]*100, digits=1) | |
pc2var <- round(summary(pca)$importance[2,2]*100, digits=1) | |
pc1lab <- paste0("PC1 (",as.character(pc1var),"%)") | |
pc2lab <- paste0("PC1 (",as.character(pc2var),"%)") | |
plot(PC2~PC1, data=as.data.frame(pca$x), bg=colors[fac], pch=21, xlab=pc1lab, ylab=pc2lab, main=main, ...) | |
with(as.data.frame(pca$x), textxy(PC1, PC2, labs=rownames(as.data.frame(pca$x)), cex=textcx)) | |
legend(legendpos, legend=levels(fac), col=colors, pch=20) | |
# rldyplot(PC2 ~ PC1, groups = fac, data = as.data.frame(pca$rld), | |
# pch = 16, cerld = 2, aspect = "iso", col = colours, main = draw.key(key = list(rect = list(col = colours), | |
# terldt = list(levels(fac)), rep = FALSE))) | |
} | |
png("qc-pca.png", 1000, 1000, pointsize=20) | |
rld_pca(rld, colors=mycols, intgroup="condition", xlim=c(-75, 35)) | |
dev.off() | |
# Get differential expression results | |
res <- results(dds) | |
table(res$padj<0.05) | |
## Order by adjusted p-value | |
res <- res[order(res$padj), ] | |
## Merge with normalized count data | |
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE) | |
names(resdata)[1] <- "Gene" | |
head(resdata) | |
## Write results | |
write.csv(resdata, file="diffexpr-results.csv") | |
## Examine plot of p-values | |
hist(res$pvalue, breaks=50, col="grey") | |
## Examine independent filtering | |
attr(res, "filterThreshold") | |
plot(attr(res,"filterNumRej"), type="b", xlab="quantiles of baseMean", ylab="number of rejections") | |
## MA plot | |
## Could do with built-in DESeq2 function: | |
## DESeq2::plotMA(dds, ylim=c(-1,1), cex=1) | |
## I like mine better: | |
maplot <- function (res, thresh=0.05, labelsig=TRUE, textcx=1, ...) { | |
with(res, plot(baseMean, log2FoldChange, pch=20, cex=.5, log="x", ...)) | |
with(subset(res, padj<thresh), points(baseMean, log2FoldChange, col="red", pch=20, cex=1.5)) | |
if (labelsig) { | |
require(calibrate) | |
with(subset(res, padj<thresh), textxy(baseMean, log2FoldChange, labs=Gene, cex=textcx, col=2)) | |
} | |
} | |
png("diffexpr-maplot.png", 1500, 1000, pointsize=20) | |
maplot(resdata, main="MA Plot") | |
dev.off() | |
## Volcano plot with "significant" genes labeled | |
volcanoplot <- function (res, lfcthresh=2, sigthresh=0.05, main="Volcano Plot", legendpos="bottomright", labelsig=TRUE, textcx=1, ...) { | |
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main=main, ...)) | |
with(subset(res, padj<sigthresh ), points(log2FoldChange, -log10(pvalue), pch=20, col="red", ...)) | |
with(subset(res, abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="orange", ...)) | |
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), points(log2FoldChange, -log10(pvalue), pch=20, col="green", ...)) | |
if (labelsig) { | |
require(calibrate) | |
with(subset(res, padj<sigthresh & abs(log2FoldChange)>lfcthresh), textxy(log2FoldChange, -log10(pvalue), labs=Gene, cex=textcx, ...)) | |
} | |
legend(legendpos, xjust=1, yjust=1, legend=c(paste("FDR<",sigthresh,sep=""), paste("|LogFC|>",lfcthresh,sep=""), "both"), pch=20, col=c("red","orange","green")) | |
} | |
png("diffexpr-volcanoplot.png", 1200, 1000, pointsize=20) | |
volcanoplot(resdata, lfcthresh=1, sigthresh=0.05, textcx=.8, xlim=c(-2.3, 2)) | |
dev.off() |
Thank you!
Hello, Thanks for the code! really helpful for beginners.
When I'm trying to run the pipeline:
dds <- DESeq(dds)
I get this error message:
estimating size factors
Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, :
every gene contains at least one zero, cannot compute log geometric means
Anyone could give me some advice? i cant have zeros?
Hi, I'm struggling with the DESeqDataSetFromMatrix command and can't seem to find similar problems online. I have metagenomic count data and would like to follow the above template for differential abundance analysis.
I use this code:
countdata <- read.table("OTU_family.txt", fill=TRUE, header=TRUE, row.names=1)
countdata <- as.matrix(countdata)
(condition <- factor(c(rep("Natural", 5),rep("Healthy", 7), rep ("Diseased", 7), rep ("Stressed", 7))))
library(DESeq2)
(coldata <- data.frame(row.names=colnames(countdata), condition))
library(SummarizedExperiment)
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)
and get this error:
Error in slot(value, what) :
no slot of name "exptData" for this object of class "SummarizedExperiment"
Does anyone have advice on what I'm doing wrong?
Thanks in advance.
When I get the DE results, I get values output for each cell. What are these values?
Thanks a lot for your code 👍
Here's the code that I used:-
library(DESeq2)
countData = read.table(file = 'C:/Users/Dell/Desktop/M.BIOINFORMATICS/FYP/datafr.txt', header = TRUE, sep = '\t', row.names = 1)
colData = read.table(file = 'C:/Users/Dell/Desktop/M.BIOINFORMATICS/FYP/coldata.txt', header = TRUE, sep = '\t', row.names = 1)
colData[['condition']] = factor(colData[['condition']], levels = c('Normal', 'Tumour'))
dataset <- DESeqDataSetFromMatrix(countData=countData, colData=colData, design=~condition)
And here's the error that I'm getting:-
dataset <- DESeqDataSetFromMatrix(countData = countData,colData = colData,design = ~condition)
Error in DESeqDataSet(se, design = design, ignoreRank) :
some values in assay are negative
The countData is basically a breast cancer NGS dataset which comprises of 11 normal samples and 62 tumour samples. There are 1391 genes (rows) in total. There are negative values in the original dataset. I'm new to R and trying to cope up. Can anyone please tell me why there shouldn't be any negative values? Am i missing any steps? Please help me out.
Thanks a lot for the script. It really helped to get me started with the analysis.
@ruby23 There shouldn't be any negative values because the DESeq2 package requires raw counts. That means, you should have only positive integer values or zeros in your data. Since you probably didn't acquire the NGS data yourself, make sure that you use the raw counts and not some already normalised or log-transformed values.
@aforntacc As above should be no negative counts, min NGS "count" is zero :-). The template script assumes your counts are in a particular column, but other columns will have things like log2 changes. Check your input file and then modify this line accordingly.
countdata <- countdata[ ,6:ncol(countdata)]
Great consolidated script. Works seamlessly for me up until maplot and volcanoplot however. I receive the following error using the unedited script above:
Error in plot.window(...) : need finite 'xlim' values
Calls: plot -> plot -> plot.default -> localWindow -> plot.window
In addition: Warning messages:
1: In min(x) : no non-missing arguments to min; returning Inf
....
I've both checked my input for completely null/NA entries, and have attempted to edit script, by adding explicit xlim call (i.e., xlim=c(-2.5,2),) at line 126 (maplot) and line 139 (volcanoplot), however to no avail. What else could I be missing/am I not addressing?
Thanks for this script.... A great help for me ..
A week of hitting a wall and then I came across this. Thank you!!!!
I recommend this template to everyone who is starting with DE analysis. It is really good. Thanks a ton for it.
Thank you, this is a very helpful introduction to DESeq2!
Thank you, this was extremely helpful.
Thank you, this was extremely helpful.
Hi thanks for sharing this code. I'm starting to use DESeq2 in command line in R. Basically I can understand how to fuse featureCounts output into one matrix (I will use counts file generated in Galaxy), but this misses the coldata info and I was trying to search how to create it and put it into the deseqdataset object.
Basically, you create it between line 25 and 34 if I understand correctly. I want to assign 2 different batches as conditions and in each I have 3 conditions (not similar between 2 batches) first batch has duplicates and 2nd has triplicates.
so should I use:
(condition <- factor(c(rep("cond1", 2), rep("cond2", 2), rep("cond3", 2), rep("cond4", 3), rep("cond5", 3), rep("cond6", 3))))
(batch <- factor(c(rep("batch1", 6), rep("batch2", 9))))
(coldata <- data.frame(row.names=colnames(countdata), condition, batch))
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition, batch)
is this correct? also I'm not sure why we need brackets from the beginning of these lines?
thank you for any help,
best
please how do i solve this sort of error
Error in read.table(file = file, header = header, sep = sep, quote = quote, :
duplicate 'row.names' are not allowed
Could anyone tell me how to download the template RNA-seq data from GSE52202? I tried the GEOquery package in R and unzipped the GSE52202_iPSCMN_RPKM_RS10162012.txt.gz. After that, I only received the GSE52202_iPSCMN_RPKM_RS10162012.txt file, which seemed not like the counts.txt that described in the R script. Thanks!
Super helpful. I am ran dds <- DESeqDataSetFromMatrix(countData= cc_all, colData=coldata, design= ~condition) just now and it returned : an error message saying NA values are not allowed in the count matrix. Should I use DESeqTransform or simply removed the NA values. Changing them to "0" is not recommended?
yeah, this code helped a lot with my current project.
Super helpful. I am ran dds <- DESeqDataSetFromMatrix(countData= cc_all, colData=coldata, design= ~condition) just now and it returned : an error message saying NA values are not allowed in the count matrix. Should I use DESeqTransform or simply removed the NA values. Changing them to "0" is not recommended?
- For my project I am using ~mode_of_action. Also, I had to go back and fix my counts file with Multmerge and get rid of all my 0's before running DESeq since there is division (which will return NA's when there are 0's)
Hi,
Thanks for sharing this. I was wondering how I can regress out batch effect using DESeq analysis. Didn't find any argument which is specified for this.
Thanks
Great code!
-> to answer to "aforntacc" on May 28, 2016 : you probably do not give count data, but something else, since the function 'DESeqDataSetFromMatrix' crashes. The error message is "some values in assay are negative", so I think it is quite clear that you have negative values, something not possible in count data.