Skip to content

Instantly share code, notes, and snippets.

@wdecoster
Last active December 9, 2016 17:26
Show Gist options
  • Select an option

  • Save wdecoster/781d1df0a45307bb44c1642ae30bf85c to your computer and use it in GitHub Desktop.

Select an option

Save wdecoster/781d1df0a45307bb44c1642ae30bf85c to your computer and use it in GitHub Desktop.
#!/usr/bin/Rscript
#Script to automate differential expression analysis using one of the following R algorithms: DESeq2, edgeR, DEXSeq or limma
#Based on manuals, pieces of code found on the internet and helpful comments of colleagues
###Required input is:###
#1) Either a matrix of counts (features*samples) with features (genes) on lines and samples on columns OR a directory of bam files to use featureCounts on
#2) A sampleInfo File matching the samples containing at least the fields 'condition' and 'gender'. Reference level is 'CON'
######
#Author: wdecoster
#Email: wouter.decoster AT molgen DOT vib-ua DOT be
#Twitter: @wouter_decoster
sanityCheck <- function(arguments) {
if (length(arguments) != 2) { stop("\nInput Error: first argument is countfile, second argument is sample info file.\n") }
countsData = arguments[1]
sampleInfoFile = arguments[2]
cat("Validating the input files. ")
if (file.exists(sampleInfoFile)) {
sampleInfo <- read.table(sampleInfoFile, header=T)
if (!"condition" %in% names(sampleInfo)) {stop('\n\n ERROR: Could not find required field <condition> in sample info file.')} #A required field in sample file is "condition"
if (!"gender" %in% names(sampleInfo)) {stop('\n\n ERROR: Could not find required field <gender> in sample info file.')} #A required field in sample file is "gender"
if ("batch" %in% names(sampleInfo)) { batchbool <- TRUE } else { batchbool <- FALSE } #Check if batch is a field in the sampleInfoFile
conditions = unique(sampleInfo$condition)
cat('Found ', length(conditions), ' conditions: ', paste(conditions, collapse='&'), '.\n', sep='')
} else {
stop("\n\nERROR: File provided as sample info file doesn't exist or path is incorrect.") }
if (dir.exists(countsData)) { #first have to check for dir.exists since file.exists isn't specific. If argument is a directory, use featurecounts on the bams
cat("Using featurecounts with GRCh37 annotation. ")
bams <- dir(path = countsData, pattern ='.bam$', full.names=TRUE)
cat("Found ", length(bams), " bamfiles to perform counting on.\n")
if (nrow(sampleInfo) == length(bams)) {
cat('Found', nrow(sampleInfo), 'samples. ')
} else {
stop('\n\nERROR: Mismatch between number of samples in bam directory [', length(bams), '] and samples in sample info file [', nrow(sampleInfo),']\n') }
counts <- counting(bams, as.character(sampleInfo$sample))
} else if (file.exists(countsData)) { #If countData isn't a directory of bams, it should be a file
cat("Using a countsfile. ") #Assumed is a n*m table with feature names as rownames and samples as column names to generate a matrix
counts <- read.table(countsData, header=TRUE, stringsAsFactors=FALSE, row.names=1)
if (ncol(counts) == nrow(sampleInfo)) {
cat('Found', nrow(sampleInfo), 'samples. ')
} else {
stop('\n\nERROR: Mismatch between number of samples in countfile [', ncol(counts), '] and samples in sample info file [', nrow(sampleInfo),']\n') }
} else {
stop("\n\nERROR: File or directory provided as countsfile doesn't exist or path is incorrect.") }
return(list(counts, sampleInfo, batchbool))
}
counting <- function(bams, samples) { #In the case a directory of bam files is provided, use featureCounts to count
suppressMessages(library(Rsubread))
annotation <- "/home/wdecoster/databases/gffEnsembl/Homo_sapiens.GRCh37.82.chr.gtf"
if (file.exists(annotation)) {
suppressMessages(counts <- featureCounts(bams, annot.ext=annotation, isGTFAnnotationFile=TRUE, strandSpecific=1, nthreads=12, GTF.featureType="gene")) #notice counting on gene level and using only 4 threads, can be adapted depending on needs
} else {
stop("FATAL: I could not find my database file (hardcoded in function 'counting'). That's a problem.\n")}
RawCount <- cbind(feature=rownames(counts$counts), counts$counts)
colnames(RawCount) <- c("feature", samples)
write.table(RawCount, "FeatureCounts_RawCounts.txt", quote=F, sep="\t", row.names=F)
relativeStats <- cbind(Status = paste(counts$stat$Status[2:11], "relative", sep="_"), counts$stat[2:11, 2:ncol(counts$stat)] / as.numeric(counts$stat[1,2:ncol(counts$stat)]))
completeStats <- rbind(counts$stat, relativeStats)
names(completeStats) <- c("Status", samples)
write.table(completeStats, "FeatureCounts_CountingStatistics.txt", quote=F, sep="\t", row.names=F)
return(counts$counts) }
ens2symbol <- function(dearesult) { #convert ensembl identifiers to gene symbols using 'annotables'
interm <- cbind(gene=row.names(dearesult), as.data.frame(dearesult), stringsAsFactors=FALSE) #To convert the row names (features) to a column in the dataframe.
interm$symbol=sapply(sapply(strsplit(interm$gene,","),function(x) grch38[grch38$ensgene %in% x,"symbol"]),paste,collapse=",") #Comma separated overlapping featured are matched to gene symbols and repasted together
return(interm)
}
proc_limma_voom <- function(data) {
counts = data[[1]]
sampleInfo = data[[2]]
batchbool = data[[3]]
suppressMessages(library("limma"))
suppressMessages(library("edgeR"))
dge <- calcNormFactors(DGEList(counts=counts))
conditions <- factor(sampleInfo$condition)
gender <- factor(sampleInfo$gender)
if (batchbool) {
batch <- factor(sampleInfo$batch)
design <- model.matrix(~batch+gender+conditions)
} else {
design <- model.matrix(~gender+conditions)
}
#For outliers, use sample quality weights
vwts <- voomWithQualityWeights(dge, design=design, normalization="none")
normcounts <- data.frame(feature=rownames(vwts$E), vwts$E)
fit <- eBayes(lmFit(vwts))
degTable <- topTable(fit,number=Inf, coef=ncol(design))
if (substr(rownames(degTable)[1], 1, 4) == 'ENSG'){ #In the case of ensembl identifiers, convert those
output <- ens2symbol(degTable)[,c('gene', 'logFC', 'P.Value', 'adj.P.Val', 'symbol')] #Select columns of interest
} else {
outputtemp <- cbind(gene=row.names(degTable), as.data.frame(degTable)) #Select columns of interest
outputtemp$symbol <- "NA"
output <- outputtemp[,c('gene', 'logFC', 'P.Value', 'adj.P.Val', 'symbol')]
}
colnames(output) <- c("gene", "logFC", "pvalue", "padj", "symbol") #Rename columns to harmonize with other tools
makevolcanoplot(mutate(output, sig=ifelse(output$padj<0.1, "padj<0.1", "Not Sig")), "Limma-voom") #Call volcanoplot function
DEG <- subset(output, padj < 0.1)
cat(paste("Found", nrow(subset(DEG, logFC > 0)), "upregulated genes and", nrow(subset(DEG, logFC < 0)), "downregulated genes using limma.\n", collapse=" ")) #Print a simple summary statement
write.table(as.data.frame(output), file="Limma-voom_differential_expression.txt", sep="\t", row.names=FALSE, quote=FALSE) #Save result
write.table(as.data.frame(normcounts), file="Limma-voom_normalizedcounts.txt", sep="\t", row.names=FALSE, quote=FALSE) #Save normalized counts
write.table(as.data.frame(DEG$symbol), file="Limma-voom_DEG.txt", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) #Simple significant gene list with the purpose of doing enrichment analysis or plotting a Venn diagram
}
proc_deseq2 <- function(data) {
counts = data[[1]]
sampleInfo = data[[2]]
batchbool = data[[3]]
suppressMessages(library("DESeq2"))
suppressMessages(library("BiocParallel"))
register(MulticoreParam(4))
if (batchbool) {
deseqdata <- DESeqDataSetFromMatrix(countData=counts, colData=sampleInfo, design=~batch+gender+condition)
} else {
deseqdata <- DESeqDataSetFromMatrix(countData=counts, colData=sampleInfo, design=~gender+condition)
}
dds <- deseqdata[rowSums(counts(deseqdata)) > 1,] #To prefilter the data and remove lowest counts
dds$condition <- relevel(dds$condition, ref="CON") #Need to find a way to deduce this from the sampleInfo, but have to chose the right one
dds <- DESeq(dds, quiet=TRUE, parallel=TRUE)
res <- results(dds, parallel=TRUE)
cat('\n\nSummary data from DESeq2:')
summary(res)
resOrdered <- res[order(res$padj),]
#MAplot
jpeg('DESeq2_MAplot.jpeg', width=8, height=8, units="in", res=500)
DESeq2::plotMA(dds, main="DESeq2", ylim=c(-2,2))
dev.off()
#Dispersionplot
jpeg('DESeq2_Dispersionplot.jpeg', width=8, height=8, units="in", res=500)
plotDispEsts(dds)
dev.off()
if (substr(rownames(resOrdered)[1], 1, 4) == 'ENSG'){ #In the case of ensembl identifiers, convert those
output <- ens2symbol(resOrdered)[, c("gene", "log2FoldChange", "pvalue", "padj", "symbol")]
} else {
outputtemp <- cbind(gene=row.names(resOrdered), as.data.frame(resOrdered))
outputtemp$symbol = "NA"
output <- outputtemp[, c("gene", "log2FoldChange", "pvalue", "padj", "symbol")]
}
colnames(output) <- c("gene", "logFC", "pvalue", "padj", "symbol")
DEG <- subset(output, padj < 0.1)
rld <- rlogTransformation(dds, blind=FALSE)
rlddf <- data.frame(names(dds), assay(rld))
colnames(rlddf) <- c("feature", colnames(counts))
write.table(as.data.frame(output), file="DESeq2_differential_expression.txt", sep="\t", row.names=FALSE, quote=FALSE)
write.table(as.data.frame(DEG$symbol), file="DESeq2_DEG.txt", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) #Simple gene list with the purpose of doing enrichment analysis or plotting a Venn diagram
write.table(as.data.frame(rlddf), file="DESeq2_rlognormalizedcounts.txt", sep="\t", row.names=FALSE, quote=FALSE)
makevolcanoplot(mutate(output, sig=ifelse(output$padj<0.1, "padj<0.1", "Not Sig")), "DESeq2")
#SamplesHeatMap_rld
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$condition, sep="-")
colnames(sampleDistMatrix) <- NULL
jpeg('DESeq2_SamplesHeatMap_rld.jpeg', width=8, height=8, units="in", res=500)
pheatmap(sampleDistMatrix, clustering_distance_rows=sampleDists, clustering_distance_cols=sampleDists, col=colorRampPalette( rev(brewer.pal(9, "Blues")) )(255), fontsize_row=6)
dev.off()
#PCAplot_rld + pca scree plot
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
pca <- prcomp(assay(rld)[select, ])
percentVar <- pca$sdev^2 / sum( pca$sdev^2 )
scree_plot=data.frame(percentVar)
scree_plot[1:10,2]<- c(1:10)
colnames(scree_plot)<-c("variance","component_number")
scree <- ggplot(scree_plot[1:10,], mapping=aes(x=component_number, y=variance))+geom_bar(stat="identity")
suppressMessages(ggsave('DESeq2_PCA_scree.jpeg', device="jpeg"))
pcadata <- plotPCA(rld, intgroup=c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcadata, "percentVar"))
pca <- ggplot(pcadata, aes(PC1, PC2, color=condition, shape=condition)) + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance"))
suppressMessages(ggsave('DESeq2_PCAplot_rld.jpeg', pca, device = "jpeg"))
}
proc_edger <- function(data) {
counts = data[[1]]
sampleInfo = data[[2]]
batchbool = data[[3]]
suppressMessages(library("edgeR"))
conditions <- factor(sampleInfo$condition)
gender <- factor(sampleInfo$gender)
if (batchbool) {
batch <- factor(sampleInfo$batch)
design <- model.matrix(~batch+gender+conditions)
} else {
design <- model.matrix(~gender+conditions)
}
d <- DGEList(counts=data.matrix(counts), group=conditions)
d$samples$group <- relevel(d$samples$group, ref="CON")
dlim <- d[rowSums(cpm(d)>1) >= 2,]
disp <- estimateDisp(calcNormFactors(dlim), design) #Common dispersion and tagwise dispersions in one run
rownames(design) <- colnames(disp)
fit <- glmFit(disp,design) #Likelihood ratio tests
deg <- glmLRT(fit, coef=ncol(fit$design))
degTable <- topTags(deg, n=nrow(deg$counts))$table
#MAplot
jpeg('edgeR_MAplot.jpeg', width=8, height=8, units="in", res=500)
plotSmear(deg, de.tags=rownames(disp)[as.logical(decideTestsDGE(deg))])
abline(h=c(-1, 1), col="blue")
dev.off()
#Dispersionplot
jpeg('edgeR_dispersionPlot.jpeg', width=8, height=8, units="in", res=500)
plotBCV(disp)
dev.off()
if (substr(rownames(degTable)[1], 1, 4) == 'ENSG'){
output <- ens2symbol(degTable)[,c('gene', 'logFC', 'PValue', 'FDR', 'symbol')]
} else {
outputtemp <- cbind(gene=row.names(degTable), as.data.frame(degTable))
outputtemp$symbol <- "NA"
output <- outputtemp[,c('gene', 'logFC', 'PValue', 'FDR', 'symbol')] }
colnames(output) <- c("gene", "logFC", "pvalue", "FDR", "symbol")
DEG <- subset(output, FDR < 0.1)
write.table(output, file="edgeR_differential_expression.txt", sep="\t", row.names=FALSE, quote=FALSE)
write.table(as.data.frame(DEG$symbol), file="edgeR_DEG.txt", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) #Simple gene list with the purpose of doing enrichment analysis or plotting a Venn diagram
cat(paste("Found", nrow(subset(DEG, logFC > 0)), "upregulated genes and", nrow(subset(DEG, logFC < 0)), "downregulated genes with edgeR.\n", collapse=" "))
makevolcanoplot(mutate(output, sig=ifelse(output$FDR<0.1, "FDR<0.1", "Not Sig")), "edgeR")
}
makevolcanoplot <- function(input, proc) { #Create volcanoplot of results, with labels and colors
volc = ggplot(input, aes(logFC, -log10(pvalue))) + geom_point(aes(col=sig)) + scale_color_manual(values=c("black", "red")) + ggtitle(proc)
if (substr(input$gene[1], 1, 4) == 'ENSG'){
volc+geom_text_repel(data=head(input, 20), aes(label=symbol)) #If ENSG is present, the results were converted to symbol notation earlier and can use these
} else {
volc+geom_text_repel(data=head(input, 20), aes(label=gene)) }
suppressMessages(ggsave(paste(proc, "Volcanoplot.jpeg", sep="_"), device="jpeg"))
}
indata = sanityCheck(unlist(strsplit(commandArgs(trailingOnly = TRUE)," "))) #Function to validate the structure of the supplied datafiles and returned as list.
suppressMessages(library("annotables"))
suppressMessages(library("pheatmap"))
suppressMessages(library("RColorBrewer"))
suppressMessages(library("ggplot2"))
suppressMessages(library("dplyr"))
suppressMessages(library("genefilter"))
suppressMessages(library("rgl"))
suppressMessages(library("ggrepel"))
suppressMessages(library("matrixStats"))
for (func in c(proc_edger, proc_limma_voom, proc_deseq2)) {func(indata)}
cat("\nFinished!\n\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment