Last active
December 9, 2016 17:26
-
-
Save wdecoster/781d1df0a45307bb44c1642ae30bf85c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/Rscript | |
| #Script to automate differential expression analysis using one of the following R algorithms: DESeq2, edgeR, DEXSeq or limma | |
| #Based on manuals, pieces of code found on the internet and helpful comments of colleagues | |
| ###Required input is:### | |
| #1) Either a matrix of counts (features*samples) with features (genes) on lines and samples on columns OR a directory of bam files to use featureCounts on | |
| #2) A sampleInfo File matching the samples containing at least the fields 'condition' and 'gender'. Reference level is 'CON' | |
| ###### | |
| #Author: wdecoster | |
| #Email: wouter.decoster AT molgen DOT vib-ua DOT be | |
| #Twitter: @wouter_decoster | |
| sanityCheck <- function(arguments) { | |
| if (length(arguments) != 2) { stop("\nInput Error: first argument is countfile, second argument is sample info file.\n") } | |
| countsData = arguments[1] | |
| sampleInfoFile = arguments[2] | |
| cat("Validating the input files. ") | |
| if (file.exists(sampleInfoFile)) { | |
| sampleInfo <- read.table(sampleInfoFile, header=T) | |
| if (!"condition" %in% names(sampleInfo)) {stop('\n\n ERROR: Could not find required field <condition> in sample info file.')} #A required field in sample file is "condition" | |
| if (!"gender" %in% names(sampleInfo)) {stop('\n\n ERROR: Could not find required field <gender> in sample info file.')} #A required field in sample file is "gender" | |
| if ("batch" %in% names(sampleInfo)) { batchbool <- TRUE } else { batchbool <- FALSE } #Check if batch is a field in the sampleInfoFile | |
| conditions = unique(sampleInfo$condition) | |
| cat('Found ', length(conditions), ' conditions: ', paste(conditions, collapse='&'), '.\n', sep='') | |
| } else { | |
| stop("\n\nERROR: File provided as sample info file doesn't exist or path is incorrect.") } | |
| if (dir.exists(countsData)) { #first have to check for dir.exists since file.exists isn't specific. If argument is a directory, use featurecounts on the bams | |
| cat("Using featurecounts with GRCh37 annotation. ") | |
| bams <- dir(path = countsData, pattern ='.bam$', full.names=TRUE) | |
| cat("Found ", length(bams), " bamfiles to perform counting on.\n") | |
| if (nrow(sampleInfo) == length(bams)) { | |
| cat('Found', nrow(sampleInfo), 'samples. ') | |
| } else { | |
| stop('\n\nERROR: Mismatch between number of samples in bam directory [', length(bams), '] and samples in sample info file [', nrow(sampleInfo),']\n') } | |
| counts <- counting(bams, as.character(sampleInfo$sample)) | |
| } else if (file.exists(countsData)) { #If countData isn't a directory of bams, it should be a file | |
| cat("Using a countsfile. ") #Assumed is a n*m table with feature names as rownames and samples as column names to generate a matrix | |
| counts <- read.table(countsData, header=TRUE, stringsAsFactors=FALSE, row.names=1) | |
| if (ncol(counts) == nrow(sampleInfo)) { | |
| cat('Found', nrow(sampleInfo), 'samples. ') | |
| } else { | |
| stop('\n\nERROR: Mismatch between number of samples in countfile [', ncol(counts), '] and samples in sample info file [', nrow(sampleInfo),']\n') } | |
| } else { | |
| stop("\n\nERROR: File or directory provided as countsfile doesn't exist or path is incorrect.") } | |
| return(list(counts, sampleInfo, batchbool)) | |
| } | |
| counting <- function(bams, samples) { #In the case a directory of bam files is provided, use featureCounts to count | |
| suppressMessages(library(Rsubread)) | |
| annotation <- "/home/wdecoster/databases/gffEnsembl/Homo_sapiens.GRCh37.82.chr.gtf" | |
| if (file.exists(annotation)) { | |
| suppressMessages(counts <- featureCounts(bams, annot.ext=annotation, isGTFAnnotationFile=TRUE, strandSpecific=1, nthreads=12, GTF.featureType="gene")) #notice counting on gene level and using only 4 threads, can be adapted depending on needs | |
| } else { | |
| stop("FATAL: I could not find my database file (hardcoded in function 'counting'). That's a problem.\n")} | |
| RawCount <- cbind(feature=rownames(counts$counts), counts$counts) | |
| colnames(RawCount) <- c("feature", samples) | |
| write.table(RawCount, "FeatureCounts_RawCounts.txt", quote=F, sep="\t", row.names=F) | |
| relativeStats <- cbind(Status = paste(counts$stat$Status[2:11], "relative", sep="_"), counts$stat[2:11, 2:ncol(counts$stat)] / as.numeric(counts$stat[1,2:ncol(counts$stat)])) | |
| completeStats <- rbind(counts$stat, relativeStats) | |
| names(completeStats) <- c("Status", samples) | |
| write.table(completeStats, "FeatureCounts_CountingStatistics.txt", quote=F, sep="\t", row.names=F) | |
| return(counts$counts) } | |
| ens2symbol <- function(dearesult) { #convert ensembl identifiers to gene symbols using 'annotables' | |
| interm <- cbind(gene=row.names(dearesult), as.data.frame(dearesult), stringsAsFactors=FALSE) #To convert the row names (features) to a column in the dataframe. | |
| interm$symbol=sapply(sapply(strsplit(interm$gene,","),function(x) grch38[grch38$ensgene %in% x,"symbol"]),paste,collapse=",") #Comma separated overlapping featured are matched to gene symbols and repasted together | |
| return(interm) | |
| } | |
| proc_limma_voom <- function(data) { | |
| counts = data[[1]] | |
| sampleInfo = data[[2]] | |
| batchbool = data[[3]] | |
| suppressMessages(library("limma")) | |
| suppressMessages(library("edgeR")) | |
| dge <- calcNormFactors(DGEList(counts=counts)) | |
| conditions <- factor(sampleInfo$condition) | |
| gender <- factor(sampleInfo$gender) | |
| if (batchbool) { | |
| batch <- factor(sampleInfo$batch) | |
| design <- model.matrix(~batch+gender+conditions) | |
| } else { | |
| design <- model.matrix(~gender+conditions) | |
| } | |
| #For outliers, use sample quality weights | |
| vwts <- voomWithQualityWeights(dge, design=design, normalization="none") | |
| normcounts <- data.frame(feature=rownames(vwts$E), vwts$E) | |
| fit <- eBayes(lmFit(vwts)) | |
| degTable <- topTable(fit,number=Inf, coef=ncol(design)) | |
| if (substr(rownames(degTable)[1], 1, 4) == 'ENSG'){ #In the case of ensembl identifiers, convert those | |
| output <- ens2symbol(degTable)[,c('gene', 'logFC', 'P.Value', 'adj.P.Val', 'symbol')] #Select columns of interest | |
| } else { | |
| outputtemp <- cbind(gene=row.names(degTable), as.data.frame(degTable)) #Select columns of interest | |
| outputtemp$symbol <- "NA" | |
| output <- outputtemp[,c('gene', 'logFC', 'P.Value', 'adj.P.Val', 'symbol')] | |
| } | |
| colnames(output) <- c("gene", "logFC", "pvalue", "padj", "symbol") #Rename columns to harmonize with other tools | |
| makevolcanoplot(mutate(output, sig=ifelse(output$padj<0.1, "padj<0.1", "Not Sig")), "Limma-voom") #Call volcanoplot function | |
| DEG <- subset(output, padj < 0.1) | |
| cat(paste("Found", nrow(subset(DEG, logFC > 0)), "upregulated genes and", nrow(subset(DEG, logFC < 0)), "downregulated genes using limma.\n", collapse=" ")) #Print a simple summary statement | |
| write.table(as.data.frame(output), file="Limma-voom_differential_expression.txt", sep="\t", row.names=FALSE, quote=FALSE) #Save result | |
| write.table(as.data.frame(normcounts), file="Limma-voom_normalizedcounts.txt", sep="\t", row.names=FALSE, quote=FALSE) #Save normalized counts | |
| write.table(as.data.frame(DEG$symbol), file="Limma-voom_DEG.txt", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) #Simple significant gene list with the purpose of doing enrichment analysis or plotting a Venn diagram | |
| } | |
| proc_deseq2 <- function(data) { | |
| counts = data[[1]] | |
| sampleInfo = data[[2]] | |
| batchbool = data[[3]] | |
| suppressMessages(library("DESeq2")) | |
| suppressMessages(library("BiocParallel")) | |
| register(MulticoreParam(4)) | |
| if (batchbool) { | |
| deseqdata <- DESeqDataSetFromMatrix(countData=counts, colData=sampleInfo, design=~batch+gender+condition) | |
| } else { | |
| deseqdata <- DESeqDataSetFromMatrix(countData=counts, colData=sampleInfo, design=~gender+condition) | |
| } | |
| dds <- deseqdata[rowSums(counts(deseqdata)) > 1,] #To prefilter the data and remove lowest counts | |
| dds$condition <- relevel(dds$condition, ref="CON") #Need to find a way to deduce this from the sampleInfo, but have to chose the right one | |
| dds <- DESeq(dds, quiet=TRUE, parallel=TRUE) | |
| res <- results(dds, parallel=TRUE) | |
| cat('\n\nSummary data from DESeq2:') | |
| summary(res) | |
| resOrdered <- res[order(res$padj),] | |
| #MAplot | |
| jpeg('DESeq2_MAplot.jpeg', width=8, height=8, units="in", res=500) | |
| DESeq2::plotMA(dds, main="DESeq2", ylim=c(-2,2)) | |
| dev.off() | |
| #Dispersionplot | |
| jpeg('DESeq2_Dispersionplot.jpeg', width=8, height=8, units="in", res=500) | |
| plotDispEsts(dds) | |
| dev.off() | |
| if (substr(rownames(resOrdered)[1], 1, 4) == 'ENSG'){ #In the case of ensembl identifiers, convert those | |
| output <- ens2symbol(resOrdered)[, c("gene", "log2FoldChange", "pvalue", "padj", "symbol")] | |
| } else { | |
| outputtemp <- cbind(gene=row.names(resOrdered), as.data.frame(resOrdered)) | |
| outputtemp$symbol = "NA" | |
| output <- outputtemp[, c("gene", "log2FoldChange", "pvalue", "padj", "symbol")] | |
| } | |
| colnames(output) <- c("gene", "logFC", "pvalue", "padj", "symbol") | |
| DEG <- subset(output, padj < 0.1) | |
| rld <- rlogTransformation(dds, blind=FALSE) | |
| rlddf <- data.frame(names(dds), assay(rld)) | |
| colnames(rlddf) <- c("feature", colnames(counts)) | |
| write.table(as.data.frame(output), file="DESeq2_differential_expression.txt", sep="\t", row.names=FALSE, quote=FALSE) | |
| write.table(as.data.frame(DEG$symbol), file="DESeq2_DEG.txt", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) #Simple gene list with the purpose of doing enrichment analysis or plotting a Venn diagram | |
| write.table(as.data.frame(rlddf), file="DESeq2_rlognormalizedcounts.txt", sep="\t", row.names=FALSE, quote=FALSE) | |
| makevolcanoplot(mutate(output, sig=ifelse(output$padj<0.1, "padj<0.1", "Not Sig")), "DESeq2") | |
| #SamplesHeatMap_rld | |
| sampleDists <- dist(t(assay(rld))) | |
| sampleDistMatrix <- as.matrix(sampleDists) | |
| rownames(sampleDistMatrix) <- paste(rld$condition, sep="-") | |
| colnames(sampleDistMatrix) <- NULL | |
| jpeg('DESeq2_SamplesHeatMap_rld.jpeg', width=8, height=8, units="in", res=500) | |
| pheatmap(sampleDistMatrix, clustering_distance_rows=sampleDists, clustering_distance_cols=sampleDists, col=colorRampPalette( rev(brewer.pal(9, "Blues")) )(255), fontsize_row=6) | |
| dev.off() | |
| #PCAplot_rld + pca scree plot | |
| rv <- rowVars(assay(rld)) | |
| select <- order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))] | |
| pca <- prcomp(assay(rld)[select, ]) | |
| percentVar <- pca$sdev^2 / sum( pca$sdev^2 ) | |
| scree_plot=data.frame(percentVar) | |
| scree_plot[1:10,2]<- c(1:10) | |
| colnames(scree_plot)<-c("variance","component_number") | |
| scree <- ggplot(scree_plot[1:10,], mapping=aes(x=component_number, y=variance))+geom_bar(stat="identity") | |
| suppressMessages(ggsave('DESeq2_PCA_scree.jpeg', device="jpeg")) | |
| pcadata <- plotPCA(rld, intgroup=c("condition"), returnData=TRUE) | |
| percentVar <- round(100 * attr(pcadata, "percentVar")) | |
| pca <- ggplot(pcadata, aes(PC1, PC2, color=condition, shape=condition)) + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) | |
| suppressMessages(ggsave('DESeq2_PCAplot_rld.jpeg', pca, device = "jpeg")) | |
| } | |
| proc_edger <- function(data) { | |
| counts = data[[1]] | |
| sampleInfo = data[[2]] | |
| batchbool = data[[3]] | |
| suppressMessages(library("edgeR")) | |
| conditions <- factor(sampleInfo$condition) | |
| gender <- factor(sampleInfo$gender) | |
| if (batchbool) { | |
| batch <- factor(sampleInfo$batch) | |
| design <- model.matrix(~batch+gender+conditions) | |
| } else { | |
| design <- model.matrix(~gender+conditions) | |
| } | |
| d <- DGEList(counts=data.matrix(counts), group=conditions) | |
| d$samples$group <- relevel(d$samples$group, ref="CON") | |
| dlim <- d[rowSums(cpm(d)>1) >= 2,] | |
| disp <- estimateDisp(calcNormFactors(dlim), design) #Common dispersion and tagwise dispersions in one run | |
| rownames(design) <- colnames(disp) | |
| fit <- glmFit(disp,design) #Likelihood ratio tests | |
| deg <- glmLRT(fit, coef=ncol(fit$design)) | |
| degTable <- topTags(deg, n=nrow(deg$counts))$table | |
| #MAplot | |
| jpeg('edgeR_MAplot.jpeg', width=8, height=8, units="in", res=500) | |
| plotSmear(deg, de.tags=rownames(disp)[as.logical(decideTestsDGE(deg))]) | |
| abline(h=c(-1, 1), col="blue") | |
| dev.off() | |
| #Dispersionplot | |
| jpeg('edgeR_dispersionPlot.jpeg', width=8, height=8, units="in", res=500) | |
| plotBCV(disp) | |
| dev.off() | |
| if (substr(rownames(degTable)[1], 1, 4) == 'ENSG'){ | |
| output <- ens2symbol(degTable)[,c('gene', 'logFC', 'PValue', 'FDR', 'symbol')] | |
| } else { | |
| outputtemp <- cbind(gene=row.names(degTable), as.data.frame(degTable)) | |
| outputtemp$symbol <- "NA" | |
| output <- outputtemp[,c('gene', 'logFC', 'PValue', 'FDR', 'symbol')] } | |
| colnames(output) <- c("gene", "logFC", "pvalue", "FDR", "symbol") | |
| DEG <- subset(output, FDR < 0.1) | |
| write.table(output, file="edgeR_differential_expression.txt", sep="\t", row.names=FALSE, quote=FALSE) | |
| write.table(as.data.frame(DEG$symbol), file="edgeR_DEG.txt", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) #Simple gene list with the purpose of doing enrichment analysis or plotting a Venn diagram | |
| cat(paste("Found", nrow(subset(DEG, logFC > 0)), "upregulated genes and", nrow(subset(DEG, logFC < 0)), "downregulated genes with edgeR.\n", collapse=" ")) | |
| makevolcanoplot(mutate(output, sig=ifelse(output$FDR<0.1, "FDR<0.1", "Not Sig")), "edgeR") | |
| } | |
| makevolcanoplot <- function(input, proc) { #Create volcanoplot of results, with labels and colors | |
| volc = ggplot(input, aes(logFC, -log10(pvalue))) + geom_point(aes(col=sig)) + scale_color_manual(values=c("black", "red")) + ggtitle(proc) | |
| if (substr(input$gene[1], 1, 4) == 'ENSG'){ | |
| volc+geom_text_repel(data=head(input, 20), aes(label=symbol)) #If ENSG is present, the results were converted to symbol notation earlier and can use these | |
| } else { | |
| volc+geom_text_repel(data=head(input, 20), aes(label=gene)) } | |
| suppressMessages(ggsave(paste(proc, "Volcanoplot.jpeg", sep="_"), device="jpeg")) | |
| } | |
| indata = sanityCheck(unlist(strsplit(commandArgs(trailingOnly = TRUE)," "))) #Function to validate the structure of the supplied datafiles and returned as list. | |
| suppressMessages(library("annotables")) | |
| suppressMessages(library("pheatmap")) | |
| suppressMessages(library("RColorBrewer")) | |
| suppressMessages(library("ggplot2")) | |
| suppressMessages(library("dplyr")) | |
| suppressMessages(library("genefilter")) | |
| suppressMessages(library("rgl")) | |
| suppressMessages(library("ggrepel")) | |
| suppressMessages(library("matrixStats")) | |
| for (func in c(proc_edger, proc_limma_voom, proc_deseq2)) {func(indata)} | |
| cat("\nFinished!\n\n") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment