Last active
August 12, 2018 14:21
-
-
Save dinovski/7e7eba1d390e35f87b97b48c2cbd34ba to your computer and use it in GitHub Desktop.
differential expression analysis markdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "__RNAseq Summary__" | |
output: pdf_document | |
date: '`r Sys.Date()`' | |
documentclass: article | |
classoption: a4paper | |
geometry: margin=2cm | |
--- | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = FALSE, cache = FALSE, message=FALSE, warning=FALSE) | |
``` | |
## *Differential Expression Analysis* | |
Genes with CPM > 0.5 in at least 2 samples were kept for differential analysis. This threshold was chosen based on the average log2CPM per gene to separate expressed genes from unexpressed genes. Raw count data was normalized with the TMM method and transformed to log2-CPM. A linear model was fit to the normalized data and empirical Bayes statistics were computed. Differentially expressed genes for each KO versus WT were identified from the linear fit after adjusting for multiple testing and filtered to include those with FDR < 0.05 and absolute logFC > 1. | |
R dependencies can be found at the end. | |
\pagebreak | |
```{r, echo=FALSE, message=FALSE} | |
require(rmarkdown) | |
require(edgeR) | |
require(limma) | |
require(Glimma) | |
require(ggplot2) | |
require(RColorBrewer) | |
require(gridExtra) | |
require(rtracklayer) | |
require(plyr) | |
require(dplyr) | |
require(reshape2) | |
require(DESeq2) | |
require(gplots) | |
require(ggrepel) | |
require(knitr) | |
require(xtable) | |
require(FactoMineR) | |
require(GenomicFeatures) | |
require(scater) | |
``` | |
```{r, echo=FALSE, message=FALSE, warning=FALSE} | |
inPath='' | |
outPath='' | |
geneBedPath='gencode.v19.annotation.clean.bed' | |
gtfPath='gencode.v19.annotation.gtf' | |
statsPath='' ## output from mapStats.sh | |
``` | |
```{r, echo=FALSE, results='hide', message=FALSE} | |
## load annotation and count files | |
geneBed<-read.table(geneBedPath, header=FALSE, sep="\t") | |
colnames(geneBed)<-c("chr","start","end","gene_id","gene_name") | |
## COUNT FILES: STAR | |
exprs.in <- list.files(path=inPath, pattern=".tab", full.names=TRUE, recursive=TRUE) | |
## apply(countFile[,2:4], 2, sum) | |
exprs.all <- lapply(exprs.in, function(file) { | |
countFile <- read.table(file, skip=4, header=FALSE, sep="\t") | |
countFile <- countFile[, c(1,4)] | |
name <- gsub(".R1_norRNAReadsPerGene.out.tab", "", basename(file)) | |
colnames(countFile) <- c("gene_id", name) | |
return(countFile) | |
}) | |
countTable <- Reduce(function(x,y) merge(x,y, all=TRUE, by="gene_id"), exprs.all) | |
ge<-countTable | |
rownames(ge)<-ge$gene_id | |
ge$gene_id<-NULL | |
``` | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=8, fig.width=8, eval=TRUE} | |
## load stat files | |
stats.in<-list.files(path=statsPath, pattern=".txt", full.names=TRUE, recursive=FALSE) | |
options(scipen=999) | |
stats.all <- lapply(stats.in, function(file) { | |
inFile <- read.table(file, header=TRUE, sep=":", colClasses=c("character","numeric")) | |
f.name <- gsub("_mapStats.txt", "", basename(file)) | |
return(inFile) | |
}) | |
statsTable <- Reduce(function(x,y) merge(x,y, all=TRUE, by="ID"), stats.all) | |
rownames(statsTable)<-gsub(" ", "_", statsTable$ID) | |
statsTable$ID<-NULL | |
stats<-data.frame(t(as.matrix(statsTable))) | |
stats$CLONE<-rownames(stats) | |
options(scipen=0) | |
``` | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=8, fig.width=8} | |
dups<-stats[,c("CLONE","DUP_RATE")] | |
dups$DUP_RATE=dups$DUP_RATE*100 | |
dups<-dups[order(dups$DUP_RATE, decreasing=T), c(1,2)] | |
dp<-ggplot(data=dups, aes(x=CLONE, y=DUP_RATE)) + ylim(0,100) + | |
geom_bar(stat="identity", fill="steelblue4") + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="white"), | |
axis.text=element_text(size=12), | |
axis.text.x=element_text(face="bold", size=10, angle=270), | |
axis.title.x=element_text(size=12), | |
axis.title.y=element_text(size=12), | |
panel.border=element_rect(colour="white", fill=NA, size=1)) | |
dp | |
``` | |
\pagebreak | |
## *Mapping Stats* | |
```{r,echo=FALSE, fig.width=8, fig.height=8, fig.align='center', eval=TRUE} | |
## bam file; use input (ie. post trimming) | |
stats.s<-stats[order(stats$CLONE, decreasing=T), c("CLONE", "TRIMMED", "PRIMARY_BAM", "UNIQUE_BAM")] | |
colnames(stats.s)<-c("CLONE", "TOTAL" , "MAPPED", "UNIQUE") | |
#stats.s<-stats[order(stats$CLONE, decreasing=T), c("CLONE", "STAR_INPUT", "STAR_UNIQUE_NUM")] | |
#colnames(stats.s)<-c("CLONE", "TOTAL" , "UNIQUE") | |
dat.m <- melt(stats.s, id.vars="CLONE") | |
## overlay | |
## set position="dodge" to plot side by side | |
p1<-ggplot(dat.m, aes(x=CLONE,y=value, fill=variable)) + | |
geom_bar(stat="identity", position = "identity", alpha=1, width=0.9) + ylab("read count") + | |
scale_fill_manual("variable", values = c(TOTAL="firebrick", MAPPED="steelblue4", UNIQUE= "lightblue3")) + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="white"), | |
axis.text=element_text(size=12), | |
axis.text.x=element_text(face="bold", size=10, angle=270), | |
axis.title.x=element_text(size=12), | |
axis.title.y=element_text(size=12), | |
panel.border=element_rect(colour="white", fill=NA, size=1)) | |
p1 | |
``` | |
\pagebreak | |
```{r,echo=FALSE, fig.width=8, fig.height=8, fig.align='center', eval=TRUE} | |
## percentages | |
stats.p<-stats[,c("CLONE", "TRIMMED" ,"PRIMARY_BAM", "UNIQUE_BAM")] | |
stats.p$MAPPED=stats.p$PRIMARY_BAM/stats.p$TRIMMED * 100 | |
stats.p$UNIQUE=stats.p$UNIQUE_BAM/stats.p$TRIMMED * 100 | |
stats.p$TOTAL=100 | |
stats.p<-stats.p[order(stats.p$TOTAL, decreasing=T), c("CLONE", "TOTAL", "MAPPED", "UNIQUE")] | |
dat.m <- melt(stats.p, id.vars="CLONE") | |
p2<-ggplot(dat.m, aes(x=CLONE, y=value, fill=variable)) + | |
geom_bar(stat="identity", position = "identity", alpha=1, width=0.9) + ylab("% reads") + | |
scale_fill_manual("variable", values = c(TOTAL="firebrick", MAPPED="steelblue4", UNIQUE= "lightblue3")) + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="white"), | |
axis.text=element_text(size=12), | |
axis.text.x=element_text(face="bold", size=10, angle=270), | |
axis.title.x=element_text(size=12), | |
axis.title.y=element_text(size=12), | |
panel.border=element_rect(colour="white", fill=NA, size=1)) | |
p2 | |
``` | |
\pagebreak | |
## *gene based saturation* | |
```{r, echo=FALSE, echo=F, message=F, fig.width=8, fig.height=5, fig.align='center', eval=TRUE} | |
## gene based saturation: downsample reads and quantify num. genes with logCPM>0 | |
## Estimate saturation of genes based on rarefaction of reads | |
## counts : a matrix of counts | |
## max_reads : maximum number of reads to downsample | |
## ndepths : resampling levels | |
## nreps : number of times the subsampling is performed to calculate a variance | |
## mincounts : minimum counts level to consider a gene as expressed. If NA, the threshold is fixed to 1 CPM | |
## extend.lines : If TRUE, the max number of detected genes is returned when the maximum number of reads is reached. | |
estimate_saturation <- function(counts, max_reads=Inf, ndepths=6, nreps=1, mincounts=NA, extend.lines=FALSE){ | |
stopifnot(require(S4Vectors)) | |
counts <- as.matrix(counts) | |
readsums <- colSums(counts) | |
max_reads <- min(max(readsums), max_reads) | |
depths <- round(seq(from=0, to=max_reads, length.out=ndepths+1)) | |
saturation <- mclapply(1:ncol(counts), function(k){ | |
message("Processing sample ", colnames(counts)[k], "...") | |
x <- counts[,k] | |
nreads <- sum(x) | |
## minimum expression levels | |
if (is.na(mincounts)){ | |
mincounts <- nreads / 1e6 | |
} | |
## max number of detected genes | |
ngenes_detected <- length(which(x>=mincounts)) | |
probs <- x / nreads ## calculate gene probabilities for the library | |
probs <- probs[probs > 0] ## zero counts add nothing but computational time! | |
ngenes <- length(probs) | |
res <- lapply(depths, function(dp, nreps=1, ...){ | |
rsim <- c(NA, NA) | |
if (extend.lines) | |
rsim <- c(ngenes_detected, NA) | |
if (dp <= nreads){ | |
estim <- lapply(1:nreps, function(i, ngenes, dp, probs, mincounts){ | |
csim <- sample(x=ngenes, size=dp, replace=TRUE, prob=probs) | |
length(which(runLength(Rle(sort(csim)))>=mincounts)) | |
}, ngenes=ngenes, dp=dp, probs=probs, mincounts=mincounts) | |
rsim <- c(mean(unlist(estim)), var(unlist(estim))) | |
} | |
return(rsim) | |
}, nreps=nreps, nreads=nreads, ngenes=ngenes, probs=probs, mincounts=mincounts) | |
data.frame(depths=depths, | |
sat.estimates=sapply(res, "[[", 1), | |
sat.var.estimates=sapply(res, "[[", 2)) | |
}) | |
names(saturation) <- colnames(counts) | |
return(saturation) | |
} | |
sat <- estimate_saturation(counts=ge, ndepths=10, nreps=1, extend.lines=TRUE) | |
d2p <- melt(sat, id.vars=c("depths", "sat.estimates", "sat.var.estimates")) | |
selcol<-colorRampPalette(brewer.pal(9, "Set1"))(length(sat)) | |
gg <- ggplot(d2p) + geom_point(aes(x=depths, y=sat.estimates), color="darkgray", shape=3, size=2) + geom_path(aes(x=depths, y=sat.estimates, color=L1)) + | |
theme_classic() + theme(legend.position="bottom", legend.title=element_blank()) + | |
labs(x="Sequencing depth", y="Number of detected genes (>=1 CPM)") + theme(axis.title = element_text(face="bold", colour="black", size=12)) + | |
scale_color_manual(values=selcol, guide=guide_legend(nrow=ceiling(length(sat)/6))) | |
gg | |
``` | |
```{r preseq, echo=FALSE, eval=FALSE} | |
preseq.in <- list.files(inPath, pattern=".preseq", recursive=TRUE, full.names=TRUE) | |
if (length(preseq.in) > 0){ | |
preseq.info <- lapply(preseq.in, read.table, header=TRUE) | |
names(preseq.info) <- bioids[basename(sub("preseq","",dirname(preseq.in)))] | |
d2p <- melt(preseq.info, id.vars=c("total_reads","distinct_reads")) | |
selcol <- colorRampPalette(brewer.pal(9, "Set1"))(length(preseq.info)) | |
gg <- ggplot(d2p) + geom_point(aes(x=total_reads, y=distinct_reads), color="darkgray", shape=3, size=2) + geom_path(aes(x=total_reads, y=distinct_reads, color=L1)) + | |
theme_classic() + theme(legend.position="bottom", legend.title=element_blank()) + | |
labs(x="Total Reads Number", y="Distinct Reads Number") + theme(axis.title = element_text(face="bold", colour="black", size=12)) + | |
scale_color_manual(values=selcol, guide=guide_legend(nrow=ceiling(length(preseq.info)/6))) | |
gg | |
#ggly_b <- plotly_build(gg) | |
#ggly_b <- inactivate_hover(ggly_b) | |
#ggly_b <- plotly_margins(ggly_b) | |
#ggly_b | |
}else{ | |
warning("Preseq output files not found !") | |
} | |
``` | |
```{r, echo=FALSE, results='hide', message=FALSE, eval=FALSE} | |
## TPM from raw counts: scale by feature length then by scale factor: [count/len]/[sum(count/len)/10^6] | |
## sum of all TPMs in each sample are the same, ie. easier to compare samples | |
# ## get gene sizes based on exons > exons per gene id>reduce to non overlapping exons, sum lengths | |
# txdb <- makeTxDbFromGFF(gtfPath, format="gtf") | |
# exons.per.gene <- exonsBy(txdb, by="gene") | |
# exonic.gene.sizes <- sum(width(reduce(exons.per.gene))) | |
# exonic.gene.sizes <- exonic.gene.sizes[rownames(ge)] | |
# | |
# ge.sce<-newSCESet(countData=ge) | |
# ge.tpm<-calculateTPM(ge.sce, calc_from="counts", effective_length=exonic.gene.sizes) | |
# ge.tpm<-round(ge.tpm, 2) | |
# | |
# ge.tpm<-ge.tpm[apply(ge.tpm, MARGIN=1, function(x) { any(x>0) }),] | |
# | |
# df.tpm<-as.data.frame(ge.tpm) | |
# df.tpm$gene_id<-rownames(df.tpm) | |
# | |
# ge.tpm.merge<-merge(geneBed, df.tpm, by="gene_id", all.x=T) | |
# write.table(ge.tpm.merge, file=paste0(outPath,"prc2integrative_tpm.txt"), quote=F, row.names=T, sep="\t") | |
## manual TPM calculation | |
## Calculate reads per kilobase | |
#rpk <- ge * 10^3/matrix(exonic.gene.sizes, nrow=nrow(ge), ncol=ncol(ge), byrow=FALSE) | |
## normalize by lib size | |
#tpm <- rpk * matrix(10^6 / colSums(rpk), nrow=nrow(rpk), ncol=ncol(rpk), byrow=TRUE) | |
#tpm<-round(tpm,2) | |
``` | |
## *CPM filter* | |
```{r,echo=FALSE, fig.width=8, fig.height=4, eval=TRUE} | |
x<-ge | |
cpm <- edgeR::cpm(x) | |
lcpm <- edgeR::cpm(x, log=TRUE) | |
## raw data | |
nsamples <- ncol(x) | |
col <- brewer.pal(nsamples, "Paired") | |
par(mfrow=c(1,2)) | |
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2, | |
main="", xlab="") | |
title(main="Raw data", xlab="Log-cpm") | |
abline(v=0, lty=3) | |
for (i in 2:nsamples){ | |
den <- density(lcpm[,i]) | |
lines(den$x, den$y, col=col[i], lwd=2) | |
} | |
#legend("topright", samplenames, text.col=col, bty="n") | |
## filter data | |
keep.exprs <- rowSums(cpm>1)>=2 | |
x <- x[keep.exprs,] | |
lcpm <- edgeR::cpm(x, log=TRUE) | |
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.21), las=2, | |
main="", xlab="") | |
title(main="Filtered data", xlab="Log-cpm") | |
abline(v=0, lty=3) | |
for (i in 2:nsamples){ | |
den <- density(lcpm[,i]) | |
lines(den$x, den$y, col=col[i], lwd=2) | |
} | |
#legend("topright", samplenames, text.col=col, bty="n") | |
``` | |
\pagebreak | |
```{r, echo=FALSE, results='hide', message=FALSE} | |
## filter lowly expressed genes | |
#ge<-ge[apply(ge, MARGIN=1, function(x) { any(x>0) }),] | |
ge.cpm <- edgeR::cpm(ge, log=FALSE) | |
## check threshold: compute av. log2CPM per gene | |
a.cpm<-edgeR::aveLogCPM(ge) | |
#hist(a.cpm, xlab="CPM") | |
#abline(v=log(0)) | |
keep.exprs <- rowSums(ge.cpm > 0.5)>=2 | |
ge <- ge[keep.exprs,] | |
## experiment details | |
names.tmp<-gsub("-1", "", colnames(ge)) | |
names.tmp<-gsub("-2", "", names.tmp) | |
condition=dput(names.tmp) | |
exp.all <- data.frame(samples=colnames(ge), | |
condition=condition) | |
## rlog | |
condition<-factor(condition) | |
dds <- DESeqDataSetFromMatrix(ge, DataFrame(condition), ~condition) | |
ge.rlog <- rlog(dds, blind=FALSE) | |
ge.rlog <- assay(ge.rlog) | |
``` | |
## *Hierarchical clustering* | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, results="asis"} | |
plot(hclust(as.dist(1 - cor(ge.rlog, method = "pearson")), method="ward.D2"), label=exp.all$condition) | |
``` | |
## *PCA by sample* | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, results="hide"} | |
resPCA <- PCA(t(ge.rlog), scale.unit=FALSE, graph=FALSE) | |
label <- dput(rownames(resPCA$ind$coord)) | |
ggplot(as.data.frame(resPCA$ind$coord), aes(x=resPCA$ind$coord[,1],y=resPCA$ind$coord[,2], label=label)) + | |
geom_point() + xlab(paste("PC1 ", round(resPCA$eig[1,2]),"% of variance", sep="")) + | |
ylab(paste("PC2 ",round(resPCA$eig[2,2]),"% of variance",sep="")) + geom_text(vjust=1.5) + | |
geom_hline(yintercept=0, linetype="dashed") + | |
geom_vline(xintercept=0, linetype="dashed") + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="black"), | |
panel.border=element_rect(colour="black", fill=NA, size=1), | |
plot.title = element_text(size=18, face = "bold"), | |
axis.text=element_text(size=16, family="sans", colour="black"), | |
axis.title.x=element_text(size=16, family="sans", colour="black"), | |
axis.title.y=element_text(size=16, family="sans", colour="black")) | |
``` | |
\pagebreak | |
## *Differential Expression* | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6} | |
## fit model to individual covariate | |
exp.sel<-c("WT", "Ring1AB", "Ring1AB_BAP1") | |
## limma+voom: input raw data: apply log2CPM norm with voom | |
exp <- exp.all[exp.all$condition %in% exp.sel,] | |
ge.filt <- ge[colnames(ge) %in% exp$samples] | |
dge<-DGEList(counts=ge.filt[,colnames(ge.filt)], genes=rownames(ge.filt), group=exp$condition) | |
## design matrix for experimental conditions | |
cond <- factor(exp$condition) | |
model <- model.matrix(~0+cond) | |
colnames(model) <- levels(cond) | |
rownames(model) <- exp$samples | |
## TMM Normalization | |
dge <- calcNormFactors(dge, method="TMM") | |
## limma+voom: convert counts to log2(CPM) to estimate mean-variance relationship | |
v <- voom(dge, model, plot=FALSE) | |
vfit <- lmFit(v, model) | |
## output TMM norm log2CPM | |
## run all contrasts at once, then specify column of model in topTreat | |
contr.matrix <- makeContrasts( | |
Ring1ABvWT = Ring1AB - WT, | |
Ring1AB_BAP1vRing1AB = Ring1AB_BAP1 - Ring1AB, | |
Ring1AB_BAP1vWT = Ring1AB_BAP1 - WT, | |
levels = colnames(model)) | |
## compute coeffs and standard errors for contrast | |
vfit.c <- contrasts.fit(vfit, contrasts=contr.matrix) | |
efit <- eBayes(vfit.c) | |
## sig DE genes | |
dt<-decideTests(efit, adjust.method = "BH", p.value = 0.05) | |
dt.1<-dt | |
##### | |
## Top DE | |
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=1) | |
## MD plot | |
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=1) | |
#hist(tt.all$P.Value) | |
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr) | |
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr") | |
## add gene names | |
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE) | |
## output top-ranked genes from lmfit | |
contrast<-colnames(contr.matrix)[1] | |
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t") | |
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not")) | |
p<-ggplot() | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1) | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1) | |
## avg expression (log2CPM) v. fold change (logFC) | |
md<-p + geom_hline(yintercept=0, linetype="dashed") + | |
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) + | |
ggtitle(contrast) + | |
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) + | |
xlab("avg logCPM") + ylab("logFC") + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="black"), | |
panel.border=element_rect(colour="black", fill=NA, size=1), | |
plot.title = element_text(size=18, face = "bold"), | |
axis.text=element_text(size=16, family="sans", colour="black"), | |
axis.title.x=element_text(size=16, family="sans", colour="black"), | |
axis.title.y=element_text(size=16, family="sans", colour="black")) | |
md | |
``` | |
\pagebreak | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6} | |
## Top DE | |
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=2) | |
## MD plot | |
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=2) | |
#hist(tt.all$P.Value) | |
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr) | |
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr") | |
## add gene names | |
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE) | |
## output top-ranked genes from lmfit | |
contrast<-colnames(contr.matrix)[2] | |
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t") | |
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not")) | |
p<-ggplot() | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1) | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1) | |
## avg expression (log2CPM) v. fold change (logFC) | |
md<-p + geom_hline(yintercept=0, linetype="dashed") + | |
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) + | |
ggtitle(contrast) + | |
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) + | |
xlab("avg logCPM") + ylab("logFC") + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="black"), | |
panel.border=element_rect(colour="black", fill=NA, size=1), | |
plot.title = element_text(size=18, face = "bold"), | |
axis.text=element_text(size=16, family="sans", colour="black"), | |
axis.title.x=element_text(size=16, family="sans", colour="black"), | |
axis.title.y=element_text(size=16, family="sans", colour="black")) | |
md | |
#+ geom_text_repel(data=head(df.fit.sig, 1), aes(label=gene_id)) | |
``` | |
\pagebreak | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6} | |
## Top DE | |
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=3) | |
## MD plot | |
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=3) | |
#hist(tt.all$P.Value) | |
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr) | |
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr") | |
## add gene names | |
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE) | |
## output top-ranked genes from lmfit | |
contrast<-colnames(contr.matrix)[3] | |
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t") | |
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not")) | |
p<-ggplot() | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1) | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1) | |
## avg expression (log2CPM) v. fold change (logFC) | |
md<-p + geom_hline(yintercept=0, linetype="dashed") + | |
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) + | |
ggtitle(contrast) + | |
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) + | |
xlab("avg logCPM") + ylab("logFC") + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="black"), | |
panel.border=element_rect(colour="black", fill=NA, size=1), | |
plot.title = element_text(size=18, face = "bold"), | |
axis.text=element_text(size=16, family="sans", colour="black"), | |
axis.title.x=element_text(size=16, family="sans", colour="black"), | |
axis.title.y=element_text(size=16, family="sans", colour="black")) | |
md | |
``` | |
\pagebreak | |
## *DEA 2* | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6} | |
exp.sel<-c("WT", "Ring1A", "Ring1B", "Ring1AB") | |
## limma+voom: input raw data: apply log2CPM norm with voom | |
exp <- exp.all[exp.all$condition %in% exp.sel,] | |
ge.filt <- ge[colnames(ge) %in% exp$samples] | |
dge<-DGEList(counts=ge.filt[,colnames(ge.filt)], genes=rownames(ge.filt), group=exp$condition) | |
## design matrix for experimental conditions | |
cond <- factor(exp$condition) | |
model <- model.matrix(~0+cond) | |
colnames(model) <- levels(cond) | |
rownames(model) <- exp$samples | |
## TMM Normalization | |
dge <- calcNormFactors(dge, method="TMM") | |
## limma+voom: convert counts to log2(CPM) to estimate mean-variance relationship | |
v <- voom(dge, model, plot=FALSE) | |
vfit <- lmFit(v, model) | |
## run all contrasts at once, then specify contrast | |
contr.matrix <- makeContrasts( | |
Ring1AvRing1B = Ring1A - Ring1B, | |
Ring1AvRing1AB = Ring1A - Ring1AB, | |
Ring1BvRing1AB = Ring1B - Ring1AB, | |
Ring1AvWT = Ring1A - WT, | |
Ring1BvWT = Ring1B - WT, | |
Ring1ABvWT = Ring1AB - WT, | |
levels = colnames(model)) | |
## compute coeffs and standard errors for contrast | |
vfit.c <- contrasts.fit(vfit, contrasts=contr.matrix) | |
efit <- eBayes(vfit.c) | |
## sig DE genes: contrast 1 | |
dt<-decideTests(efit, adjust.method = "BH", p.value = 0.05) | |
dt.2<-dt | |
## Top DE | |
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=1) | |
## MD plot | |
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=1) | |
#hist(tt.all$P.Value) | |
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr) | |
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr") | |
## add gene names | |
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE) | |
## output top-ranked genes from lmfit | |
contrast<-colnames(contr.matrix)[1] | |
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t") | |
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not")) | |
p<-ggplot() | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1) | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1) | |
## avg expression (log2CPM) v. fold change (logFC) | |
md<-p+geom_hline(yintercept=0, linetype="dashed") + | |
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) + | |
ggtitle(contrast) + | |
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) + | |
xlab("avg logCPM") + ylab("logFC") + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="black"), | |
panel.border=element_rect(colour="black", fill=NA, size=1), | |
plot.title = element_text(size=18, face = "bold"), | |
axis.text=element_text(size=16, family="sans", colour="black"), | |
axis.title.x=element_text(size=16, family="sans", colour="black"), | |
axis.title.y=element_text(size=16, family="sans", colour="black")) | |
md | |
``` | |
\pagebreak | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6} | |
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=2) | |
## MD plot | |
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=2) | |
#hist(tt.all$P.Value) | |
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr) | |
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr") | |
## add gene names | |
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE) | |
## output top-ranked genes from lmfit | |
contrast<-colnames(contr.matrix)[2] | |
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t") | |
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not")) | |
p<-ggplot() | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1) | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1) | |
## avg expression (log2CPM) v. fold change (logFC) | |
md<-p + geom_hline(yintercept=0, linetype="dashed") + | |
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) + | |
ggtitle(contrast) + | |
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) + | |
xlab("avg logCPM") + ylab("logFC") + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="black"), | |
panel.border=element_rect(colour="black", fill=NA, size=1), | |
plot.title = element_text(size=18, face = "bold"), | |
axis.text=element_text(size=16, family="sans", colour="black"), | |
axis.title.x=element_text(size=16, family="sans", colour="black"), | |
axis.title.y=element_text(size=16, family="sans", colour="black")) | |
md | |
``` | |
\pagebreak | |
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.height=5, fig.width=6} | |
tt <- topTreat(efit, n=Inf, adjust.method="BH", p.value=0.05, coef=3) | |
## MD plot | |
tt.all <- topTreat(efit, n=Inf, adjust.method="BH", coef=3) | |
#hist(tt.all$P.Value) | |
df.fit<-data.frame(tt.all$genes, tt.all$logFC, tt.all$P.Value, tt.all$adj.P.Val, tt.all$AveExpr) | |
names(df.fit)<-c("gene_id", "logFC", "pval", "padj", "AveExpr") | |
## add gene names | |
df.names<-merge(geneBed, df.fit, by="gene_id", all.y=TRUE) | |
## output top-ranked genes from lmfit | |
contrast<-colnames(contr.matrix)[3] | |
write.table(df.names, file=paste0(outPath, contrast, ".txt"), quote=F, row.names=F, sep="\t") | |
df.fit.sig <- plyr::mutate(df.fit, sig=ifelse(tt.all$adj.P.Val<0.05, "sig", "not")) | |
p<-ggplot() | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="not",], aes(AveExpr,logFC,color=sig), size=1) | |
p<-p+geom_point(data=df.fit.sig[df.fit.sig$sig=="sig",], aes(AveExpr,logFC,color=sig), size=1) | |
## avg expression (log2CPM) v. fold change (logFC) | |
md<-p + geom_hline(yintercept=0, linetype="dashed") + | |
scale_color_manual(values=c("grey30", "firebrick"), guide=FALSE) + | |
ggtitle(contrast) + | |
#coord_cartesian(xlim=c(-5,13), ylim=c(-10,10)) + | |
xlab("avg logCPM") + ylab("logFC") + | |
theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), | |
panel.background=element_blank(), axis.line=element_line(colour="black"), | |
panel.border=element_rect(colour="black", fill=NA, size=1), | |
plot.title = element_text(size=18, face = "bold"), | |
axis.text=element_text(size=16, family="sans", colour="black"), | |
axis.title.x=element_text(size=16, family="sans", colour="black"), | |
axis.title.y=element_text(size=16, family="sans", colour="black")) | |
md | |
``` | |
\pagebreak | |
```{r session, results="asis"} | |
si<-sessionInfo() | |
print(si) | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment