|
## edgeR tutorial |
|
# http://www.bioconductor.org/help/course-materials/2014/BioC2014/BioC2014_edgeR_voom.html |
|
library('edgeR') |
|
rm(list=ls()) |
|
setwd("/data/RNA_SEQ_edgeR_voom_featureCounts/") ## on the amazon machine |
|
load('ds1.Rdata') # stem cell data! |
|
setwd('-') |
|
|
|
colnames(counts) # first 2 column names give the cell type |
|
dim(counts) #30,727 by 27. |
|
|
|
grp = as.factor(substr(colnames(counts), 1, 2)) |
|
table(grp) |
|
|
|
## simple explorations |
|
o = order(grp) |
|
pairs(log2(counts[,o[1:7]]), pch=".", lower.panel=NULL) |
|
# expect that cells of same type will be most similar to each other |
|
|
|
# mds plots |
|
d = DGEList(counts=counts, group=grp) |
|
d = calcNormFactors(d) |
|
head(d$samples) |
|
cps = cpm(d) |
|
k = rowSums(cps>=1)>2 #filter to genes with expression > 2 cpm |
|
d = d[k,] |
|
dim(d) #21,707 rows |
|
|
|
cols = as.numeric(d$samples$group) |
|
plotMDS(d, col=cols) |
|
|
|
par(mfrow=c(2,2)) |
|
plotMDS(d, col=cols, main='500 / 1LFC') |
|
plotMDS(d, col=cols, method='bcv', main='500 / BCV') |
|
plotMDS(d, col=cols, top=2000, main=' 2000 / 1LFC') |
|
plotMDS(d, col=cols, top=2000, method='bcv', main=' 2000 / BCV') |
|
|
|
mm = model.matrix(~ -1 + grp) |
|
d = estimateGLMCommonDisp(d, mm) |
|
d = estimateGLMTrendedDisp(d, mm) |
|
d = estimateGLMTagwiseDisp(d, mm) |
|
par(mfrow=c(1,1)) |
|
plotBCV(d) # plot biological coefficient of variation |
|
|
|
plotMeanVar(d, show.raw=TRUE, show.tagwise=TRUE, show.binned=TRUE) |
|
# used less frequently than mean/dispersion plot |
|
|
|
# MA plot in lecture... |
|
|
|
## fit model |
|
f = glmFit(d, mm) |
|
con = makeContrasts("DE-ES"=grpDE-grpES, levels=colnames(mm)) |
|
con |
|
|
|
lrt = glmLRT(f, contrast=con) |
|
topTags(lrt, 20) |
|
|
|
## spot checks: plot expression for DE genes |
|
cps = cpm(d) |
|
o = order(colnames(counts)) |
|
barplot(cps['ENSG00000095596',o], col=cols[o], las=2) #this is a huge difference |
|
|
|
|
|
## same test without constructing contrasts (parameterization detail) |
|
grp = relevel(grp, ref="ES") # make ES the reference category |
|
mm1 = model.matrix(~grp) # model now has intercept |
|
f1 = glmFit(d, mm1) |
|
lrt1 = glmLRT(f1, coef=2) |
|
topTags(lrt1, 10) |
|
|
|
|
|
|
|
## doing counting! |
|
library(parallel) |
|
detectCores() # this is a nice function |
|
|
|
# need a GTF file |
|
# BAM files: be careful about whether data is paired or not |
|
# also options to count multimapping, etc. |
|
|
|
## NOT RUN |
|
library(Rsubread) |
|
anno = dir(, ".gtf$") |
|
anno |
|
f = dir(, ".bam$") # not on the instance, so this doesn't work |
|
f |
|
fcs = featureCounts(f[2], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE) |
|
fcpaired = featureCounts(f[1], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE, isPairedEnd=TRUE) |
|
|
|
## |
|
setwd('/data/RNA_SEQ_edgeR_voom_featureCounts') |
|
load('ds3.Rdata') |
|
head(gid) |
|
head(md) # the metadata |
|
dx = DGEList(ecounts, group=md$treatment) |
|
dx = calcNormFactors(dx) |
|
xmm = model.matrix(~libtype + treatment, data=md) |
|
plotMDS(dx) |
|
|
|
# differential splicing |
|
vx = voom(dx, xmm, plot=TRUE) |
|
fx = lmFit(vx, xmm) #heteroskedastic regression at exon level |
|
# TODO find out about this heteroskedastic business |
|
# I mean, it's not heteroskedastic variance within an exon, ya? |
|
ex = diffSplice(fx, geneid=gid, exonid=eid) |
|
topSplice(ex) |
|
|
|
|
|
par(mfrow=c(1,2)) |
|
plotSplice(ex, geneid='FBgn0005630') |
|
plotSplice(ex, geneid='FBgn0034072') # this is sweet. |