alyssafrazee · August 29, 2015 14:04
diff --git a/bioc.md b/bioc.md
diff --git a/diffexpression.R b/diffexpression.R
 ## edgeR tutorial
 # http://www.bioconductor.org/help/course-materials/2014/BioC2014/BioC2014_edgeR_voom.html
 library('edgeR')
 rm(list=ls())
 setwd("/data/RNA_SEQ_edgeR_voom_featureCounts/") ## on the amazon machine
 load('ds1.Rdata')  # stem cell data!
 setwd('-')

 colnames(counts) # first 2 column names give the cell type
 dim(counts) #30,727 by 27.

 grp = as.factor(substr(colnames(counts), 1, 2))
 table(grp)

 ## simple explorations
 o = order(grp)
 pairs(log2(counts[,o[1:7]]), pch=".", lower.panel=NULL)
 # expect that cells of same type will be most similar to each other

 # mds plots
 d = DGEList(counts=counts, group=grp)
 d = calcNormFactors(d)
 head(d$samples)
 cps = cpm(d)
 k = rowSums(cps>=1)>2 #filter to genes with expression > 2 cpm
 d = d[k,]
 dim(d) #21,707 rows

 cols = as.numeric(d$samples$group)
 plotMDS(d, col=cols)

 par(mfrow=c(2,2))
 plotMDS(d, col=cols, main='500 / 1LFC')
 plotMDS(d, col=cols, method='bcv', main='500 / BCV')
 plotMDS(d, col=cols, top=2000, main=' 2000 / 1LFC')
 plotMDS(d, col=cols, top=2000, method='bcv', main=' 2000 / BCV')

 mm = model.matrix(~ -1 + grp)
 d = estimateGLMCommonDisp(d, mm)
 d = estimateGLMTrendedDisp(d, mm)
 d = estimateGLMTagwiseDisp(d, mm)
 par(mfrow=c(1,1))
 plotBCV(d) # plot biological coefficient of variation

 plotMeanVar(d, show.raw=TRUE, show.tagwise=TRUE, show.binned=TRUE) 
 # used less frequently than mean/dispersion plot

 # MA plot in lecture...

 ## fit model
 f = glmFit(d, mm)
 con = makeContrasts("DE-ES"=grpDE-grpES, levels=colnames(mm))
 con

 lrt = glmLRT(f, contrast=con)
 topTags(lrt, 20)

 ## spot checks: plot expression for DE genes
 cps = cpm(d)
 o = order(colnames(counts)) 
 barplot(cps['ENSG00000095596',o], col=cols[o], las=2) #this is a huge difference


 ## same test without constructing contrasts (parameterization detail)
 grp = relevel(grp, ref="ES") # make ES the reference category
 mm1 = model.matrix(~grp) # model now has intercept
 f1 = glmFit(d, mm1)
 lrt1 = glmLRT(f1, coef=2)
 topTags(lrt1, 10)



 ## doing counting!
 library(parallel)
 detectCores() # this is a nice function

 # need a GTF file
 # BAM files: be careful about whether data is paired or not
 # also options to count multimapping, etc.

 ## NOT RUN
 library(Rsubread)
 anno = dir(, ".gtf$")
 anno
 f = dir(, ".bam$") # not on the instance, so this doesn't work
 f
 fcs = featureCounts(f[2], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE)
 fcpaired = featureCounts(f[1], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE, isPairedEnd=TRUE)

 ##
 setwd('/data/RNA_SEQ_edgeR_voom_featureCounts')
 load('ds3.Rdata')
 head(gid)
 head(md) # the metadata
 dx = DGEList(ecounts, group=md$treatment)
 dx = calcNormFactors(dx)
 xmm = model.matrix(~libtype + treatment, data=md)
 plotMDS(dx)

 # differential splicing
 vx = voom(dx, xmm, plot=TRUE)
 fx = lmFit(vx, xmm) #heteroskedastic regression at exon level
 # TODO find out about this heteroskedastic business
 # I mean, it's not heteroskedastic variance within an exon, ya?
 ex = diffSplice(fx, geneid=gid, exonid=eid)
 topSplice(ex)


 par(mfrow=c(1,2))
 plotSplice(ex, geneid='FBgn0005630')
 plotSplice(ex, geneid='FBgn0034072') # this is sweet.
diff --git a/rbioc_for_everyone.R b/rbioc_for_everyone.R
 ## Martin Morgan's R/Bioconductor for everyone!
 ## BioC 2014, 7/31/2014

 source('http://bioconductor.org/biocLite.R')

 # topics discussed:
 # everything is a vector
 # R makes it easy to conceputalize things from the inside out
 # "tilde" == "as a function of"

 # exercise!
 library(pasillaBamSubset)
 library(GenomicAlignments)
 library(ShortRead)

 flag = scanBamFlag(isMinusStrand=FALSE)
 param = ScanBamParam(what=c('seq', 'qual'), flag=flag)

 fl = untreated1_chr4() #it knows, I'm running this on my EC2 Bioc instance
 res = readGAlignments(fl, param=param)
 tail(sort(table(cigar(res))))
 # 91810 reads aligned with cigar 75M, no insertions or deletions.
 # 337 reads had 52 Matches, a 101-bp gap, and 23 more Matches

 abc = alphabetByCycle(mcols(res)$seq) ###OMG useful
 # this is a matrix
 abc = abc[1:4,] # M, R, etc. are not relevant nucleotides for this example
 matplot(t(abc)) #ohh that's nice
 matplot(t(abc), type='l', lwd=2, lty=1)
 legend('topright', legend=rownames(abc), col=1:4, lty=1, lwd=2)

 # look at the data
 # understand what you see
 # act appropriately

 # more examples/exercises available at: 
 # http://www.bioconductor.org/help/course-materials/2014/BioC2014/RBiocForEveryone-lab.pdf
	## edgeR tutorial
	# http://www.bioconductor.org/help/course-materials/2014/BioC2014/BioC2014_edgeR_voom.html
	library('edgeR')
	rm(list=ls())
	setwd("/data/RNA_SEQ_edgeR_voom_featureCounts/") ## on the amazon machine
	load('ds1.Rdata') # stem cell data!
	setwd('-')

	colnames(counts) # first 2 column names give the cell type
	dim(counts) #30,727 by 27.

	grp = as.factor(substr(colnames(counts), 1, 2))
	table(grp)

	## simple explorations
	o = order(grp)
	pairs(log2(counts[,o[1:7]]), pch=".", lower.panel=NULL)
	# expect that cells of same type will be most similar to each other

	# mds plots
	d = DGEList(counts=counts, group=grp)
	d = calcNormFactors(d)
	head(d$samples)
	cps = cpm(d)
	k = rowSums(cps>=1)>2 #filter to genes with expression > 2 cpm
	d = d[k,]
	dim(d) #21,707 rows

	cols = as.numeric(d$samples$group)
	plotMDS(d, col=cols)

	par(mfrow=c(2,2))
	plotMDS(d, col=cols, main='500 / 1LFC')
	plotMDS(d, col=cols, method='bcv', main='500 / BCV')
	plotMDS(d, col=cols, top=2000, main=' 2000 / 1LFC')
	plotMDS(d, col=cols, top=2000, method='bcv', main=' 2000 / BCV')

	mm = model.matrix(~ -1 + grp)
	d = estimateGLMCommonDisp(d, mm)
	d = estimateGLMTrendedDisp(d, mm)
	d = estimateGLMTagwiseDisp(d, mm)
	par(mfrow=c(1,1))
	plotBCV(d) # plot biological coefficient of variation

	plotMeanVar(d, show.raw=TRUE, show.tagwise=TRUE, show.binned=TRUE)
	# used less frequently than mean/dispersion plot

	# MA plot in lecture...

	## fit model
	f = glmFit(d, mm)
	con = makeContrasts("DE-ES"=grpDE-grpES, levels=colnames(mm))
	con

	lrt = glmLRT(f, contrast=con)
	topTags(lrt, 20)

	## spot checks: plot expression for DE genes
	cps = cpm(d)
	o = order(colnames(counts))
	barplot(cps['ENSG00000095596',o], col=cols[o], las=2) #this is a huge difference


	## same test without constructing contrasts (parameterization detail)
	grp = relevel(grp, ref="ES") # make ES the reference category
	mm1 = model.matrix(~grp) # model now has intercept
	f1 = glmFit(d, mm1)
	lrt1 = glmLRT(f1, coef=2)
	topTags(lrt1, 10)



	## doing counting!
	library(parallel)
	detectCores() # this is a nice function

	# need a GTF file
	# BAM files: be careful about whether data is paired or not
	# also options to count multimapping, etc.

	## NOT RUN
	library(Rsubread)
	anno = dir(, ".gtf$")
	anno
	f = dir(, ".bam$") # not on the instance, so this doesn't work
	f
	fcs = featureCounts(f[2], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE)
	fcpaired = featureCounts(f[1], annot.ext=anno, nthreads=2, isGTFAnnotationFile=TRUE, isPairedEnd=TRUE)

	##
	setwd('/data/RNA_SEQ_edgeR_voom_featureCounts')
	load('ds3.Rdata')
	head(gid)
	head(md) # the metadata
	dx = DGEList(ecounts, group=md$treatment)
	dx = calcNormFactors(dx)
	xmm = model.matrix(~libtype + treatment, data=md)
	plotMDS(dx)

	# differential splicing
	vx = voom(dx, xmm, plot=TRUE)
	fx = lmFit(vx, xmm) #heteroskedastic regression at exon level
	# TODO find out about this heteroskedastic business
	# I mean, it's not heteroskedastic variance within an exon, ya?
	ex = diffSplice(fx, geneid=gid, exonid=eid)
	topSplice(ex)


	par(mfrow=c(1,2))
	plotSplice(ex, geneid='FBgn0005630')
	plotSplice(ex, geneid='FBgn0034072') # this is sweet.
	## Martin Morgan's R/Bioconductor for everyone!
	## BioC 2014, 7/31/2014

	source('http://bioconductor.org/biocLite.R')

	# topics discussed:
	# everything is a vector
	# R makes it easy to conceputalize things from the inside out
	# "tilde" == "as a function of"

	# exercise!
	library(pasillaBamSubset)
	library(GenomicAlignments)
	library(ShortRead)

	flag = scanBamFlag(isMinusStrand=FALSE)
	param = ScanBamParam(what=c('seq', 'qual'), flag=flag)

	fl = untreated1_chr4() #it knows, I'm running this on my EC2 Bioc instance
	res = readGAlignments(fl, param=param)
	tail(sort(table(cigar(res))))
	# 91810 reads aligned with cigar 75M, no insertions or deletions.
	# 337 reads had 52 Matches, a 101-bp gap, and 23 more Matches

	abc = alphabetByCycle(mcols(res)$seq) ###OMG useful
	# this is a matrix
	abc = abc[1:4,] # M, R, etc. are not relevant nucleotides for this example
	matplot(t(abc)) #ohh that's nice
	matplot(t(abc), type='l', lwd=2, lty=1)
	legend('topright', legend=rownames(abc), col=1:4, lty=1, lwd=2)

	# look at the data
	# understand what you see
	# act appropriately

	# more examples/exercises available at:
	# http://www.bioconductor.org/help/course-materials/2014/BioC2014/RBiocForEveryone-lab.pdf