crazyhottommy · August 29, 2015 14:17
diff --git a/Pvalue_FDR_multiple_test.r b/Pvalue_FDR_multiple_test.r
 ### This part is from the Edx online Harvard course 
 ## HarvardX: PH525.3x Advanced Statistics for the Life Sciences, week1

 library(devtools)
 install_github("genomicsclass/GSE5859Subset")

 library(GSE5859Subset)
 data(GSE5859Subset)
 dim(geneExpression)

 head(geneExpression)

 dim(sampleInfo)
 head(sampleInfo)
 sampleInfo$filename

 head(geneAnnotation)
 g<- sampleInfo$group

 # let's look at one single gene
 e<- geneExpression[25,]

 # t-test, expression should be normal distribution
 qqnorm(e[g==1])
 qqline(e[g==1])

 qqnorm(e[g==0])
 qqline(e[g==1])

 # perform t-test
 t.test(e[g==1], e[g==0])

 # do t-test for all the genes
 mytest<- function(x) t.test(x[g==1], x[g==0], var.equal=T)$p.value

 ## or we can use the genefilter package from bioconductor
 ## library(genefilter)
 ## results<- rowttests(geneExpression, factor(g))

 pvals<- apply(geneExpression, 1, mytest)

 sum(pvals< 0.05)  # how many pvalues are smaller than 0.05

 # there are 1383 genes with p value smaller than 0.05
 # are all of them statistically different?
 hist(pvals)

 ## simulate muliple comparasions with random data #####

 m<- nrow(geneExpression)
 n<- ncol(geneExpression)

 # generate random numbers
 randomData<- matrix(rnorm(n*m), m, n)
 nullpvalues<- apply(randomData, 1, mytest)
 hist(nullpvalues)

 ## order the pvals computed above and plot it.
 alpha = 0.05
 m = length(pvals)
 #m is the number of 8793 comparisons 

 plot(x=seq(1,100), y=pvals[order(pvals)][1:100])
 abline(a=0, b=alpha/m)
 title("slop is alpha/m")
 # let's zoom in to look at the first 15 p values from small to big

 plot(x=seq(1,100), y=pvals[order(pvals)][1:100], xlim=c(1,15))
 abline(a=0, b=alpha/m)
 title("slop is alpha/m")
 # we can see that the 14th p value is bigger than its own threshold 
 # which is computed by (0.05/m) * 14 = 7.960878e-05

 # we will use p.adjust function and the method "fdr" or "BH" to
 # correct the p value, what the p.adjust function does to to
 # recalculate the p-value. ?p.adjust to see more
 # p(i)<= (i/m) * alpha 
 # p(i) * m/i <= alpha
 # we can then only accept the returned if p.adjust(pvals) <= alpha
 # number of p values smaller than their own thresholds after controlling FDR=0.05

 sum( p.adjust(pvals, method="fdr") < 0.05 )
 # 13, the same as we saw from the figure

 # use the function qvalue from the qvalue bioconductor package
 sum( qvalue(pvals)$qvalues < 0.05)
 # 17, less conservative than the BH method
	### This part is from the Edx online Harvard course
	## HarvardX: PH525.3x Advanced Statistics for the Life Sciences, week1

	library(devtools)
	install_github("genomicsclass/GSE5859Subset")

	library(GSE5859Subset)
	data(GSE5859Subset)
	dim(geneExpression)

	head(geneExpression)

	dim(sampleInfo)
	head(sampleInfo)
	sampleInfo$filename

	head(geneAnnotation)
	g<- sampleInfo$group

	# let's look at one single gene
	e<- geneExpression[25,]

	# t-test, expression should be normal distribution
	qqnorm(e[g==1])
	qqline(e[g==1])

	qqnorm(e[g==0])
	qqline(e[g==1])

	# perform t-test
	t.test(e[g==1], e[g==0])

	# do t-test for all the genes
	mytest<- function(x) t.test(x[g==1], x[g==0], var.equal=T)$p.value

	## or we can use the genefilter package from bioconductor
	## library(genefilter)
	## results<- rowttests(geneExpression, factor(g))

	pvals<- apply(geneExpression, 1, mytest)

	sum(pvals< 0.05) # how many pvalues are smaller than 0.05

	# there are 1383 genes with p value smaller than 0.05
	# are all of them statistically different?
	hist(pvals)

	## simulate muliple comparasions with random data #####

	m<- nrow(geneExpression)
	n<- ncol(geneExpression)

	# generate random numbers
	randomData<- matrix(rnorm(n*m), m, n)
	nullpvalues<- apply(randomData, 1, mytest)
	hist(nullpvalues)

	## order the pvals computed above and plot it.
	alpha = 0.05
	m = length(pvals)
	#m is the number of 8793 comparisons

	plot(x=seq(1,100), y=pvals[order(pvals)][1:100])
	abline(a=0, b=alpha/m)
	title("slop is alpha/m")
	# let's zoom in to look at the first 15 p values from small to big

	plot(x=seq(1,100), y=pvals[order(pvals)][1:100], xlim=c(1,15))
	abline(a=0, b=alpha/m)
	title("slop is alpha/m")
	# we can see that the 14th p value is bigger than its own threshold
	# which is computed by (0.05/m) * 14 = 7.960878e-05

	# we will use p.adjust function and the method "fdr" or "BH" to
	# correct the p value, what the p.adjust function does to to
	# recalculate the p-value. ?p.adjust to see more
	# p(i)<= (i/m) * alpha
	# p(i) * m/i <= alpha
	# we can then only accept the returned if p.adjust(pvals) <= alpha
	# number of p values smaller than their own thresholds after controlling FDR=0.05

	sum( p.adjust(pvals, method="fdr") < 0.05 )
	# 13, the same as we saw from the figure

	# use the function qvalue from the qvalue bioconductor package
	sum( qvalue(pvals)$qvalues < 0.05)
	# 17, less conservative than the BH method