Last active
August 29, 2015 14:17
-
-
Save crazyhottommy/f81d9b1f81a6adfa948c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### This part is from the Edx online Harvard course | |
## HarvardX: PH525.3x Advanced Statistics for the Life Sciences, week1 | |
library(devtools) | |
install_github("genomicsclass/GSE5859Subset") | |
library(GSE5859Subset) | |
data(GSE5859Subset) | |
dim(geneExpression) | |
head(geneExpression) | |
dim(sampleInfo) | |
head(sampleInfo) | |
sampleInfo$filename | |
head(geneAnnotation) | |
g<- sampleInfo$group | |
# let's look at one single gene | |
e<- geneExpression[25,] | |
# t-test, expression should be normal distribution | |
qqnorm(e[g==1]) | |
qqline(e[g==1]) | |
qqnorm(e[g==0]) | |
qqline(e[g==1]) | |
# perform t-test | |
t.test(e[g==1], e[g==0]) | |
# do t-test for all the genes | |
mytest<- function(x) t.test(x[g==1], x[g==0], var.equal=T)$p.value | |
## or we can use the genefilter package from bioconductor | |
## library(genefilter) | |
## results<- rowttests(geneExpression, factor(g)) | |
pvals<- apply(geneExpression, 1, mytest) | |
sum(pvals< 0.05) # how many pvalues are smaller than 0.05 | |
# there are 1383 genes with p value smaller than 0.05 | |
# are all of them statistically different? | |
hist(pvals) | |
## simulate muliple comparasions with random data ##### | |
m<- nrow(geneExpression) | |
n<- ncol(geneExpression) | |
# generate random numbers | |
randomData<- matrix(rnorm(n*m), m, n) | |
nullpvalues<- apply(randomData, 1, mytest) | |
hist(nullpvalues) | |
## order the pvals computed above and plot it. | |
alpha = 0.05 | |
m = length(pvals) | |
#m is the number of 8793 comparisons | |
plot(x=seq(1,100), y=pvals[order(pvals)][1:100]) | |
abline(a=0, b=alpha/m) | |
title("slop is alpha/m") | |
# let's zoom in to look at the first 15 p values from small to big | |
plot(x=seq(1,100), y=pvals[order(pvals)][1:100], xlim=c(1,15)) | |
abline(a=0, b=alpha/m) | |
title("slop is alpha/m") | |
# we can see that the 14th p value is bigger than its own threshold | |
# which is computed by (0.05/m) * 14 = 7.960878e-05 | |
# we will use p.adjust function and the method "fdr" or "BH" to | |
# correct the p value, what the p.adjust function does to to | |
# recalculate the p-value. ?p.adjust to see more | |
# p(i)<= (i/m) * alpha | |
# p(i) * m/i <= alpha | |
# we can then only accept the returned if p.adjust(pvals) <= alpha | |
# number of p values smaller than their own thresholds after controlling FDR=0.05 | |
sum( p.adjust(pvals, method="fdr") < 0.05 ) | |
# 13, the same as we saw from the figure | |
# use the function qvalue from the qvalue bioconductor package | |
sum( qvalue(pvals)$qvalues < 0.05) | |
# 17, less conservative than the BH method | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment