Skip to content

Instantly share code, notes, and snippets.

@crazyhottommy
Last active August 29, 2015 14:17
Show Gist options
  • Save crazyhottommy/f81d9b1f81a6adfa948c to your computer and use it in GitHub Desktop.
Save crazyhottommy/f81d9b1f81a6adfa948c to your computer and use it in GitHub Desktop.
### This part is from the Edx online Harvard course
## HarvardX: PH525.3x Advanced Statistics for the Life Sciences, week1
library(devtools)
install_github("genomicsclass/GSE5859Subset")
library(GSE5859Subset)
data(GSE5859Subset)
dim(geneExpression)
head(geneExpression)
dim(sampleInfo)
head(sampleInfo)
sampleInfo$filename
head(geneAnnotation)
g<- sampleInfo$group
# let's look at one single gene
e<- geneExpression[25,]
# t-test, expression should be normal distribution
qqnorm(e[g==1])
qqline(e[g==1])
qqnorm(e[g==0])
qqline(e[g==1])
# perform t-test
t.test(e[g==1], e[g==0])
# do t-test for all the genes
mytest<- function(x) t.test(x[g==1], x[g==0], var.equal=T)$p.value
## or we can use the genefilter package from bioconductor
## library(genefilter)
## results<- rowttests(geneExpression, factor(g))
pvals<- apply(geneExpression, 1, mytest)
sum(pvals< 0.05) # how many pvalues are smaller than 0.05
# there are 1383 genes with p value smaller than 0.05
# are all of them statistically different?
hist(pvals)
## simulate muliple comparasions with random data #####
m<- nrow(geneExpression)
n<- ncol(geneExpression)
# generate random numbers
randomData<- matrix(rnorm(n*m), m, n)
nullpvalues<- apply(randomData, 1, mytest)
hist(nullpvalues)
## order the pvals computed above and plot it.
alpha = 0.05
m = length(pvals)
#m is the number of 8793 comparisons
plot(x=seq(1,100), y=pvals[order(pvals)][1:100])
abline(a=0, b=alpha/m)
title("slop is alpha/m")
# let's zoom in to look at the first 15 p values from small to big
plot(x=seq(1,100), y=pvals[order(pvals)][1:100], xlim=c(1,15))
abline(a=0, b=alpha/m)
title("slop is alpha/m")
# we can see that the 14th p value is bigger than its own threshold
# which is computed by (0.05/m) * 14 = 7.960878e-05
# we will use p.adjust function and the method "fdr" or "BH" to
# correct the p value, what the p.adjust function does to to
# recalculate the p-value. ?p.adjust to see more
# p(i)<= (i/m) * alpha
# p(i) * m/i <= alpha
# we can then only accept the returned if p.adjust(pvals) <= alpha
# number of p values smaller than their own thresholds after controlling FDR=0.05
sum( p.adjust(pvals, method="fdr") < 0.05 )
# 13, the same as we saw from the figure
# use the function qvalue from the qvalue bioconductor package
sum( qvalue(pvals)$qvalues < 0.05)
# 17, less conservative than the BH method
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment