explodecomputer · June 30, 2014 10:53
diff --git a/permutation.R b/permutation.R
 m <- data.frame(matrix(rnorm(1000*20), 1000, 20))

 y = rnorm(1000)

 performScan <- function(y, dat)
 {
 	n <- ncol(dat)
 	res <- array(0, n)
 	for(i in 1:n)
 	{
 		n[i] <- anova(lm(y ~ dat[,i]))$P[1]
 	}
 	return(n)
 }

 permutations <- function(y, dat, nperm)
 {
 	res <- array(0, nperm)
 	for(i in 1:nperm)
 	{
 		cat(i, "\n")

 		# Randomly shuffle the y vector, but don't change dat
 		# This means that y1 is no longer related to dat
 		# but dat still has the same correlation structure
 		y1 <- sample(y, length(y), replace=FALSE)

 		# Perform the full analysis of y1 against all dat variables
 		p <- performScan(y1, dat)

 		# Just keep the smallest p value - 
 		# this p value should have no biological relevance because y is not related to dat
 		# But it will give some indication as to how much correlation structure there is in dat
 		res[i] <- min(p)
 	}

 	# Sort the saved p values
 	res <- sort(res, decreasing=FALSE)

 	# The 0.05 family wise error rate is the 5%th most extreme pvalue
 	threshold <- res[round(0.05*nperm)]
 	return(list(pvals=res, threshold=threshold))
 }


 perms <- permutations(y, dat, 100)


 # Distribution of p values that you would expect by chance if the null hypotheses are true
 hist(-log10(perms$pvals))

 # this is the threshold
 perms$threshold
	m <- data.frame(matrix(rnorm(1000*20), 1000, 20))

	y = rnorm(1000)

	performScan <- function(y, dat)
	{
	n <- ncol(dat)
	res <- array(0, n)
	for(i in 1:n)
	{
	n[i] <- anova(lm(y ~ dat[,i]))$P[1]
	}
	return(n)
	}

	permutations <- function(y, dat, nperm)
	{
	res <- array(0, nperm)
	for(i in 1:nperm)
	{
	cat(i, "\n")

	# Randomly shuffle the y vector, but don't change dat
	# This means that y1 is no longer related to dat
	# but dat still has the same correlation structure
	y1 <- sample(y, length(y), replace=FALSE)

	# Perform the full analysis of y1 against all dat variables
	p <- performScan(y1, dat)

	# Just keep the smallest p value -
	# this p value should have no biological relevance because y is not related to dat
	# But it will give some indication as to how much correlation structure there is in dat
	res[i] <- min(p)
	}

	# Sort the saved p values
	res <- sort(res, decreasing=FALSE)

	# The 0.05 family wise error rate is the 5%th most extreme pvalue
	threshold <- res[round(0.05*nperm)]
	return(list(pvals=res, threshold=threshold))
	}


	perms <- permutations(y, dat, 100)


	# Distribution of p values that you would expect by chance if the null hypotheses are true
	hist(-log10(perms$pvals))

	# this is the threshold
	perms$threshold