fdabl · August 29, 2015 14:06
diff --git a/pcurve.R b/pcurve.R
 library('pwr') # for power calculations
 library('foreign') # for dreadful spss .sav files
 library('psych') # required by phack
 source('http://rynesherman.com/phack.r')

 SIM <- 10000
 set.seed(42)
 options(warn=-1) # avoid warning by read.spss
 ############################################
 # Simulating p-hacking
 ############################################

 run_phack <- function() {
    # call it later
    res <- phack(initialN=30, hackrate=3, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=120,
                 alpha=0.05, alternative='two.sided', graph=TRUE, sims=3000)
    par(mfrow=c(1,1)) # recover from phack plots
 }


 ############################################
 # Distribution of p-values
 ############################################

 testit <- function(d) {
    x <- rnorm(mean=0, sd=1, 100)
    y <- rnorm(mean=d, sd=1, 100)
    t.test(x, y)$p.value
 }

 simplot <- function(pvalues, effect=TRUE) {
    h <- hist(pvalues)
    h$counts <- h$counts / sum(h$counts)
    title <- sprintf('Distribution of p-values given %s effect', ifelse(effect, 'an', 'no'))
    plot(h, freq=FALSE, col='blue', main=title, xlab='p-values', ylab='percentage')
 }


 ############################################
 # Relation between power and size of p-values
 ############################################
 power <- seq(0.1, 0.9, 0.1)

 test <- function(power, d=0) {
    n <- pwr.t.test(d=0.43, power=power, sig.level=0.05)$n
    x <- rnorm(mean=0, sd=1, n)
    y <- rnorm(mean=d, sd=1, n)
    t.test(x, y)$p.value
 }

 pplot <- function(percent, power) {
    barplot(percent, col='blue', xlab='power', ylab='p-values <= 0.02', names.arg=power,
            main='Frequency of p-values <= 0.02 in p-values < 0.05 given H1')
 }

 ##########################################
 # P - Curve
 # Logic of p-curve:
 # Computes the p value of the p value, i.e. what is the probability of obtaining a p value
 # as extreme or more as the one observed, given that the p values are uniformly distributed (H0).

 # e.g. p = 0.01 => ppvalue of 0.2
 # generally: ppvalue = p/0.05
 # We use Fisher's method to get a chisq distribution for skew.
 #
 # Practical Example:
 # p-curve analysis on a recent PLoS One paper demonstrating publication bias http://goo.gl/lxAkKx
 #################################

 # TODO: test against 33% Power [relies on non-central distributions]

 fisher <- function(pvalues, right=TRUE) {
    # test for right-skew per default
    # -2 times the sum of the natural log of each of k
    # uniform distributions is distributed X² with df = 2k
    ppvalues <- sapply(pvalues, function(p) p/0.05)

    if (!right)
        ppvalues <- 1 - ppvalues

    df <- 2 * length(ppvalues)
    x <- -2 * sum(sapply(ppvalues, log))
    p <- 1 - pchisq(x, df)
    return(list(p=p, df=df, value=x))
 }


 curveplot <- function(pvalues) {
    h <- hist(pvalues)
    h$counts <- h$counts / sum(h$counts) # use percentages
    title <- sprintf('Distribution of %i p-values', length(pvalues))
    plot(h, col='blue', xlab='p-values', ylab='Percentage', main=title)
 }


 ############################################
 # ####################    Main 
 ############################################
 wait <- function(info) {
    info <- sprintf('Press [enter] to %s \n', info)
    readline(prompt=info)
 }

 # phacking
 wait('simulate phacking')
 cat('running simulations ... \n')
 run_phack()
 cat('\n')


 # distribution
 cat('running simulations (like, all the time) ... \n')
 effect <- replicate(SIM, testit(0.4))
 noeffect <- replicate(SIM, testit(0))

 wait('see the distribution of p-values given no effect')
 simplot(noeffect, effect=FALSE)

 wait('see the distribution of p-values given an effect')
 simplot(effect[effect <= 0.05])


 # relation power and p-values
 wait('see the relation of power and p-values')
 cat('needs more simulations (you know the drill) ... \n\n')

 effect <- sapply(power, function(p) replicate(SIM / length(power), test(p, d=0.43)))
 signp <- effect[effect <= 0.05]
 smallp <- function(col) sum(col <= 0.02) / length(smallp)
 percent <- apply(effect, 2, smallp)
 pplot(percent, power)


 # p-curve analysis
 wait('see a p-curve analysis of 347 p-values')

 data <- read.spss('plos.sav')
 pvalues <- data$p
 pvalues <- pvalues[!is.na(pvalues)] # remove NA
 pvalues <- pvalues[pvalues <= 0.05]

 f <- fisher(pvalues)
 info <- sprintf('Heavily right-skewed, with Chi^2(%s) = %.2f, p = %.2f\n', f$df, f$value, f$p)
 cat(info)
 curveplot(pvalues)
	library('pwr') # for power calculations
	library('foreign') # for dreadful spss .sav files
	library('psych') # required by phack
	source('http://rynesherman.com/phack.r')

	SIM <- 10000
	set.seed(42)
	options(warn=-1) # avoid warning by read.spss
	############################################
	# Simulating p-hacking
	############################################

	run_phack <- function() {
	# call it later
	res <- phack(initialN=30, hackrate=3, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=120,
	alpha=0.05, alternative='two.sided', graph=TRUE, sims=3000)
	par(mfrow=c(1,1)) # recover from phack plots
	}


	############################################
	# Distribution of p-values
	############################################

	testit <- function(d) {
	x <- rnorm(mean=0, sd=1, 100)
	y <- rnorm(mean=d, sd=1, 100)
	t.test(x, y)$p.value
	}

	simplot <- function(pvalues, effect=TRUE) {
	h <- hist(pvalues)
	h$counts <- h$counts / sum(h$counts)
	title <- sprintf('Distribution of p-values given %s effect', ifelse(effect, 'an', 'no'))
	plot(h, freq=FALSE, col='blue', main=title, xlab='p-values', ylab='percentage')
	}


	############################################
	# Relation between power and size of p-values
	############################################
	power <- seq(0.1, 0.9, 0.1)

	test <- function(power, d=0) {
	n <- pwr.t.test(d=0.43, power=power, sig.level=0.05)$n
	x <- rnorm(mean=0, sd=1, n)
	y <- rnorm(mean=d, sd=1, n)
	t.test(x, y)$p.value
	}

	pplot <- function(percent, power) {
	barplot(percent, col='blue', xlab='power', ylab='p-values <= 0.02', names.arg=power,
	main='Frequency of p-values <= 0.02 in p-values < 0.05 given H1')
	}

	##########################################
	# P - Curve
	# Logic of p-curve:
	# Computes the p value of the p value, i.e. what is the probability of obtaining a p value
	# as extreme or more as the one observed, given that the p values are uniformly distributed (H0).

	# e.g. p = 0.01 => ppvalue of 0.2
	# generally: ppvalue = p/0.05
	# We use Fisher's method to get a chisq distribution for skew.
	#
	# Practical Example:
	# p-curve analysis on a recent PLoS One paper demonstrating publication bias http://goo.gl/lxAkKx
	#################################

	# TODO: test against 33% Power [relies on non-central distributions]

	fisher <- function(pvalues, right=TRUE) {
	# test for right-skew per default
	# -2 times the sum of the natural log of each of k
	# uniform distributions is distributed X² with df = 2k
	ppvalues <- sapply(pvalues, function(p) p/0.05)

	if (!right)
	ppvalues <- 1 - ppvalues

	df <- 2 * length(ppvalues)
	x <- -2 * sum(sapply(ppvalues, log))
	p <- 1 - pchisq(x, df)
	return(list(p=p, df=df, value=x))
	}


	curveplot <- function(pvalues) {
	h <- hist(pvalues)
	h$counts <- h$counts / sum(h$counts) # use percentages
	title <- sprintf('Distribution of %i p-values', length(pvalues))
	plot(h, col='blue', xlab='p-values', ylab='Percentage', main=title)
	}


	############################################
	# #################### Main
	############################################
	wait <- function(info) {
	info <- sprintf('Press [enter] to %s \n', info)
	readline(prompt=info)
	}

	# phacking
	wait('simulate phacking')
	cat('running simulations ... \n')
	run_phack()
	cat('\n')


	# distribution
	cat('running simulations (like, all the time) ... \n')
	effect <- replicate(SIM, testit(0.4))
	noeffect <- replicate(SIM, testit(0))

	wait('see the distribution of p-values given no effect')
	simplot(noeffect, effect=FALSE)

	wait('see the distribution of p-values given an effect')
	simplot(effect[effect <= 0.05])


	# relation power and p-values
	wait('see the relation of power and p-values')
	cat('needs more simulations (you know the drill) ... \n\n')

	effect <- sapply(power, function(p) replicate(SIM / length(power), test(p, d=0.43)))
	signp <- effect[effect <= 0.05]
	smallp <- function(col) sum(col <= 0.02) / length(smallp)
	percent <- apply(effect, 2, smallp)
	pplot(percent, power)


	# p-curve analysis
	wait('see a p-curve analysis of 347 p-values')

	data <- read.spss('plos.sav')
	pvalues <- data$p
	pvalues <- pvalues[!is.na(pvalues)] # remove NA
	pvalues <- pvalues[pvalues <= 0.05]

	f <- fisher(pvalues)
	info <- sprintf('Heavily right-skewed, with Chi^2(%s) = %.2f, p = %.2f\n', f$df, f$value, f$p)
	cat(info)
	curveplot(pvalues)