Last active
August 29, 2015 14:06
-
-
Save fdabl/16cbe18e2d0a79b7baf9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library('pwr') # for power calculations | |
library('foreign') # for dreadful spss .sav files | |
library('psych') # required by phack | |
source('http://rynesherman.com/phack.r') | |
SIM <- 10000 | |
set.seed(42) | |
options(warn=-1) # avoid warning by read.spss | |
############################################ | |
# Simulating p-hacking | |
############################################ | |
run_phack <- function() { | |
# call it later | |
res <- phack(initialN=30, hackrate=3, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=120, | |
alpha=0.05, alternative='two.sided', graph=TRUE, sims=3000) | |
par(mfrow=c(1,1)) # recover from phack plots | |
} | |
############################################ | |
# Distribution of p-values | |
############################################ | |
testit <- function(d) { | |
x <- rnorm(mean=0, sd=1, 100) | |
y <- rnorm(mean=d, sd=1, 100) | |
t.test(x, y)$p.value | |
} | |
simplot <- function(pvalues, effect=TRUE) { | |
h <- hist(pvalues) | |
h$counts <- h$counts / sum(h$counts) | |
title <- sprintf('Distribution of p-values given %s effect', ifelse(effect, 'an', 'no')) | |
plot(h, freq=FALSE, col='blue', main=title, xlab='p-values', ylab='percentage') | |
} | |
############################################ | |
# Relation between power and size of p-values | |
############################################ | |
power <- seq(0.1, 0.9, 0.1) | |
test <- function(power, d=0) { | |
n <- pwr.t.test(d=0.43, power=power, sig.level=0.05)$n | |
x <- rnorm(mean=0, sd=1, n) | |
y <- rnorm(mean=d, sd=1, n) | |
t.test(x, y)$p.value | |
} | |
pplot <- function(percent, power) { | |
barplot(percent, col='blue', xlab='power', ylab='p-values <= 0.02', names.arg=power, | |
main='Frequency of p-values <= 0.02 in p-values < 0.05 given H1') | |
} | |
########################################## | |
# P - Curve | |
# Logic of p-curve: | |
# Computes the p value of the p value, i.e. what is the probability of obtaining a p value | |
# as extreme or more as the one observed, given that the p values are uniformly distributed (H0). | |
# e.g. p = 0.01 => ppvalue of 0.2 | |
# generally: ppvalue = p/0.05 | |
# We use Fisher's method to get a chisq distribution for skew. | |
# | |
# Practical Example: | |
# p-curve analysis on a recent PLoS One paper demonstrating publication bias http://goo.gl/lxAkKx | |
################################# | |
# TODO: test against 33% Power [relies on non-central distributions] | |
fisher <- function(pvalues, right=TRUE) { | |
# test for right-skew per default | |
# -2 times the sum of the natural log of each of k | |
# uniform distributions is distributed X² with df = 2k | |
ppvalues <- sapply(pvalues, function(p) p/0.05) | |
if (!right) | |
ppvalues <- 1 - ppvalues | |
df <- 2 * length(ppvalues) | |
x <- -2 * sum(sapply(ppvalues, log)) | |
p <- 1 - pchisq(x, df) | |
return(list(p=p, df=df, value=x)) | |
} | |
curveplot <- function(pvalues) { | |
h <- hist(pvalues) | |
h$counts <- h$counts / sum(h$counts) # use percentages | |
title <- sprintf('Distribution of %i p-values', length(pvalues)) | |
plot(h, col='blue', xlab='p-values', ylab='Percentage', main=title) | |
} | |
############################################ | |
# #################### Main | |
############################################ | |
wait <- function(info) { | |
info <- sprintf('Press [enter] to %s \n', info) | |
readline(prompt=info) | |
} | |
# phacking | |
wait('simulate phacking') | |
cat('running simulations ... \n') | |
run_phack() | |
cat('\n') | |
# distribution | |
cat('running simulations (like, all the time) ... \n') | |
effect <- replicate(SIM, testit(0.4)) | |
noeffect <- replicate(SIM, testit(0)) | |
wait('see the distribution of p-values given no effect') | |
simplot(noeffect, effect=FALSE) | |
wait('see the distribution of p-values given an effect') | |
simplot(effect[effect <= 0.05]) | |
# relation power and p-values | |
wait('see the relation of power and p-values') | |
cat('needs more simulations (you know the drill) ... \n\n') | |
effect <- sapply(power, function(p) replicate(SIM / length(power), test(p, d=0.43))) | |
signp <- effect[effect <= 0.05] | |
smallp <- function(col) sum(col <= 0.02) / length(smallp) | |
percent <- apply(effect, 2, smallp) | |
pplot(percent, power) | |
# p-curve analysis | |
wait('see a p-curve analysis of 347 p-values') | |
data <- read.spss('plos.sav') | |
pvalues <- data$p | |
pvalues <- pvalues[!is.na(pvalues)] # remove NA | |
pvalues <- pvalues[pvalues <= 0.05] | |
f <- fisher(pvalues) | |
info <- sprintf('Heavily right-skewed, with Chi^2(%s) = %.2f, p = %.2f\n', f$df, f$value, f$p) | |
cat(info) | |
curveplot(pvalues) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment