Last active
August 29, 2015 14:07
-
-
Save Lakens/7f516efbcc47de9d40a0 to your computer and use it in GitHub Desktop.
PcurveTrueAndPhackedStudiesFig1_Fig2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# phack | |
# Requires | |
# {psych} | |
# See Also http://rynesherman.com/blog/phack-an-r-function-for-examining-the-effects-of-p-hacking/ | |
phack <- function(initialN=50, hackrate=10, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=100, alpha=.05, alternative="greater", graph=TRUE, sims=100000) { | |
require(psych) | |
outmat <- matrix(NA, nrow=sims, ncol=6) | |
for(i in 1:sims) { | |
grp1 <- rnorm(initialN, grp1M, grp1SD) | |
grp2 <- rnorm(initialN, grp2M, grp2SD) | |
initial.r <- cor(c(grp1,grp2), c(rep(1,length(grp1)),rep(0,length(grp2)))) | |
hackcount <- 0 | |
initialp <- t.test(grp1, grp2, alternative=alternative)$p.value | |
currp <- initialp | |
while(currp > alpha & maxN > length(grp1)) { | |
hackcount <- hackcount + 1 | |
grp1 <- c(grp1, rnorm(hackrate, grp1M, grp1SD)) | |
grp2 <- c(grp2, rnorm(hackrate, grp2M, grp2SD)) | |
currp <- t.test(grp1, grp2, alternative=alternative)$p.value | |
} | |
Nadded <- length(grp1) - initialN | |
est.r <- cor(c(grp1, grp2), c(rep(1, length(grp1)),rep(0, length(grp2)))) | |
outmat[i,] <- c(initialp, hackcount, currp, Nadded, initial.r, est.r) | |
} | |
outmat <- data.frame(outmat) | |
colnames(outmat) <- c("Initial.p", "Hackcount", "Final.p", "NAdded", "Initial.r", "Final.r") | |
InitialSigProb <- sum(outmat$Initial.p <= alpha) / sims | |
AvgHacks <- mean(outmat$Hackcount) | |
FinalSigProb <- sum(outmat$Final.p <= alpha) / sims | |
AvgNAdded <- mean(outmat$NAdded) | |
AvgTotN <- mean(initialN + outmat$NAdded) | |
StopProb <- sum((initialN + outmat$NAdded)==maxN) / sims | |
cat("Proportion of Original Samples Statistically Significant =", InitialSigProb, "\n") | |
cat("Proportion of Samples Statistically Significant After Hacking =", FinalSigProb, "\n") | |
cat("Probability of Stopping Before Reaching Significance =", StopProb, "\n") | |
cat("Average Number of Hacks Before Significant/Stopping =", AvgHacks, "\n") | |
cat("Average N Added Before Significant/Stopping =", AvgNAdded, "\n") | |
cat("Average Total N", AvgTotN, "\n") | |
cat("Estimated r without hacking", round(fisherz2r(mean(fisherz(outmat$Initial.r))),2), "\n") | |
cat("Estimated r with hacking", round(fisherz2r(mean(fisherz(outmat$Final.r))),2), "\n") | |
cat("Estimated r with hacking", round(fisherz2r(mean(fisherz(outmat$Final.r[outmat$Final.p<alpha]))),2), "(non-significant results not included)", "\n") | |
if(graph==TRUE) { | |
op <- par(mfrow=c(2,1), las=1, font.main=1) | |
hist(outmat$Initial.p[outmat$Initial.p<alpha], xlab="p-values", main="P-curve for Initial Study", | |
sub=paste("Number of Significant Studies = ", InitialSigProb*sims, " (", InitialSigProb, ")", sep=""), col="cyan", freq=FALSE) | |
lines(density(outmat$Initial.p[outmat$Initial.p<alpha])) | |
hist(outmat$Final.p[outmat$Final.p<alpha], xlab="p-values", main="P-curve for p-Hacked Study", | |
sub=paste("Number of Significant Studies = ", FinalSigProb*sims, " (", FinalSigProb, ")", sep=""), col="cyan", freq=FALSE) | |
lines(density(outmat$Final.p[outmat$Final.p<alpha])) | |
} | |
return(outmat) | |
} | |
#P-hack 100000 studies | |
res <- phack(initialN=50, hackrate=10, grp1M=0, grp2M=0, grp1SD=1, grp2SD=1, maxN=100, alpha=.05, alternative="two.sided", graph=TRUE, sims=100000) | |
phack<-(res$Final.p) | |
phack<-phack[phack<0.05] | |
length(phack) | |
hist(phack, main="Collecting 50 participants,\nanalyze data after every participants\nuntil 100 participants are collected", xlab=("Observed p-value"), breaks =10) | |
#100000 studies with multiple testing | |
nSims <- 100000 #number of simulated experiments | |
pmult <-numeric(nSims) #set up empty container for all simulated p-values | |
for(i in 1:nSims){ #for each simulated experiment | |
x<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants | |
y1<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants | |
y2<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants | |
y3<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants | |
y4<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants | |
y5<-rnorm(n = 23, mean = 100, sd = 20) #produce simulated participants | |
t<-t.test(x,y1) #perform the t-test | |
if(t$p.value<0.05) { | |
pmult[i]<-t$p.value #get the p-value and store it | |
} | |
if(t$p.value>0.05) { | |
t<-t.test(x,y2) #perform the second t-test | |
if(t$p.value<0.05) { | |
pmult[i]<-t$p.value #get the p-value and store it | |
} | |
} | |
if(t$p.value>0.05) { | |
t<-t.test(x,y3) #perform the third t-test | |
if(t$p.value<0.05) { | |
pmult[i]<-t$p.value #get the p-value and store it | |
} | |
} | |
if(t$p.value>0.05) { | |
t<-t.test(x,y4) #perform the third t-test | |
if(t$p.value<0.05) { | |
pmult[i]<-t$p.value #get the p-value and store it | |
} | |
} | |
if(t$p.value>0.05) { | |
t<-t.test(x,y5) #perform the fifth t-test | |
pmult[i]<-t$p.value #get the p-value (regardless of whether significant and store it | |
} | |
} | |
pmult<-pmult[pmult<0.05] | |
length(pmult) | |
hist(pmult, main="Testing exp. condition against 1/5 control conditions", xlab=("Observed p-value"), breaks =10) | |
#100000 normal studies 50% power | |
nSims <- 100000 #number of simulated experiments | |
p <-numeric(nSims) #set up empty container for all simulated p-values | |
for(i in 1:nSims){ #for each simulated experiment | |
x<-rnorm(n = 49, mean = 100, sd = 20) #produce simulated participants | |
y<-rnorm(n = 49, mean = 108, sd = 20) #produce simulated participants | |
t<-t.test(x,y) #perform the t-test | |
p[i]<-t$p.value #get the p-value and store it | |
} | |
p<-p[p<0.05] | |
#plots | |
hist(p, main="100000 studies with 50% power\nonly true effects", xlab=("Observed p-value"), breaks =10) | |
ptot<-c(p,phack, pmult) | |
hist(ptot, main="100000 studies with 50% power and\n200000 studies with p-hacking", xlab=("Observed p-value"), breaks =10) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment