#R Code for DataColada68 - Pilot dropping can backfire
#Written by Uri Simonsohn (urisohn@gmail.com)
#Please contact Uri directly if you see any errors or have any comments.
#
#
###############################################################################
#Outline
#1) Generate the data
#2) Run t-tests for each dataset for each observation between 10 and 100
#3) Plot false-positive for each sample size (should be 5% across the board)
#4) Function to simulate pilot-dropping - fun.pilot.dropping()
#5) Run some examples
#6) Function to generate plots in the post - plotter()
#7) Figure in the post
###############################################################################
#1 Generate entire simtot studies with n observations per cell each
set.seed(1999)
#How many simulations
simtot=1000 #Set to just 1000 here, but I run and saved a version with 100k that I rely on below
#How big is each published study
n=100
#Draw the data
x1=matrix(nrow=n,ncol=simtot,rnorm(simtot*n)) #simtot simulated studies, each with n=100, we take the first k as the "pilot"
x2=matrix(nrow=n,ncol=simtot,rnorm(simtot*n))
#2 Run t-test after every observations for each study, with n between 11 and 100
ps=matrix(nrow=n-10,ncol=simtot) #store results here
for (simk in 1:simtot) { #loop over simulations
for (nk in 11:n) { #loop over observations
ps[(nk-10),simk]=t.test(x1[1:nk,simk],x2[1:nk,simk])$p.val #t-test
}}
#Instead of rerunning each time, i was loading the ps[] results after having run it with simtot=100000
#setwd("c:/dropbox/datacolada/68 Dropping pilots")
#load("Simulated 100k studies.RData")
#3 Plot false-positive for each sample size (should be 5% across the board)
fp=c()
for (n in 1:90) fp=c(fp,mean(ps[n,]<.05))
plot(11:100,fp,ylim=c(0,.1),
main="verifying prob(p<.05)=.05 for every n",
xlab="How many observations are included?",ylab="prob(p<.05)")
abline(h=.05,col='red4')
#4 Function to compute effect of pilot-dropping
fun.pilot.dropping=function(n1,n2,p1,p2,quiet=F)
{
row=n1-10 #Row to lookup in the ps[] matrix with t-test results,
#subtract 10 becuase we only look at samples starting with n=11 so if n=11 that's the first row in the result table
p_final=ps[n2-10,(ps[row,]