############################################## # # R Code to reproduce DataColada 63 "Manylab overestimated importance of hidden moderators" # Written by Uri Simonsohn # email: urisohn@gmail.com # pleaes email me directly if you see any errors or have any questions. # last update : 2017 10 13 # ################################################################################################ library(metafor) #For meta-analysis #Outline: #1 Load the data posted by the ManyLabs folks. # 2) FUNCTIONS # 2.1) Function 1 - metak() runs one meta-analysis # 2.2) Function 2 - meta.boot() Bootstrapping # 2.3) Function 3 - rhist() Make summary histogram included in post #3) RUN ANALYSES #4) Make the histogram charts ######################################################### #1 Load the data posted by the ManyLabs folks. #BIG FILE #a=read.csv("http://datacolada.org/wp-content/uploads/2017/10/CleanedDataset.csv") #I obtained this file by converting the CleanedDataset.sav file posted on https://osf.io/sjkxc/files/ within the Datasets.zip file #It is a file with 39Mb, containing many variables thata are not of interest. #So I kept the first 100 variables to make the file more manageble: #a=a[,1:100] #write.csv(a,"Smaller-ManyLabs-Cleaned-Dataset-file.csv") #SMALLER (created in lines above) a=read.csv("http://datacolada.org/wp-content/uploads/2017/10/Smaller-ManyLabs-Cleaned-Dataset-file.csv") #This file has one row per participant, the key columns for the code below #referrer: name of the lab running the study (!) #Ranch1: rank ordered answers to anchoring question #1 (rank was done across all labs combined, so rank=100 means this participant was the 100th most extreme across all 6000+ participants) #Ranch2-4: idem #anch1group: high (1) vs low (0 anchor #anch2.4group: idem ########################################### # 2) FUNCTIONS # 2.1) Function 1 - runs all t-tests and does a meta-analysis metak=function(x,group,lab) { i=0 #counter n1=n0=m1=m0=s1=s0=c() #vectors for results of n,m,sd for (labk in unique(lab)) #loop over all labs { i=i+1 #For each lab, get the means, sd and n for each condition m1[i]=mean(x[group==1 & lab==labk],na.rm=T) #Mean s1[i]=sd(x[group==1 & lab==labk],na.rm=T) #SD n1[i]=sum(!is.na(x[group==1 & lab==labk])) #Count non NA observations m0[i]=mean(x[group==0 & lab==labk],na.rm=T) #Mean s0[i]=sd(x[group==0 & lab==labk],na.rm=T) #SD n0[i]=sum(!is.na(x[group==0 & lab==labk])) #Count non NA observations } #run meta-analysis with the specification from Fred's posted code. meta1=rma(measure="SMD", m1i=m1, m2i=m0, sd1i=s1, sd2i=s0, n1i=n1, n2i=n0,method="HE") #note: adjusted from "ManyLabs_Heterogeneity.R", line 60, i use method="HE" because the default REML sometimes does not converge #Output res1=c(meta1$b,sqrt(meta1$tau2), meta1$I2, meta1$QEp) #Keep the point estimate, $b, $tau2, I2 and the p-value for heterogeneity round(res1,5) } # 2.2) Function 2 - Bootstrapping meta.boot=function(seed,simtot,x,group) { set.seed(seed) #Seed for reproducibility res=matrix(nrow=simtot,ncol=4) #Empty matrix to store results #Loop itself (for boot) for (simk in 1:simtot) { lab.boot=sample(lab) #Shuffle the lab column res[simk,]=metak(x,group,lab.boot) #Run the new meta-analysis if (simk%%100==0) cat("...",simk) #Counter } #Vectors with results d.boot=res[,1] tau.boot=res[,2] i2.boot=res[,3] p.boot=res[,4] #Histograms par(mfrow=c(2,2)) hist(d.boot,main="Effect size") hist(p.boot,main="p-value for heterogeneity under the null") hist(i2.boot,main="Distribution of I2 under-null") hist(tau.boot,main="tau") #output results cat("\n\n\nShare significant:",mean(p.boot<.05)) cat("\nMedian I2",median(i2.boot)) cat("\nMedian tau",median(tau.boot)) list(p=p.boot, i2=i2.boot,tau=tau.boot) } #2.3) Function 3 Function that plots results in nice-looking histogram (used for Figure 1 & 2 in the post) histr=function(r,title,subtitle) { rk=r #this is used to identify which set of results to plot, the output of the (2.3) function #Histogram hist(rk$i2,main="",xaxt="n",yaxt="n",ylab="", xlab=expression(paste("I "^"2"~": Estimated heterogeneity")), breaks=15, xlim=c(0,70), col='dodgerblue',ylim=c(0,simtot/2)) #title mtext(side=3,title,line=2.25,font=2,cex=1.75) mtext(side=3,subtitle,line=.25,font=3,cex=1.25) #Y axis mtext(side=2,line=2.5,"% of simulations",cex=1.5) marks=seq(from=0,to=.5,by=.1)*simtot axis(side=2,at=marks,paste0((marks/simtot)*100,"%")) #x axis marks=seq(from=0,to=100,by=10) axis(side=1,at=marks,paste0(marks,"%")) #vertical abline(v=median(rk$i2),col='brown') #Inside the graph legend with summary results txt1=paste0("Median I2 : ",round(median(rk$i2),1),"%") txt2=paste0("Share p<.05: ",mean(rk$p<.05)*100,"%") text(40,.3*simtot,paste0(txt1,"\n",txt2),col='brown',adj=0,cex=1) } ######################################### #3) RUN ANALYSES #Create vectore with lab names lab=a$referrer #Add variable that show no effect and no heterogeneity #3.1 Generate normally distributed data to validate approach #3.1.1.0 Add variable with no effect set.seed(100) y1=rnorm(length(a$anch1group)) #The dv is N(0,1) for all labs and both conditions #3.1.1.1 that has homogeneous effect y2=rnorm(length(a$anch1group)) y2=ifelse(a$anch1group==1,y2+.5,y2) #the dvs is N(.5,1) for treatment conditions, N(0,1) for control, in all labs #3.1.1.2 Add variable that has heterogenoues effect, each lab has an effect proportional to its name length y3=rnorm(length(a$anch1group)) y3=ifelse(a$anch1group==1,y3+nchar(as.character(lab))/8,y2) #so nchar(as.character(lab)) counts how long the name of the lab is, and the effect is proportional to ti #note: lab has the name of teh labs, on average it is 4 characters long with sd=.18 #so it is a handy way to create heterogeneity l=nchar(as.character(lab))/8 mean(l) sd(l) #3.1.1.3 Verify i did this right, means shoudl differ by .5 for y2 and y3 mean(y2[a$anch1group==1],na.rm=TRUE) mean(y2[a$anch1group==0],na.rm=TRUE) mean(y3[a$anch1group==1],na.rm=TRUE) mean(y3[a$anch1group==0],na.rm=TRUE) #3.2 Run the simulations simtot=1000 x=a$Ranch1 group=a$anch1group #Then the 4 anchoring variables r1=meta.boot(seed=1231,simtot=simtot,x=a$Ranch1,group=a$anch1group) #Anchoring 1 r2=meta.boot(seed=1232,simtot=simtot,x=a$Ranch2,group=a$anch2group) #Anchoring 2 r3=meta.boot(seed=1233,simtot=simtot,x=a$Ranch3,group=a$anch3group) #Anchoring 3 r4=meta.boot(seed=1234,simtot=simtot,x=a$Ranch4,group=a$anch4group) #Anchoring 4 #Note: #See variables codebook at the top of this file #Now the normally distributed variables r5=meta.boot(seed=1235,simtot=simtot,x=y1,group=a$anch1group) #Normal data, no effect r6=meta.boot(seed=1236,simtot=simtot,x=y2,group=a$anch1group) #Normal data, homogenous effect r7=meta.boot(seed=1237,simtot=simtot,x=y3,group=a$anch1group) #Normal data, heterogeneous effect #4 Make the histogram charts #Figure 1 png("anchoring 2017 10 13.png", width=1800,height=1400,res=200) par(mfrow=c(2,2)) histr(r1,"Anchoring 1","") histr(r2,"Anchoring 2","") histr(r3,"Anchoring 3","") histr(r4,"Anchoring 4","") dev.off() #Figure 2 png("normal 2017 10 13.png", width=1800,height=600,res=150) par(mfrow=c(1,3)) histr(r5,title="Normally distributed (1)",subtitle="(Before shuffling: all labs d=0)") histr(r6,title="Normally distributed (2)",subtitle="(Before shuffling: all labs d=.5)") histr(r7,title="Normally distributed (3)",subtitle="(Before shuffling: d=.5, sd(d)=.2)") dev.off() #FIgure X: not shown in main post, but linked to from within body: #Histogram with the raw data on anchoring (well, "raw", after clenaed and ranktsranfomed) png("histogram anchoring.png", width=1800,height=1400,res=200) par(mfrow=c(2,2)) hist(a$Ranch1,breaks=100,main="Anchoring 1", xlab="Rank Transformed DV",col='cadetblue1') hist(a$Ranch2,breaks=100,main="Anchoring 2", xlab="Rank Transformed DV",col='cadetblue1') hist(a$Ranch3,breaks=100,main="Anchoring 3", xlab="Rank Transformed DV",col='cadetblue1') hist(a$Ranch4,breaks=100,main="Anchoring 4", xlab="Rank Transformed DV",col='cadetblue1') dev.off()