#DataColada 88 - Computing an approximate CI around the hot hand estimate of miller and sanjurjo

  rm(list = ls())
  library('rio')
  GVT.Study4=import("D:/Dropbox/Papers/6301-6400/6378 - Mller Sanjuanjo - Supplement-Data_Code_Statistics/0-RAWDATA/GilovichValloneTversky--CognitivePsychology--1985_CornellData.xls")
  #Spreadsheet original found in the supplement from the Econometrica paper, posted to http://datacolada.org/appendix/88
 
  
#number of players
   N.players=length(unique(GVT.Study4$sid))  #Number of players

#1. Function 1 - compute outcomes after 3 shots 
  compute.hot3=function(shots)
  {
    n=length(shots)          #hot many shots total
    shot.next=shots[4:n]     #take the last n-3 shots as the DV
    shot.p1=shots[3:(n-1)]   #predict with 1 lag 
    shot.p2=shots[2:(n-2)]    #2 lags
    shot.p3=shots[1:(n-3)]    #3 lag
    HHH=(shot.p1==shot.p2 & shot.p2==shot.p3 & shot.p1==1)  #1/0: converted after HHH
    MMM=(shot.p1==shot.p2 & shot.p2==shot.p3 & shot.p1==0)  #1/0: converted after MMM
    mean.HHH=mean(shot.next[HHH==T]) #Average conversion after HHH
    mean.MMM=mean(shot.next[MMM==T]) #Average conversion after MMM
    hot3=mean.HHH-mean.MMM
    count.HHH=sum(HHH)
    count.MMM=sum(MMM)
    c(n,mean(shots),mean.HHH,mean.MMM,count.HHH,count.MMM,hot3)
  }
  
#2. Example of Function 1: player 1 in Table II in Miller & Sanjurjo, has .5 and .44, with 12 and 9 such streaks, reproduced here
    playerk=subset(GVT.Study4,sid==101)
    compute.hot3(playerk$make) #  shots - phit  phit|HHH phit|MMM  hot.hand
    
    
#3 Function that computes Table II  in Miller & Sanjurjo
    compute.table.II=function(datak)
      {
      #Empty table first
        table.II=matrix(nrow=N.players,ncol=7)
      #Set of IDs to compute
        sid.all=unique(datak$sid)
      #Counter
        k=1
      #loop over ids
        for (sidk in unique(datak$sid))
          {
          playerk=subset(datak,sid==sidk)
          table.II[k,]=compute.hot3(playerk$make)
          k=k+1
        }
      #Output the table
        table.II=data.frame(table.II)
        names(table.II)=c("shots",'p','p.HHH','p.MMM','N.HHH','N.MMM','hothand.3')
        return(table.II)
    }
    
#4 Reproduce Table II with observed data
    #Table
      t2=compute.table.II(GVT.Study4)
      t2
    #Summary row
      round(colMeans(t2,na.rm=T),3)

#5 Bootstrap the table under the null
      df=GVT.Study4  #Copy the data
       
  #function to bootstrap under the null 
      compute.summary.row.boot=function(df,replace,btot)
      {
      #Make empty results table
        table.boot=matrix(nrow=btot, ncol=7)
      #Get all IDs
        sid.all=unique(GVT.Study4$sid)  #set of Player IDS
      #Bootstrap loop
        for (bk in 1:btot)
        {
        #Copy dataset
          df.null=df
        #Make data under null for each player 
          for (sidk in sid.all)
          {
          #Subset the player from real data
            playerk=subset(df,sid==sidk)
          #Shuffle their shots
            playerk$make.shuffle=sample(playerk$make,replace=replace)
          #Copy paste to null data
            df.null$make[df.null$sid==sidk]=playerk$make.shuffle
          } #End loop
          
      #Compute the table for the bootstrapped sample
        t2.boot=compute.table.II(df.null)
          if (bk%%100==0) cat("...",bk)

      #Save summary row
        table.boot[bk,]=colMeans(t2.boot,na.rm=T)
      }#End bootstrap loop
        return(table.boot)
      }#End function
      
      
  #Summary of bootstrapped results under null of P(H|HHH)=P(H) 
      set.seed(123)
      summary.row.boot=compute.summary.row.boot(df=GVT.Study4,replace=F,btot=1000)
      hh.boot=summary.row.boot[,7]
      mean(hh.boot)
      
    #Confidence interval: 0-19%
      q1=round(quantile(hh.boot,.975),3)
      q2=round(quantile(hh.boot,.025),3)

      cat(paste0("The boostrapped confidence interval under null of 0 is [",q1,",",q2,"]\n",
                 "In words, when P(H)=P(H|HHH), hot hand artifact leads P(H|HHH)-P(H|MMM)=-.09 with this CI around it",
                 "]\nwidth of ",q2-q1,"\nso estiamtes is observed +-", (q2-q1)/2))