###############################################################################
#R programs that evaluate the distributions of coins size estimation
#  reported in Judgment Decision Making journal article by Wang et al (2012)
#
#Written by Uri Simonsohn, August, 2013
###############################################################################

#OUTLINE OF THE R PROGRAMS BELOW
#I first create two functions.
#
#FUnction 1/ c_table(y1,y2) 
#   it takes two vectors, y1 and y2, and generates a contingency table. This is needed
#   becuase R's chi.test() program does not deal with vectors of data, but with frequencies directly. 
#   So for example, if y1=c(5,5) and y2=c(5,8) then a contingency table 
#            5 8
#            ===
#    y1      2 0
#    y2      1 1
#   is generated 
#
#  (the returned value/table is just the frequencies, with no headings: 
# so that c_table(y1,y2)=2,0
#                        1,1
#
#Funtion 2. bootchi(simtot,y1,y2)
#    This function does the bootstrapping, it takes vectors y1 and y2, concatenates them
#    and draws with replacement simtot samples from it, apply the c_table() function from above
#    to the bootstrapped sample and saves into a vector chi() the resulting chi-values
#



################################################################

#FUNCTION 1. CREATES CONTINGENCY TABLE FOR CHI2 TEST FROM TWO VECTORS OF DATA
 c_table=function(y1,y2) {
   #Find max and min for y to see the range of values to consider
      maxy=max(y1,y2)
      miny=min(y1,y2)
    #Create columns from every possible value, all integers between min and max
      y1f=factor(y1, levels=miny:maxy)    
      y2f=factor(y2,levels =miny:maxy)
   #combine then into a table
      m=matrix(c(table(y1f),table(y2f)),nrow=2,ncol=(maxy-miny+1),byrow=TRUE)
    #drop columns from table with only 0s as they make the chi.test() program crash.
      m=m[,colSums(m^2) !=0]  #got this code from http://stackoverflow.com/questions/6632018/delete-all-columns-with-0-from-matrix
      return(m)                
    }
 
 
 #FUNCTION 2. BOOTSTRAP CHI2 TEST
 bootchi=function(simtot,y1,y2) {
                   #SIMTOT IS THE NUMBER OF SIMULATIONS (200K BELOW)
   chi=c()        #initialize vector with set of results
   yb=c()         #initialize bootstrap samples vector
   #pool data
   y=c(y1,y2)
   for (i in 1:simtot) {
        
     #draw random sample
     yb=sample(y,size=length(y),replace=TRUE) 
     #split the data into two experiments
       y1b=yb[1:length(y1)]               #takes the first n1 elements, calls for Exp1, the rest for Exp2
       y2b=yb[(length(y1)+1):length(y)]
     #compute the chi-square on the bootstrapped sample
       chi[i]=chisq.test(c_table(y1b,y2b))$statistic
     }
     return(chi)
  
 }
  ##########################################################################

#COIN SIZE AND SHAME
  #Shame
   shame1=c(2,3,3,3,3,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7)
   shame2=c(2,3,3,3,3,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,6,7)
  #Control
  control1=c(2,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,6,6,6)
  control2=c(2,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,5,5,5,6,6,6)
 

  #x2s that's observed (.0565)
    chi_obs1=chisq.test(c_table(shame1,shame2))$statistic       #x2 for the shame condition 
    chi_obs2=chisq.test(c_table(control1,control2))$statistic   #x2 for the control condition
    chi_obs=chi_obs1+chi_obs2                                   #summing both x2s
    #note: this code generates the warning that the chi-square approximation may be incorrect.
    #      that is not a problem since we do not really  use the x2 value as a chi-square.


#RESULTS

     set.seed(126)                                          #set seed to get same results every time
      simtot=200000                                         #200k simulations 
      time1=Sys.time()                                      #to time how long it takes to execute
      chi_r1=bootchi(simtot=simtot,y1=shame1,y2=shame2)     #bootstrap the shame 
      chi_r2=bootchi(simtot=simtot,y1=control1,y2=control2) #now the control
      chir_r=chi_r1+chi_r2
      print(Sys.time()-time1)                               #print how much time it took
      
      cat("\n p for shame: ",sum(chi_r1<=chi_obs1)/simtot)  #count # of simulations <=observed, turn into p-value
      cat("\n p for control: ",sum(chi_r2<=chi_obs2)/simtot)
      cat("\n p for both: ",sum(chi_r<=chi_obs)/simtot)
      

    #Save results for graphing in Excel
      chi_obs
      write(sort(round(chi_r,1)),"output.txt",sep="\n")
      
      
  
 
#note: the chi.test() function in R has a built it montecarlo but it has two prolems.
# the most important is that it reports the p-value for a more extreme difference including the observed
# we want a less extreme one also including the observed, so we cannot use the reported p-value for this purposes
# for instance, for the shame condition, the observed x2 is the lowest possible, so the p-value=1, this would mean
# it is impossible to et such similar results, but this excludes the probability of exactly teh same
# results.
# another problem is that the montecarlo samples without replacement (a la Fisher's exact test).
# i simulated that also and leads to very similar results.