# The following represents code for analyzing contingency tables from different
# cases of fixed and random marginals.


# Fisher's Exact Test for 2 X 5 
# Optimal for designs with fixed marginals
# David C. Howell 9/13/11
# Data from Tea-Tasting example 
# This can be changed to read from a file, but the matrix is small enough
# that it is easier to enter it directly. 
# See "?fisher.test" for more information. 

Bristol <- matrix(c(3,1,1,3), nrow = 2, dimnames = list(Row
= c("1","2"), Col = c("1","2"))) 
# Remember that the "matrix" command reads down columns 
cat("The data matrix","\n")
print(Bristol) 
cat("\n")
cat("Fisher's exact test","\n") 
ft <- fisher.test(Bristol, alternative = "greater", conf.int = TRUE) 
#For a two-tailed test chanage to alternative = "two-sided" 
#Two.sided and conf.int apply only to 2x2 
print(ft)
cat("Pearson's (two-sided) chi-square test without Yates' correction","\n") 
pchisq <- chisq.test(Bristol, correct = FALSE)
print(pchisq) 
cat("Pearson's (two-sided) chi-square test with Yates' correction","\n") 
pchisq <- chisq.test(Bristol, correct = TRUE)
print(pchisq) 
cat("Notice that the last two tests are two-sided while  Fisher's test", "\n","is set up as one-sided.","\n")  
cat("Error messages refer to having small expected frequencies. \n")
------------------------------------------------------------------------


#  Row Marginals Fixed

# Analysis of contingency tables conditional on the row totals

#read in the data

origdata <- matrix(c(35,9,60,41), nrow = 2, byrow = TRUE)   #Civil Union example
  # Alternative data sets follow --You must set the default directory
  # origdata <- read.table("AlteredVote.dat", header = TRUE, nrows = 2)
  # origdata <- read.table("Visintainer.dat", nrows = 2, header = TRUE)  #2 x 3 example

result <- chisq.test(origdata, correct = F)
obtchisq <- result$statistic
dfs <- result$parameter
p <- result$p.value
N <- sum(origdata)
cat("The original data","\n")
print(origdata)
cat("\n")
print(result)
rowmarg <- rowSums(origdata)
nrow <- length(rowmarg)
colmarg <- colSums(origdata)
ncol <- length(colmarg)
colprob <- colmarg/N

# Now start the resampling 
nreps <- 10000
chisquare <- numeric(nreps)
results <- numeric(nreps)
extreme <- 0
# Get cumulative proportions over columns
   # Assume that we had a 3 x 2 design.
   # We might find that 60% of the obs are in col1, 25% in col2, and 15% in col3
   # The draw as many numbers as the row marginal, and assign to col based on
   # those probabilities. That does not mean that we will always get 60, 25, and
   # 15%, because assignment depends on whether the random number exceeds some value.
cum <- numeric()
cum[1] <- colprob[1]
for (j in 2:ncol) {cum[j] <- cum[j-1] + colprob[j]}

# Now resample nreps = 10,000 times 
count <- c()
countx <- c()
for (i in 1:nreps) {

randomtable <- c()
   for (j in 1:nrow) {
       randnum <- runif(rowmarg[j],0,1) 
       for (j in 1:ncol) {count[j] <- length(randnum[randnum <= cum[j]])}
  countx[1] <- count[1]
  for (k in 2:ncol) {countx[k] <- count[k] - count[k-1]} 
  randomtable <- rbind(randomtable, countx)
  }
  chisquare[i] <- chisq.test(randomtable, correct = FALSE)$statistic

  }
signif <- length(chisquare[chisquare >= obtchisq]) /nreps

par(mfrow=c(2,2))
hist(chisquare, breaks = 50, main = "Distribution of chi-square under null")
library(car)
qq.plot(chisquare, dist = "chisq", df = dfs)
cat("Pearson's chi-square statistic is ",obtchisq,"\n")
cat("That chi-square has a p value of ",p, "\n")
cat("\n")
cat("The proportion of resampled chi-square values greater than the obtained chi-square is  ", signif, "\n")


# > source("C:/Documents and Settings/David Howell/My Documents/Professional/Chi Square/rowfixed.r")
#Pearson's chi-square statistic is  5.502327 
#That chi-square has a p value of  0.01899118 
#
#The proportion of chi-square values greater than the obtained chi-square is   0.0189 
#

   ###########
#   likelihood ratio chi-sq

LRchisq <- function(data) {
    test <- chisq.test(data)
    E <- test$expected
    O <- test$observed
    df <- test$parameter
    sigma = 0
    nrows <- nrow(data)
    ncols <- ncol(data)
    for (i in 1:nrows) {
      for (j in 1:ncols) {
          sigma <- sigma + O[i,j]*log(O[i,j]/E[i,j])
      }
    }
Gsquare <- 2*sigma


}
Gsquare <- LRchisq(origdata)

cat("The likelihood ratio chi-square statistic is ",Gsquare,"\n")
pLR <- pchisq(Gsquare, dfs, lower.tail = FALSE)
cat("That chi-square has a p value of ",pLR, "\n")

------------------------------------------------------------------------


#  No Marginals Fixed

# Analysis of contingency tables conditional only on the total sample size

# I have changed this to make the cell freq under H0 equal to (on average) the
# expected frequencies with those marginal totals.
# It is the only way that I can think of to set up a sampling scheme. BUT I am
# not restricting the sampling to those marginal totals.
# The qq plot fails if you change nreps away from 10000 because of the
# way I create length of variables.I should fix that some day.


#read in the data
origdata <- matrix(c(35, 9, 60, 41), nrow = 2, byrow = TRUE)
#origdata <- matrix(c(3,1,1,3), nrow = 2 )
#origdata <- read.table("DeathModified.dat", header = TRUE, nrows = 2)
#origdata <- read.table("Visintainer.dat", header = TRUE)
#origdata <- read.table("Darley.dat", header = TRUE)
#origdata <- matrix(c(12,2,1,3,2,4,3,4,187,17,11,9,16,7,8,6), nrow = 2, byrow = TRUE)
#origdata <- matrix(c(13,36,14,30), nrow = 2, byrow = TRUE)
#rownames(origdata) <- c("Drug","Placebo")
#colnames(origdata) <- c("Relapse","Success")
#origdata <- matrix(c(12,2,1,3,2,4,3,4,187,17,11,9,16,7,8,6), nrow = 2, byrow = TRUE)
# Establish sample size and number of rows and columns  (rs and cs)
N <- sum(origdata)
nr <- nrow(origdata)
nc <- ncol(origdata)
ncells <- nr*nc
# Establish row and column marginal totals
rowtots <- rowSums(origdata)
coltots <- colSums(origdata)
# Calculate expected cell probabilities based on original marginal totals
# and convert to vector
expected <- matrix(data = NA, nrow = nr, ncol = nc, byrow = TRUE)
for (i in 1:nr) {
  for (j in 1:nc) {
    expected[i,j] <- rowtots[i] * coltots[j] /N^2
    }
  }
# Convert to vector  
expectedvector <- as.vector(t(expected))
# This can now be sampled
            
# Calculate chi-square on original data
obtchisq <- chisq.test(origdata, correct = F)$statistic    

nreps <- 10000                    
xx <- numeric(N)
results <- numeric(nreps)
greater <- 0                
cells <- numeric(ncells)
cells <- c(1:ncells) 
# Now resample nreps times

for (i in 1:nreps) {
   xx <- sample(cells,N,replace = TRUE, prob = expectedvector)
   vv <- matrix(0,1,ncells)
   for (j in 1:N) {
      b <- xx[j]
      vv[b] = vv[b] + 1
    }
    x <- matrix(vv,nrow = nr, byrow = TRUE)
# x is now the contingency table for ith random sample

 results[i]<- chisq.test(x, correct = FALSE)$statistic
 greater <- ifelse(results[i] >= obtchisq, greater + 1, greater + 0)  
}

dfs <- (nr - 1)*(nc - 1)
cat("For a chi-square distribution the mean should be = ", dfs, ". It is actually ",mean(results, na.rm = TRUE),"\n")
cat("\n")
cat("For a chi-square distribution the var. should be = ", 2*dfs,". It is actually ",var(results, na.rm = "TRUE"), "\n")
cat("\n")

cat(" Look at the graphic for the distribution and a qq plot", "\n")
cat("\n")
par(mfrow=c(2,2))
hist(results, xlab = "Chi-Square", ylab = "Frequency", breaks = 50, main = "Chi-square distribution")
#
#
#
library(car)
qqPlot(results, dist = "chisq", df = dfs)
#pvalues <- c(1:9999)/10000         #generate a set of probabilities p = .00001 - .99999
#qquant <- qchisq(p = pvalues,df = 1)       #convert those to quantiles of chi-sq distrib.
##
#temp <- sort(results, na.last = NA)      #put the obtained quantiles in order
##
#temp <- temp[-10000]  #make them the same length
#plot(qquant,temp, main = "QQ Plot for Chi-Square", xlab = "Obtained Chi-Square",
#    ylab = "Expected Chi-Square") #make a qqplot
#abline(0,1)

 moreextreme <- results[results > obtchisq]
 moreextreme <- length(moreextreme)/nreps
 cat("The proportion of cases with results at least as extreme as our data 
 is  ",moreextreme, "\n")
cat("\n") 
#Using Pearson's chi-square
cat("The following is the result of using Pearson's basic chi-square.","\n")
cat("\n")
print(chisq.test(origdata, correct = FALSE))
#
#Using monte-carlo simulation
cat("The following is the result of letting R do a simulation for chisq.test","\n")
cat("\n")
print(chisq.test(origdata, simulate.p.value = TRUE, B = 10000))
#
cat("The following is the result of Fisher's exact test ","\n")
cat("\n")
print(fisher.test(origdata, conf.level = 0.95))
cat("\n")
cat("Do not be alarmed by warnings if you have some small marginals. ", "\n")
cat("They come because I used the chi-square statistic as my indicator of disparity.", "\n")

##########################################