# Bret Larget # April 15, 2009 # R code for contingency table inference # The model: # P(X=i,Y=j) = theta_{ij} # H_0: P(X=i,Y=j) = P(X=i)P(Y=j) = theta_{i.}*theta_{.j} # # MLE under null model, (theta_{i.}) # Analysis of the German Eye/Hair color data # The data # Hair Color # Brown Black Fair Red # ----------+------+------+----+-----+ # Eye color Brown | 438 288 115 16 | # Gray/Green| 1347 746 946 53 | # Blue | 807 189 1768 47 | # ----------+------+------+----+-----+ # Put the observed data in a matrix named `o' # o = matrix(c(438,1387,807,288,746,189,115,946,1768,16,53,47),3,4) # n is the sample size n = sum(o) # x is the row sums x = apply(o,1,sum) # y is the column sums y = apply(o,2,sum) # the expected counts under the null model # note the use of the outer product operator x %o% y which produces a matrix with ij element x[i]*y[j] e = (x %o% y)/n # The chi-square test statistic is sum of (observed-expected)^2/expected w = sum( (o-e)^2/e ) # The number of degrees of freedom is (a-1)*(b-1). # p-value from a chi-square distribution. df.1 = (length(x)-1)*(length(y)-1) p.1 = 1 - pchisq(w,df.1) # Also examine the standardized residuals theta.null = e/n r = (o-e)/sqrt(e*(1-theta.null)) # Print some results cat("************************************************************\n") cat("Observed Data\n\n") print(o) cat("************************************************************\n") cat("Expected Counts (rounded)\n\n") print(round(e,1)) cat("************************************************************\n") cat("Standardized residuals (rounded)\n\n") print(round(r,2)) cat("************************************************************\n") cat("Chi-square test statistic\n\n") print(w) cat("************************************************************\n") cat("Degrees of freedom\n\n") print(df.1) cat("************************************************************\n") cat("P-value\n\n") print(signif(p.1,2)) ############################################################ # Now compare to a likelihood ratio test # Likelihood = prod(theta^o) # log-likelihood = sum(o*log(theta)) # theta.null already computed above, theta.null = e/n since e = n*theta.null # theta.full uses observed counts theta.full = o/n # Find log-likelihoods for null and full models logl.null = sum(o*log(theta.null)) logl.full = sum(o*log(theta.full)) # Test statistic w.2 = 2*(logl.full - logl.null) # p.value p.2 = 1 - pchisq(w.2,df.1) # print results cat("************************************************************\n") cat("Likelihood ratio test results\n\n") cat("******************************\n") cat("Null log-likelihood = ") cat(round(logl.null,2)) cat("\n") cat("Full log-likelihood = ") cat(round(logl.full,2)) cat("\n") cat("Test statistic\n\n") print(w.2) cat("******************************\n") cat("p-value\n\n") print(p.2) ################################################################################ # Compare to results using chisq.test() ht = chisq.test(o) cat("************************************************************\n") cat("Results from chisq()") print(ht)