# MAN 2/13/07 # Some code to consider confidence intervals for the correlation # parameter # data (X_i,Y_i) bv normal with correlation rho # i = 1,...,n # # r is Pearson correlation coefficient # # 6 intervals # # # a. (L_n,U_n) = r +/- 2 (1-r^2)/sqrt(n) # # based on sqrt(n)[ r - rho ] -> N[0, (1-rho^2)^2 ] # # b. L_n = [ exp{ 2 (s-2/sqrt(n) )} - 1 ]/[ exp{ 2 (s-2/sqrt(n)) } + 1 ] # U_n = [ exp{ 2 (s+2/sqrt(n) )} - 1 ]/[ exp{ 2 (s+2/sqrt(n)) } + 1 ] # # where s = .5*log[ (1+r)/(1-r) ] # # c. normal bootstrap of a: r +/- 2*bootstrap SE # # d. normal bootstrap of b: invert s +/- s*bootstrap SE(s) # # e. bootstrap percentile of c # f. bootstrap percentile of d # function to compute interval a. int.a <- function(x,y) { r <- cor(x,y) n <- length(x) se <- (1-r^2)/sqrt(n) ci <- r + c(-2,2)*se return( ci ) } # function to compute interval b. int.b <- function(x,y) { r <- cor(x,y) n <- length(x) s <- .5*log( (1+r)/(1-r) ) jj <- s + c(-2,2)/sqrt(n) ci <- ( exp( 2*jj ) - 1 )/( exp( 2*jj ) + 1 ) return( ci=ci ) } # function to compute intervals c-f boot <- function(x,y,B=1000) { rboot <- numeric(B) n <- length(x) for( b in 1:B ) { bb <- sample( 1:n, size=n, replace=T ) rboot[b] <- cor( x[bb], y[bb] ) } sboot <- .5*log( (1+rboot)/(1-rboot) ) se.r <- sd( rboot ) se.s <- sd( sboot ) ## use bootstrap just to estimate standard error, then get CI r <- cor(x,y) ci.c <- r + c(-2,2)*se.r s <- .5*log( (1+r)/(1-r) ) jj <- s + c(-2,2)*se.s ci.d <- ( exp( 2*jj ) - 1 )/( exp( 2*jj ) + 1 ) ## use bootstrap for percentile interval ci.e <- ( sort(rboot) )[ c( .025*B, .975*B ) ] jj <- ( sort(sboot) )[ c( .025*B, .975*B ) ] ci.f <- ( exp( 2*jj ) - 1 )/( exp( 2*jj ) + 1 ) return( list( ci.c = ci.c, ci.d = ci.d, ci.e = ci.e, ci.f = ci.f ) ) } ### try it on one data set n <- 50 rho <- .4 x <- rnorm(n) y <- rho*x + sqrt(1-rho^2)*rnorm(n) int.a(x,y) int.b(x,y) boot(x,y) #### OK, now try a simulation n <- 20 rho <- 0.9 nsim <- 10000 ## maybe slow...!! B <- 5000 covered <- matrix(NA, nsim, 6 ) #indicators of coverage lengths <- matrix(NA, nsim, 6 ) # length of intervals dimnames(covered)[[2]] <- c("a","b","c","d","e","f") dimnames(lengths)[[2]] <- c("a","b","c","d","e","f") is.cov <- function( ci, rho ) { ok <- 1*( ( rho >= ci[1] )&( rho <= ci[2] ) ) return(ok) } c.len <- function( ci ) { return( ci[2] - ci[1] ) } for( i in 1:nsim ) { # make data x <- rnorm(n) y <- rho*x + sqrt(1-rho^2)*rnorm(n) # get intervals i.a <- int.a(x,y) i.b <- int.b(x,y) i.cdef <- boot(x,y,B=B) # check coverage covered[i, 1] <- is.cov( i.a , rho=rho ) covered[i, 2] <- is.cov( i.b , rho=rho ) covered[i, 3:6] <- sapply(i.cdef, is.cov, rho=rho ) # record lengths lengths[i,1] <- c.len( i.a ) lengths[i,2] <- c.len( i.b ) lengths[i, 3:6] <- sapply(i.cdef, c.len ) print(i) } covs <- apply(covered,2,mean) m.lens <- apply( lengths, 2, mean ) # store the results res <- list( covs=covs, lengths=m.lens, rho=rho, n=n, B=B, nsim=nsim ) dput(res,"results-cor-boot.txt") ## once done, you can retrieve using `dget'