# stat310.R # Bret Larget # Spring 2009 semester # Updated February 19, 2009 # This file will contain a number of R functions and tools useful for # homework and projects. ################################################################################ # easyLayout() --- function to arrange a list of graphs made using lattice # in an array # Arguments: # x = a list of graphics objects # nrow = number of rows for the graphs # ncol = number of columns for the graph # Return Value: # none # # EXAMPLE: # # > require(lattice) # > x1 = rnorm(1000,100,15) # > y1 = x1 + rnorm(1000) # > fig1 = histogram(~x1,type="density") # > fig2 = histogram(~x1,nint=5,col="red",xlab="Score",type="density") # > fig3 = densityplot(x1) # > fig4 = xyplot(y1 ~ x1,pch=".") # > easyLayout(list(fig1,fig2,fig3,fig4),2,2) ################################################################################ easyLayout = function(x,nrow,ncol) { require(grid) grid.newpage() pushViewport(viewport(layout=grid.layout(nrow,ncol))) z = 1 for(i in 1:nrow) { for(j in 1:ncol) { pushViewport(viewport(layout.pos.row=i,layout.pos.col=j)) if(z <= length(x)) { print(x[[z]],newpage=F) z = z+1 } popViewport() } } popViewport() } ################################################################################ # locationNormal.logl() --- function to compute likelihood or loglikelihood # from normal model with sigma0 known # # locationNormal.logl() is intended to be called as the argument f in likelihoodPlot() # # Arguments: # theta = mean of normal model (mu). Can be a vector. # s = the data, a sample assumed iid N(theta,sigma0^2) # log.like = boolean to return loglikelihood (T) or likelihood (F) # # Return Value: # A vector of length length(theta) with the (log)likelihood for each input theta # ################################################################################ locationNormal.logl = function(theta,s,sigma0=1,log.like=T,...) { s.n = length(s) s.mean = mean(s) s.var = var(s)*(s.n-1)/s.n logl = -(s.n/2)*log(2*pi*sigma0^2) - (s.n/(2*sigma0^2))*(s.var+(s.mean-theta)^2) if(log.like) return(logl) else return(exp(logl)) } ################################################################################ # poisson.logl() --- function to compute likelihood or loglikelihood # from Poisson # # poisson.logl() is intended to be called as the argument f in likelihoodPlot() # # Note that x! = Gamma(x+1), and that R has lgamma() = log( Gamma() ) as a built in function. # Hence, x! = exp(lgamma(x+1)) in R. # # Arguments: # theta = mean of Poisson model (mu). Can be a vector. # s = the data, a sample assumed iid Poisson(theta) # log.like = boolean to return loglikelihood (T) or likelihood (F) # # Return Value: # A vector of length length(theta) with the (log)likelihood for each input theta # ################################################################################ poisson.logl = function(theta,s,log.like=T,...) { s.n = length(s) s.sum = sum(s) logl = -s.n*theta + s.sum*log(theta) - sum(lgamma(s+1)) if(log.like) return(logl) else return(exp(logl)) } ################################################################################ # exponential.logl() --- function to compute likelihood or loglikelihood # from an Exponential sample # # exponential.logl() is intended to be called as the argument f in likelihoodPlot() # # Arguments: # theta = rate parameter of Exponential model (lambda). Can be a vector. # s = the data, a sample assumed iid Exponential(lambda) # log.like = boolean to return loglikelihood (T) or likelihood (F) # # Return Value: # A vector of length length(theta) with the (log)likelihood for each input theta # ################################################################################ exponential.logl = function(theta,s,log.like=T,...) { s.n = length(s) s.sum = sum(s) logl = -s.sum*theta + s.n*log(theta) if(log.like) return(logl) else return(exp(logl)) } ################################################################################ # hardy.weinberg.logl() --- function to compute likelihood or loglikelihood # for the Hardy-Weinberg law in genetics # # hardy.weinberg.logl() is intended to be called as the argument f in likelihoodPlot() # # Arguments: # theta = parameter, P(red) = theta^2, P(pink) = 2*theta*(1-theta), P(white) = (1-theta)^2 # s = the count data of length 3, s=c(r,p,w) where r = #red, p=#pink, w=#white # Also, s[1] = r, s[2] = p, s[3] = w # log.like = boolean to return loglikelihood (T) or likelihood (F) # # Return Value: # A vector of length length(theta) with the (log)likelihood for each input theta # ################################################################################ hardy.weinberg.logl = function(theta,s,log.like=T,...) { n = sum(s) # = r + p + w r = s[1] # = #red flowers in sample p = s[2] # = #pink flowers in sample w = s[3] # = #white flowers in sample logl = r*2*log(theta) + p*log(2*theta*(1-theta)) + w*2*log(1-theta) if(log.like) return(logl) else return(exp(logl)) } ################################################################################ # likelihoodPlot() --- function to plot a likelihood or loglikelihood function # # Arguments: # endpoints = a vector of length 2 with the lower and upper bounds for the parameter # s = the data, which is an argument to the likelihood function # f = the likelihood (or loglikelihood) function. It must have theta and s as arguments. # theta will be generated in sequence between the endpoints. # log.like = F if f() returns a likelihood, T if f() returns a log. # ... = additional parameters that will be passed on to f # Return Value: # the graph object # # EXAMPLE: # > set.seed(343) # > s = rnorm(10,25,5) # > s # > mean(s) # > likelihoodPlot(c(15,35),s,locationNormal.logl,sigma0=5) # > likelihoodplot(c(15,35),s,locationNormal.logl,sigma0=5,log.like=F) ################################################################################ likelihoodPlot = function(endpoints,s,f,log.like=T,...) { require(lattice) theta = seq(endpoints[1],endpoints[2],length=501) logl = f(theta=theta,s=s,log.like=log.like,...) ylab = ifelse(log.like,"log-likelihood","likelihood") fig = xyplot(logl ~ theta,ylab=ylab,type="l",...) print( fig ) return(invisible(fig)) } ################################################################################ # binomialSimPlot() --- function to plot within binomialSim() # # Arguments: # n = n in the Binomial model # theta = theta in the Binomial model (success probability for one trial) # s = the data, which are an iid Binomial(n,theta) sample # conf = the desired confidence level # # Return Value: # a list of the graph object and the count of intervals that contain theta # ################################################################################ binomialSimPlot = function(n,theta,s,conf=0.95) { require(lattice) theta.hat = s/n se.hat = sqrt(theta.hat*(1-theta.hat)/n) z = qnorm(1 - (1-conf)/2) low = theta.hat - z*se.hat high = theta.hat + z*se.hat contain = (low <= theta) & (high >= theta) foo = data.frame(index=1:length(s),theta.hat=theta.hat,low=low,high=high) fig = xyplot(low+high ~ index, data=foo, xlab="Index", ylab="Proportion", panel=function(...) { with(foo, panel.xyplot(c(index,index),c(low,high),type="n")) panel.abline(h=theta,lty=2) for(i in 1:length(s)) { col = ifelse(contain[i],"blue","red") with(foo, panel.segments(index[i],low[i],index[i],high[i],col=col) ) with(foo, panel.points(index[i],theta.hat[i],col=col) ) } } ) return(invisible(list(fig=fig,count=sum(contain)))) } ################################################################################ # binomialSimPlot() --- function to plot within binomialSim() # # Arguments: # nsim = size of the simulation (number of samples) # n = n in the Binomial model # theta = theta in the Binomial model (success probability for one trial) # conf = the desired confidence level # plot = boolean to indicate of the intervals should be plotted or not # # Return Value: # a the count of the intervals containing theta # # EXAMPLE: # > set.seed(343) # > binomialSimulation(10000,20,0.3,plot=F) # ################################################################################ binomialSimulation = function(nsim,n,theta,conf=0.95,plot=T) { x = rbinom(nsim,n,theta) out = binomialSimPlot(n,theta,x,conf) if(plot) print(out$fig) print( paste(out$count,"of",nsim,"confidence intervals contain theta =",theta) ) return(invisible(out$count)) } ################################################################################ # normal.power() --- function to calculate power from the normal model (variance known) # plot.normal.power() --- function to plot the power using the previous function # # Arguments: # theta = mean for which power is computed # endpoints = endpoints for theta for the plot # n = sample size # mu0 = mean under null hypothesis # sigma0 = known sd # alpha = level of significance test # # Return Value: # a data frame with theta and the corresponding power as variables # # EXAMPLE: # > plot.normal.power(endpoints=c(-5,5), n=25, mu0=0, sigma0=3, alpha=0.01) # ################################################################################ normal.power = function(theta,n,mu0,sigma0=1,alpha=0.05) { foo = (mu0-theta)/(sigma0/sqrt(n)) z = qnorm(1-alpha/2) return(1 - pnorm(foo+z) + pnorm(foo-z)) } plot.normal.power = function(endpoints,n,mu0,sigma0=1,alpha=0.05) { theta = seq(endpoints[1],endpoints[2],length=501) pow = normal.power(theta,n,mu0,sigma0,alpha) require(lattice) print( xyplot(pow ~ theta, type="l", col="blue", xlab="theta", ylab = "Power", ylim=c(-0.1,1.1)) ) return(invisible(data.frame(theta=theta,power=pow))) } ################################################################################ # functions for use with boot() in library(boot) in the example # first argument is the original data # second argument is the set of indices selected from a bootstrap sample # # get.q90() returns the sample 0.90 quantile from vector x sampled at indices # get.mean() returns the sample mean from vector x sampled at indices # # In the following example, the file tornado includes two columns, year and count, # with the count of the number of recorded tornados in Wisconsin for the specified year. # Data is collected for each year from 1950-1995. # # EXAMPLE # # > library(boot) # load the boot library with its bootstrap functions # > tornado = read.table("wi-tornado.txt", header=T) # read in a set of data # > with(tornado, boot(count,get.q90,R=10000) # compute 10,000 bootstrap statistics # ################################################################################ get.q90 = function(x,indices) { return(quantile(x[indices],0.9)) } get.mean = function(x,indices) { return(mean(x[indices])) } rate.exponential = function(x) { return(1/mean(x)) } gen.exponential = function(x,mle) { return(rexp(length(x),rate=mle$rate)) } ################################################################################ # exact.binomial.coverage() --- # Function to compute coverage probability of binomial confidence interval exactly. # # C.I. = ((X+adj)/(n+2*adj)) +/- z*sqrt( ((X+adj)/(n+2*adj))(1 - (X+adj)/(n+2*adj))/(n+2*adj) ) # P(coverage) = P( (n.adj+z^2)*(X+adj)^2 - (2*theta*n.adj^2+n.adj*z^2)*(X+adj) + n.adj^3*theta^2 <= 0 ) # where X ~ Binomial(n,theta) and n.adj = n + 2*adj # # Traditional method is adj=0 # Some textbooks recommend using adj=2; this is equivalent to using the traditional method # on modified data where 2 observations have been added to both the observed number of successes and failures. # (So traditional method where n is replaced by n+4, and x is replaced by x+2 # # This function allows an arbitrary adjustment to be added equally to both groups. # # Arguments: # n = true Binomial parameter for number of trials # theta = true Binomial probability of success in a single trial # adj = adjustment to be made to observed numbers of successes and failures # conf = confidence level (between 0 and 1) # # Note: either n or theta can be specified as a vector, so many calculations can be done at one time. # If both are vectors, the code does not fail, but the results may not be what you expect. # # Return value: # vector of true coverage probabilities # # EXAMPLE # # > exact.binomial.coverage(21,0.3) # > exact.binomial.coverage(21,0.3,adj=2) # > exact.binomial.coverage(100,seq(0.05,0.95,0.05)) # > exact.binomial.coverage(100,seq(0.05,0.95,0.05),adj=2) # > theta = seq(0.05,0.95,0.001) # > library(lattice) # > b = data.frame(theta=theta,traditional=exact.binomial.coverage(100,theta),adjusted=exact.binomial.coverage(100,theta,adj=2)) # > xyplot(traditional + adjusted ~ theta, data=b, type="l", auto.key=T) # # # Example with a nice label and a horizontal line at the target probability # > theta = seq(0.05,0.95,0.001) # > b = data.frame(theta=theta,traditional=exact.binomial.coverage(100,theta),adjusted=exact.binomial.coverage(100,theta,adj=2)) # > fig = xyplot(traditional + adjusted ~ theta, data=b, type="l", auto.key=T, ylab="coverage probability", # > panel=function(x,y,...) { # > panel.xyplot(x,y,...) # > panel.abline(h=0.95,col="red",lty=2) # > } # > ) # > print( fig ) ################################################################################ exact.binomial.coverage = function(n,theta,adj=0,conf=0.95) { z = qnorm(1 - (1-conf)/2) n.adj = n+2*adj a1 = (n.adj+z^2) b1 = -2*theta*n.adj^2 - n.adj*z^2 c1 = n.adj^3*theta^2 d = sqrt(b1^2 - 4*a1*c1) return( pbinom((-b1+d)/(2*a1) -adj,n,theta) - pbinom((-b1-d)/(2*a1)-adj,n,theta) ) }