# Bret Larget # April 24, 2009 # Regression example with my son's growth data # Variables are age (in months) and height (in inches) # Read in the data riley = read.table("riley.txt",header=T) # Plot the data with an added regression line library(lattice) print( xyplot(height ~ age, riley, pch=16, type=c("p","r")) ) # Fit a linear model fit = lm(height ~ age, data=riley) # Print a summary of the fitted model # The main part of the output is as follows: # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) 31.106153 0.332111 93.66 <2e-16 # age 0.234774 0.004218 55.66 <2e-16 # # (Intercept) *** # age *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # # Residual standard error: 0.549 on 19 degrees of freedom # Multiple R-squared: 0.9939, Adjusted R-squared: 0.9936 # F-statistic: 3097 on 1 and 19 DF, p-value: < 2.2e-16 # To read the data from the table, the estimated regression line is: # # (predicted height in inches) = 31.1 + 0.2348*(age in months) print(summary(fit)) # Verify the numerical estimates age.mean = with(riley, mean(age)) age.sd = with(riley,sd(age)) height.mean = with(riley,mean(height)) height.sd = with(riley,sd(height)) r = with(riley, cor(age,height)) b2 = r*height.sd/age.sd b1 = height.mean - b2*age.mean print(c(b1,b2)) # Standard errors # A 95% confidence interval for beta_2 is # 0.2348 +/- 2.09*(0.004218) # where qt(0.975,19) = 2.09 # Hypothesis test # H_0: beta_2 = 0 # H_A: beta_2 != 0 # T = 0.234774/0.004218 = 55.66 # p-value = P(|T| > 55.66) when T ~ t(19), which is essentially 0. # # Use of residuals() and fitted() # # The residuals can be found using residuals. n = with(riley, length(age)) s2 = sum(residuals(fit)^2)/(n-2) # Proportion of variance explained by the regression # Compare with(riley, 1 - var(residuals(fit))/var(height)) # with with(riley, 1 - s2/var(height)) # and look for these numbers as the multiple R^2 and adjusted R^2 estimates. # # Residual Plots # # Notice this pattern in the residual plots. # Residuals are negative with small and high ages # and positive for middle ages. # This means that fitting a curve (a parabola) willl fit better. print(xyplot(residuals(fit) ~ age, data=riley,pch=16)) # Here is a fancier plot that adds a horizontal line at 0 resid.plot = xyplot(residuals(fit) ~ age, data=riley,pch=16, panel = function(x,y,...) { panel.xyplot(x,y,...) panel.abline(h=0) } ) print(resid.plot) # Quadratic fit # Note that the coefficient for age^2 is also significant fit2 = lm(height ~ age + I(age^2), data=riley) summary(fit2)