# February 15 # Analysis of SAT state data # In R, you can execute all of these commands by sourcing this file # reading in the data SAT = read.table("sat.txt",header=T) # str shows structure of a data frame str(SAT) # attach allows variables to be used directly (sat instead of SAT$sat) attach(SAT) # first row SAT[1,] # first column SAT[,1] # all columns except the first SAT[,-1] SAT[,2:8] # columns 2 and 7 SAT[,c(2,7)] # rows where sat > 1000 SAT[ sat > 1000 , ] # rows sat > 1000 and income > 300 SAT[ (sat > 1000) & (income > 300) , ] # show scatter plots of all columns except the first pairs(SAT[,-1]) # set graphing parameters to fill a 2 by 3 array of plots in row order par(mfrow = c(2,3)) # make six plots with x variable ranging from 3 to 8 # names(SAT)[i] is the name of the variable in column i for( i in 3:8) { plot(SAT[,i],sat,xlab=names(SAT)[i],pch=16) } # same as above, but make the label titles twice as large for( i in 3:8) { plot(SAT[,i],sat,xlab=names(SAT)[i],pch=16,cex.lab=2) } # same as above, but add a red circle around each point corresponding to Wisconsin # = is for assignment, == is to test equality # here wisc is set to 49 because (state == "Wisconsin") is only true for the 49th observation wisc = which(state == "Wisconsin") for( i in 3:8) { plot(SAT[,i],sat,xlab=names(SAT)[i],pch=16,cex.lab=2) points(SAT[49,i],sat[49],cex=3,col="red") } # there seem to be important differences between states with high and low percentage of takers # (takers < 25) are states where the ACT or another test is predominant, mostly in the midwest. # let's analyze these two sets of states separately # the symbol ! stands for NOT # actStates is an array of 50 TRUE/FALSE values # satStates is the opposite array of 50 TRUE/FALSE values actStates = takers < 25 satStates = !actStates # show the states in the actStates group; state[actStates] # create a new data frame with only these states ACT = SAT[actStates,] # show pairwise scatter plot for ACT states # this plot still shows some potential non-linearity between takers and sat pairs(ACT[,-1]) # Detach SAT and attach ACT so that variable names will refer to the ACT data frame detach(SAT) attach(ACT) # examine structure of ACT to check that it looks right str(ACT) # fit a full model with more terms than we want # the term I(takers^2) is a second order term # that allows fitting a quadratic relationship between takers and sat act.full = lm(sat ~ takers + income + years + public + expend + rank + I(takers^2),data=ACT) # change the plotting back to one plot per page # show the residual plot par(mfrow=c(1,1)) plot(fitted(act.full),residuals(act.full),pch=16) abline(h=0) # The command plot(act.full,which=1) adds a lowess line that indicates possible nonlinearity # The raw plot without the line does not look so bad --- patterns are often visible with small data sets # One large positive residual seems to be potentially influential # However, a plot of Cook's distance does not show any overly highly influential points. plot(act.full,which=4) # the summary of the full model indicates that there are too many variables included # whenever this is the case, individual tests may not be reliable summary(act.full) # one method of model selection is AIC # the function step can be used to search for the best model according to the AIC, # but it is not guaranteed to find the best model within a particular scope. # here, I will seek the best model beginning both from a null model with only the intercept # as well as from the full model # step called only with the full model begins with this full model # and tries to drop and possibly readd terms until it can do neither act.aic = step(act.full) # step called with a smaller model and a formula for the scope makes the same search # beginning at a different starting point. act.scope = formula(sat ~ takers + income + years + public + expend + rank + I(takers^2)) act.0 = lm(sat ~ 1,data=ACT) act.aic0 = step(act.0,scope=act.scope) # in this case, both searches identify the same model summary(act.aic) summary(act.aic0) # a residual plot of the fit looks decent plot(fitted(act.aic),residuals(act.aic),pch=16) abline(h=0) # multicollinearity is when two or more explanatory variables # are tightly clustered around a line (or plane) with fewer dimensions than the number of variables # typically, this can occur when two variables measure essentially the same thing in slightly different ways. # extreme correlation can indicate pairwise multicollinearity. # other mehtods may be needed to identify higher-order multi-collinearity # we see here that takers and rank are fairly strongly correlated, # meaning that both are measuring similar aspects of the relationship # as a consequence, neither estimated coefficient may be reliable round(cor(ACT[,-1]),2) # the drop1 function shows the effect on AIC of dropping any individual term from the model drop1(act.aic) # it is worth considering the two models found by dropping either rank or takers from the model act.er = lm(sat ~ expend + rank,data=ACT) act.et = lm(sat ~ expend + takers,data=ACT) # compare summaries # both models with rank or takers show that adding the first variable is most important --- # adding the other helps (but not as much) summary(act.aic) summary(act.er) summary(act.et) # the estimated slopes can change markedly when terms are dropped coef(act.aic) coef(act.er) coef(act.et) # For the purpose of prediction, a model with all three terms is acceptable # For a purpose of parameter estimation, it is preferable to select either takers or rank but not both # We can repeat this using BIC # BIC simply uses a larger penalty per parameter # here, the best models according to BIC agree with those from AIC act.n = nrow(ACT) act.bic = step(act.full,k=log(act.n)) act.bic0 = act.aic0 = step(act.0,scope=act.scope,k=log(act.n)) summary(act.bic) summary(act.bic0) # Let's repeat the analysis for the SAT states detach(ACT) satStates = !actStates SATprimary = SAT[satStates,] attach(SATprimary) # plots show extreme values related to Alaska, especially in several variables # Alaska is certainly an outlier pairs(SATprimary[,-1]) # Two searches for AIC find different models sat.scope = formula(sat ~ takers + income + years + public + expend + rank) Sat.full = lm(sat.scope,data=SATprimary) sat.aic = step(sat.full) sat.0 = lm(sat ~ 1,data=SATprimary) sat.aic0 = step(sat.0,scope=sat.scope,data=SATprimary) # AIC is only defined up to a constant # extractAIC and AIC disagree on this constant # extractAIC agrees with AIC computed using step # compare AIC for the two models # sat.aic is better extractAIC(sat.aic) extractAIC(sat.aic0) summary(sat.aic) summary(sat.aic0) # Cook's distance from a model using only expend shows Alaska is influential plot(sat.aic0,which=4) # we should drop Alaska to see the difference detach(SATprimary) noAlaska = (satStates & SAT$state != "Alaska") SATalaska = SAT[noAlaska,] attach(SATalaska) pairs(SATalaska[,-1]) noa.full = lm(sat.scope,data=SATalaska) noa.aic = step(noa.full) noa.0 = lm(sat ~ 1,data=SATalaska) noa.aic0 = step(noa.0,scope=sat.scope,data=SATalaska)