# February 15
# Analysis of SAT state data
# In R, you can execute all of these commands by sourcing this file
# reading in the data
SAT = read.table("sat.txt",header=T)
# str shows structure of a data frame
str(SAT)
# attach allows variables to be used directly (sat instead of SAT$sat)
attach(SAT)
# first row
SAT[1,]
# first column
SAT[,1]
# all columns except the first
SAT[,-1]
SAT[,2:8]
# columns 2 and 7
SAT[,c(2,7)]
# rows where sat > 1000
SAT[ sat > 1000 , ]
# rows sat > 1000 and income > 300
SAT[ (sat > 1000) & (income > 300) , ]
# show scatter plots of all columns except the first
pairs(SAT[,-1])
# set graphing parameters to fill a 2 by 3 array of plots in row order
par(mfrow = c(2,3))
# make six plots with x variable ranging from 3 to 8
# names(SAT)[i] is the name of the variable in column i
for( i in 3:8) { plot(SAT[,i],sat,xlab=names(SAT)[i],pch=16) }
# same as above, but make the label titles twice as large
for( i in 3:8) { plot(SAT[,i],sat,xlab=names(SAT)[i],pch=16,cex.lab=2) }
# same as above, but add a red circle around each point corresponding to Wisconsin
# = is for assignment, == is to test equality
# here wisc is set to 49 because (state == "Wisconsin") is only true for the 49th observation
wisc = which(state == "Wisconsin")
for( i in 3:8) {
plot(SAT[,i],sat,xlab=names(SAT)[i],pch=16,cex.lab=2)
points(SAT[49,i],sat[49],cex=3,col="red")
}
# there seem to be important differences between states with high and low percentage of takers
# (takers < 25) are states where the ACT or another test is predominant, mostly in the midwest.
# let's analyze these two sets of states separately
# the symbol ! stands for NOT
# actStates is an array of 50 TRUE/FALSE values
# satStates is the opposite array of 50 TRUE/FALSE values
actStates = takers < 25
satStates = !actStates
# show the states in the actStates group;
state[actStates]
# create a new data frame with only these states
ACT = SAT[actStates,]
# show pairwise scatter plot for ACT states
# this plot still shows some potential non-linearity between takers and sat
pairs(ACT[,-1])
# Detach SAT and attach ACT so that variable names will refer to the ACT data frame
detach(SAT)
attach(ACT)
# examine structure of ACT to check that it looks right
str(ACT)
# fit a full model with more terms than we want
# the term I(takers^2) is a second order term
# that allows fitting a quadratic relationship between takers and sat
act.full = lm(sat ~ takers + income + years + public + expend + rank + I(takers^2),data=ACT)
# change the plotting back to one plot per page
# show the residual plot
par(mfrow=c(1,1))
plot(fitted(act.full),residuals(act.full),pch=16)
abline(h=0)
# The command plot(act.full,which=1) adds a lowess line that indicates possible nonlinearity
# The raw plot without the line does not look so bad --- patterns are often visible with small data sets
# One large positive residual seems to be potentially influential
# However, a plot of Cook's distance does not show any overly highly influential points.
plot(act.full,which=4)
# the summary of the full model indicates that there are too many variables included
# whenever this is the case, individual tests may not be reliable
summary(act.full)
# one method of model selection is AIC
# the function step can be used to search for the best model according to the AIC,
# but it is not guaranteed to find the best model within a particular scope.
# here, I will seek the best model beginning both from a null model with only the intercept
# as well as from the full model
# step called only with the full model begins with this full model
# and tries to drop and possibly readd terms until it can do neither
act.aic = step(act.full)
# step called with a smaller model and a formula for the scope makes the same search
# beginning at a different starting point.
act.scope = formula(sat ~ takers + income + years + public + expend + rank + I(takers^2))
act.0 = lm(sat ~ 1,data=ACT)
act.aic0 = step(act.0,scope=act.scope)
# in this case, both searches identify the same model
summary(act.aic)
summary(act.aic0)
# a residual plot of the fit looks decent
plot(fitted(act.aic),residuals(act.aic),pch=16)
abline(h=0)
# multicollinearity is when two or more explanatory variables
# are tightly clustered around a line (or plane) with fewer dimensions than the number of variables
# typically, this can occur when two variables measure essentially the same thing in slightly different ways.
# extreme correlation can indicate pairwise multicollinearity.
# other mehtods may be needed to identify higher-order multi-collinearity
# we see here that takers and rank are fairly strongly correlated,
# meaning that both are measuring similar aspects of the relationship
# as a consequence, neither estimated coefficient may be reliable
round(cor(ACT[,-1]),2)
# the drop1 function shows the effect on AIC of dropping any individual term from the model
drop1(act.aic)
# it is worth considering the two models found by dropping either rank or takers from the model
act.er = lm(sat ~ expend + rank,data=ACT)
act.et = lm(sat ~ expend + takers,data=ACT)
# compare summaries
# both models with rank or takers show that adding the first variable is most important ---
# adding the other helps (but not as much)
summary(act.aic)
summary(act.er)
summary(act.et)
# the estimated slopes can change markedly when terms are dropped
coef(act.aic)
coef(act.er)
coef(act.et)
# For the purpose of prediction, a model with all three terms is acceptable
# For a purpose of parameter estimation, it is preferable to select either takers or rank but not both
# We can repeat this using BIC
# BIC simply uses a larger penalty per parameter
# here, the best models according to BIC agree with those from AIC
act.n = nrow(ACT)
act.bic = step(act.full,k=log(act.n))
act.bic0 = act.aic0 = step(act.0,scope=act.scope,k=log(act.n))
summary(act.bic)
summary(act.bic0)
# Let's repeat the analysis for the SAT states
detach(ACT)
satStates = !actStates
SATprimary = SAT[satStates,]
attach(SATprimary)
# plots show extreme values related to Alaska, especially in several variables
# Alaska is certainly an outlier
pairs(SATprimary[,-1])
# Two searches for AIC find different models
sat.scope = formula(sat ~ takers + income + years + public + expend + rank)
Sat.full = lm(sat.scope,data=SATprimary)
sat.aic = step(sat.full)
sat.0 = lm(sat ~ 1,data=SATprimary)
sat.aic0 = step(sat.0,scope=sat.scope,data=SATprimary)
# AIC is only defined up to a constant
# extractAIC and AIC disagree on this constant
# extractAIC agrees with AIC computed using step
# compare AIC for the two models
# sat.aic is better
extractAIC(sat.aic)
extractAIC(sat.aic0)
summary(sat.aic)
summary(sat.aic0)
# Cook's distance from a model using only expend shows Alaska is influential
plot(sat.aic0,which=4)
# we should drop Alaska to see the difference
detach(SATprimary)
noAlaska = (satStates & SAT$state != "Alaska")
SATalaska = SAT[noAlaska,]
attach(SATalaska)
pairs(SATalaska[,-1])
noa.full = lm(sat.scope,data=SATalaska)
noa.aic = step(noa.full)
noa.0 = lm(sat ~ 1,data=SATalaska)
noa.aic0 = step(noa.0,scope=sat.scope,data=SATalaska)