# Logistic Regression CaseStudy # Bret Larget # February 19, 2008 # # Objective is to study variables from a storm that can predict # if there will be runoff. # # Data is collected over a four-year period from a home in Madison, Wisconsin # For each rain storm, data is collected # For our purposes today, we want to predict a binary response, whether or not there is runoff. # There is also data of what the runoff is when there is some. # # Read in the data # runoff = read.table("Runoff1.csv",header=T,sep=",") str(runoff) summary(runoff) # StormDuration --- How long the storm lasts in minutes # LastStorm --- Time since the previous storm in minutes # Precip --- Amount of rainfall in inches # MaxIntensity5 --- Hourly rain amount if maximum rainfall in 5 minute period lasted for an hour # MaxIntensity10 --- Hourly rain amount if maximum rainfall in 10 minute period lasted for an hour # MaxIntensity15 --- Hourly rain amount if maximum rainfall in 15 minute period lasted for an hour # MaxIntensity30 --- Hourly rain amount if maximum rainfall in 30 minute period lasted for an hour # MaxIntensity60 --- Hourly rain amount if maximum rainfall in 60 minute period lasted for an hour # EI --- Unknown # Runoff --- Amount of runoff in liters # RunoffEvent --- Indicator if there is a positive runoff # From the original data, we found several problems # -9 had been used for missing values; it is better to use NA # StormDuration had one value of over 300,000 which is 200 days # --- the study had been discontinued from April to December 2004 # --- changed value to missing for the first December storm after recontinuation of the study # --- other large observations are real (snow storms do not count, so measures for early spring # or winter storms can be misleading as there would be precipitation. # # Read in clean data # runoff = read.table("runoffClean.txt",header=T) str(runoff) summary(runoff) # # Make some plots of data # library(lattice) # Side-by-side boxplots are one way to compare distributions # It is preferable to use the horizontal axis to show the most important variable # This graph shows two boxplots, one for the events where there is runoff, one for those where there is not # Box-and-whisker plots only show outliers and quartiles ... you cannot see modes or shape # # Recall that we must print() or plot() lattice plotting calls read in from a script plot( bwplot(factor(RunoffEvent) ~ StormDuration,runoff) ) # Histograms are another choice # This puts histograms in two separate panels plot( histogram(~ StormDuration | factor(RunoffEvent),runoff) ) # We can adjust the layout to 1 column and 2 rows for better comparison plot( histogram(~ StormDuration | factor(RunoffEvent),runoff,layout=c(1,2)) ) # A density plot can be advantageous over a histogram # as it does not depend on arbitrary choices for bin width or starting points. # Here are two separate panels, stacked on top of one another plot( densityplot(~ StormDuration | factor(RunoffEvent),runoff,layout=c(1,2)) ) # This overlays two density plots in the same panel plot( densityplot(~ StormDuration,groups=factor(RunoffEvent),data=runoff) ) # Add a key plot( densityplot(~ StormDuration,groups=factor(RunoffEvent),data=runoff,auto.key=T) ) # Display the key more nicely in two columns plot( densityplot(~ StormDuration,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) # Plots show that Storm Density is not highly predictive for indicating a runoff event # # Consider other variables # # LastStorm = time since previous storm in minutes plot( densityplot(~ LastStorm,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) # Change to days for better interpretability plot( densityplot(~ LastStorm/60/24,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) # Add a title plot( densityplot(~ LastStorm/60/24,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2),main="Title") ) # Here, there is a long right tail for the non-event distribution # This corresponds to measures in the winter # LastStorm does not look very predictive # Precip is the total storm precipitation in inches plot( densityplot(~ Precip,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) # Clearly predictive # Storms above some threshhold always produce runoff # Interesting to see why some low total precipitation storms might produce runoff # All intensity plots look informative and look to contain very similar information plot( densityplot(~ MaxIntensity5,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) plot( densityplot(~ MaxIntensity10,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) plot( densityplot(~ MaxIntensity15,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) plot( densityplot(~ MaxIntensity30,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) plot( densityplot(~ MaxIntensity60,groups=factor(RunoffEvent),data=runoff,auto.key=list(columns=2)) ) # # Examine correlations between intensity variables # # This would fail because of NAs in the data # cor(runoff[,4:8]) # This fails because na.rm is the wrong argument # cor(runoff[,4:8],na.rm=T) # Here is how to access help about the command # ?cor # This command eliminates NAs on a pairwise basis print( cor(runoff[,4:8],use="p") ) # Same command, but rounded to two decimal places plot( round(cor(runoff[,4:8],use="p"),2) ) # Command to show a scatter plot matrix plot( splom(runoff[,4:8]) ) # # Fit logistic regression models # # First model uses only Precip as an input fit1 = glm(RunoffEvent ~ Precip,data=runoff,family=binomial) # Display the results library(arm) print( display(fit1) )