3 Multiple linear regression
Packages required (plus any dependencies)
DAAG car MASS AICcmodavg leaps BayesFactor splines
Additionally, Hmisc and knitr are required in order to process the Rmd source file.
Section 3.1 Basic ideas: the allbacks book weight data
<- DAAG::allbacks # Place the data in the workspace
allbacks <- lm(weight ~ volume+area, data=allbacks)
allbacks.lm print(coef(summary(allbacks.lm)), digits=2)
<- range(allbacks$volume)
xlim <- xlim+c(-.075,.075)*diff(xlim)
xlim ## Plot of weight vs volume: data frame allbacks (DAAG)
plot(weight ~ volume, data=allbacks, pch=c(16,1)[unclass(cover)],
lwd=1.25, xlim=xlim, fg="gray")
## unclass(cover) gives the integer codes that identify levels
## As text() does not accept the parameter data, use with()
## to specify the data frame.
with(allbacks, text(weight ~ volume, labels=paste(1:15), cex=0.75, offset=0.35,
pos=c(2,4)[unclass(cover)]))
legend(x='topleft', pch=c(16,1), legend=c("hardback ","softback"),
horiz=T, bty="n", xjust=0.5, x.intersp=0.75, )
## Correlations between estimates -- model with intercept
round(summary(allbacks.lm, corr=TRUE)$correlation, 3)
<- capture.output(summary(allbacks.lm,digits=2))
out cat(out[15:17], sep='\n')
## 5% critical value; t-statistic with 12 d.f.
qt(0.975, 12)
cat(out[5:7], sep='\n')
Subsection 3.1.1: A sequential analysis of variance table
anova(allbacks.lm)
Omission of the intercept term
## Show rows 1, 7, 8 and 15 only
model.matrix(allbacks.lm)[c(1,7,8,15), ]
## NB, also, code that returns the data frame used
model.frame(allbacks.lm)
<- lm(weight ~ -1+volume+area, data=allbacks)
allbacks.lm0 print(coef(summary(allbacks.lm0)), digits=2)
## Correlations between estimates -- no intercept
print(round(summary(allbacks.lm0, corr=TRUE)$correlation, 3))
Subsection 3.1.2: Diagnostic plots
<- lm(weight ~ -1+volume+area, data=allbacks)
allbacks.lm0 plot(allbacks.lm0, caption=c('A: Resids vs Fitted', 'B: Normal Q-Q',
'C: Scale-Location', '', 'D: Resids vs Leverage'), cex.caption=0.85,
fg='gray')
## To show all plots in the one row, precede with
par(mfrow=c(1,4)) # Follow with par(mfrow=c(1,1))
## The following has the default captions
plot(allbacks.lm0)
<- lm(weight ~ -1+volume+area, data=allbacks[-13, ])
allbacks.lm13 print(coef(summary(allbacks.lm13)), digits=2)
Section 3.2 The interpretation of model coefficients
Subsection 3.2.1: Times for Northern Irish hill races
<- par(fg='gray20',col.axis='gray20',lwd=0.5,col.lab='gray20')
oldpar <- within(DAAG::nihills, {mph <- dist/time; gradient <- climb/dist})
nihr <- nihr[, c("time", "dist", "climb", "gradient", "mph")]
nihr <- c("\ntime\n(hours)","\ndist\n(miles)","\nclimb\n(feet)",
varLabs "\ngradient\n(ft/mi)", "\nmph\n(mph)")
<- list(col.smooth='red', lty.smooth=2, lwd.smooth=0.5, spread=0)
smoothPars ::spm(nihr, cex.labels=1.2, regLine=FALSE, col='blue',
caroma=c(1.95,3,4, 3), gap=.25, var.labels=varLabs, smooth=smoothPars)
title(main="A: Untransformed scales:", outer=TRUE,
adj=0, line=-1.0, cex.main=1, font.main=1)
## Panel B: Repeat with log(nihills) in place of nihills,
## and with variable labels suitably modified.
<- c("\ntime\n(log h)","\ndist\n(log miles)", "\nclimb\n(log feet)",
varLabs "\ngradient\n(log ft/mi)", "\nmph\n(log mph)")
::spm(log(nihr), regLine=FALSE, col="blue", oma=c(1.95,2.5,4, 2.5),
cargap=.25, var.labels=varLabs, smooth=smoothPars)
title("B: Logarithmic scales", outer=TRUE,
adj=0, line=-1.0, cex.main=1, font.main=1)
par(oldpar)
Subsection 3.2.2: An equation that predicts dist/time
## Hold climb constant at mean on logarithmic scale
<- lm(mph ~ log(dist)+log(climb), data = nihr)
mphClimb.lm ## Hold `gradient=climb/dist` constant at mean on logarithmic scale
<- lm(mph ~ log(dist)+log(gradient), data = nihr)
mphGradient.lm <- mean(nihr$mph)
avRate <- coef(mphClimb.lm)
bClimb <- c(bClimb[1]+bClimb[3]*mean(log(nihr$climb)), bClimb[2])
constCl <- coef(mphGradient.lm)
bGradient <- c(bGradient[1]+bGradient[3]*mean((log(nihr$climb/nihr$dist))),
constSl 2])
bGradient[# Use `dist` and `climb` as explanatory variables
coef(mphClimb.lm)
# Use `dist` and `gradient` as explanatory variables
coef(mphGradient.lm)
<- par(mfrow=c(1,2), mgp=c(2.25,0.5,0), mar=c(3.6,4.1,2.1,1.6))
opar <- c("red", adjustcolor("magenta",0.4))
lineCols <-substitute(paste("Minutes per mile (Add ", ym, ")"), list(ym=round(avRate,2)))
yaxlab::crPlots(mphClimb.lm, terms = . ~ log(dist), xaxt='n',
carxlab="Distance", col.lines=lineCols, ylab=yaxlab)
axis(2, at=4:7, labels=paste(4:7))
<- c(4,8,16,32)
labx axis(1, at=log(2^(2:5)), labels=paste(2^(2:5)))
box(col="white")
mtext("A: Hold climb constant at mean value", adj=0,
line=0.8, at=0.6, cex=1.15)
::crPlots(mphGradient.lm, terms = . ~log(dist), xaxt='n',
carxlab="Distance", col.lines=lineCols, ylab=yaxlab)
axis(1, at=log(2^(2:5)), labels=paste(2^(2:5)))
axis(2, at=4:7, labels=paste(4:7))
box(col="white")
mtext("B: Hold log(gradient) constant at mean", adj=0, line=0.8, at=0.6, cex=1.15)
par(opar)
summary(mphClimb.lm, corr=T)$correlation["log(dist)", "log(climb)"]
summary(mphGradient.lm, corr=T)$correlation["log(dist)", "log(gradient)"]
## Show the plots, with default captions
plot(mphClimb.lm, fg='gray')
plot(mphGradient.lm, caption=c('A: Resids vs Fitted', 'B: Normal Q-Q',
'C: Scale-Location', '', 'D: Resids vs Leverage'),
cex.caption=0.85, fg='gray')
Subsection 3.2.3: Equations that predict log(time)
<- setNames(log(nihr), paste0("log", names(nihr)))
lognihr <- lm(logtime ~ logdist + logclimb, data = lognihr) timeClimb.lm
print(coef(summary(timeClimb.lm)), digits=2)
<- lm(logtime ~ logdist + loggradient, data=lognihr)
timeGradient.lm print(coef(summary(timeGradient.lm)), digits=3)
Subsection 3.2.4: Book dimensions — the oddbooks dataset
<- par(fg='gray40',col.axis='gray20',lwd=0.5,col.lab='gray20')
oldpar ## Code for Panel A
<- DAAG::oddbooks
oddbooks pairs(log(oddbooks), lower.panel=panel.smooth, upper.panel=panel.smooth,
labels=c("log(thick)", "log(breadth)", "log(height)", "log(weight)"),
gap=0.25, oma=c(1.95,1.95,4, 1.95), col='blue')
title(main="A: Columns from log(oddbooks)",
outer=TRUE, adj=0, line=-1.0, cex.main=1.1, font.main=1)
## Panel B
<-
oddothers with(oddbooks, data.frame(density = weight/(breadth*height*thick),
area = breadth*height, thick=thick, weight=weight))
pairs(log(oddothers), lower.panel=panel.smooth, upper.panel=panel.smooth,
labels=c("log(density)", "log(area)", "log(thick)", "log(weight)"),
gap=0.5, oma=c(1.95,1.95,4, 1.95), col='blue')
title("B: Add density & area; omit breadth & height",
outer=TRUE, adj=0, line=-1.0, cex.main=1.1, font.main=1)
par(oldpar)
<- lm(log(weight) ~ log(thick)+log(breadth)+log(height),
lob3.lm data=oddbooks)
# coef(summary(lob3.lm))
<- lm(log(weight) ~ log(thick)+log(breadth), data=oddbooks)
lob2.lm coef(summary(lob2.lm))
<- lm(log(weight) ~ 1, data=oddbooks)
lob0.lm add1(lob0.lm, scope=~log(breadth) + log(thick) + log(height))
<- update(lob0.lm, formula=. ~ .+log(breadth)) lob1.lm
round(rbind("lob1.lm"=predict(lob1.lm), "lob2.lm"=predict(lob2.lm),
"lob3.lm"=predict(lob3.lm)),2)
<- within(oddbooks, density <- weight/(thick*breadth*height))
oddbooks lm(log(weight) ~ log(density), data=oddbooks) |> summary() |> coef() |>
round(3)
## Code that the reader may care to try
lm(log(weight) ~ log(thick)+log(breadth)+log(height)+log(density),
data=oddbooks) |> summary() |> coef() |> round(3)
Subsection 3.2.5: Mouse brain weight example
<- par(fg='gray40',col.axis='gray20',lwd=0.5,col.lab='gray20')
oldpar <- DAAG::litters
litters pairs(litters, labels=c("lsize\n\n(litter size)", "bodywt\n\n(Body Weight)",
"brainwt\n\n(Brain Weight)"), gap=0.5, fg='gray',
col="blue", oma=rep(1.95,4))
par(oldpar)
## Regression of brainwt on lsize
summary(lm(brainwt ~ lsize, data = litters), digits=3)$coef
## Regression of brainwt on lsize and bodywt
summary(lm(brainwt ~ lsize + bodywt, data = litters), digits=3)$coef
Subsection 3.2.6: Issues for causal interpretation
Section 3.3 Choosing the model, and checking it out
Subsection 3.3.1 Criteria for moddel choice
Subsection 3.3.2 Plots that show the contribution of individual terms
<- lm((weight) ~ log(thick)+log(height)+log(breadth),
oddbooks.lm data=DAAG::oddbooks)
<- predict(oddbooks.lm, type="terms") yterms
Subsection 3.3.3: A more formal approach to the choice of transformation
## Use car::powerTransform
<- within(DAAG::nihills, {mph <- dist/time; gradient <- climb/dist})
nihr summary(car::powerTransform(nihr[, c("dist", "gradient")]), digits=3)
<- mph ~ log(dist) + log(gradient)
form summary(car::powerTransform(form, data=nihr))
Subsection 3.3.4: Accuracy estimates, fitted values and new observations
<- log(DAAG::nihills)
lognihr names(lognihr) <- paste0("log", names(lognihr))
<- lm(logtime ~ logdist + logclimb, data = lognihr)
timeClimb.lm ## Coverage intervals; use exp() to undo the log transformation
<- exp(predict(timeClimb.lm, interval="confidence"))
citimes ## Prediction intervals, i.e., for new observations
<- exp(predict(timeClimb.lm, newdata=lognihr, interval="prediction"))
pitimes ## fit ci:lwr ci:pwr pi:lwr pi:upr
<- cbind(citimes, pitimes[,2:3])
ci_then_pi colnames(ci_then_pi) <- paste0(c("", rep(c("ci-","pi-"), c(2,2))),
colnames(ci_then_pi))
## First 4 rows
print(ci_then_pi[1:4,], digits=2)
<- update(timeClimb.lm, formula = . ~ . + I(logdist^2))
timeClimb2.lm .10 <-
g3function(model1=timeClimb.lm, model2=timeClimb2.lm)
{## Panel A
<- predict(model1, interval="confidence")
citimes <- order(citimes[,"fit"])
ord <- citimes[ord,]
citimes <- citimes[,"fit"]
hat <- predict(model1, newdata=lognihr, interval="prediction")[ord,]
pitimes <- log(nihr[ord,"time"])
logobs <- pretty(exp(hat))
xtiks <- range(c(pitimes[,"lwr"], pitimes[,"upr"], logobs)-rep(hat,3))
ylim <- pretty(ylim,5)
logytiks <- round(exp(logytiks),2)
ytiks <- range(hat)
xlim plot(hat, citimes[,"lwr"]-hat, type="n", xlab = "Time (fitted)",
ylab = "Difference from fit",
xlim=xlim, ylim = ylim, xaxt="n", yaxt="n", fg="gray")
mtext(side=3, line=0.75, adj=0, at=-2.0, "A: CIs and PIs: Mean, prediction")
mtext(side=4, line=1.25, "exp(Difference from fit)", las=0)
axis(1, at=log(xtiks), labels=paste(xtiks), lwd=0, lwd.ticks=1)
axis(2, at=logytiks, las=1, lwd=0, lwd.ticks=1)
axis(4, at=logytiks, labels=paste(ytiks), las=0, lwd=0, lwd.ticks=1)
points(hat, logobs-hat, pch=16, cex=0.65)
lines(hat, citimes[,"lwr"]-hat, col = "red")
lines(hat, citimes[,"upr"]-hat, col = "red")
lines(hat, pitimes[,"lwr"]-hat, col = "black")
lines(hat, pitimes[,"upr"]-hat, col = "black")
## Panel B
<- predict(model2, interval="confidence")[ord,]
citimes2 plot(hat, citimes[,"lwr"]-hat, type="n", xlab = "Time (fitted)",
ylab = "Difference from fit",
xlim=xlim, ylim = ylim, xaxt="n", yaxt="n", fg="gray")
mtext(side=3, line=0.75, adj=0, at=-2.0,
"B: CIs for fit, compare two models")
mtext(side=4, line=1.25, "exp(Difference from fit)", las=0)
axis(1, at=log(xtiks), labels=paste(xtiks), lwd=0, lwd.ticks=1)
axis(2, at=logytiks,las=1, lwd=0, lwd.ticks=1)
axis(4, at=logytiks, labels=paste(ytiks), las=0,, lwd=0, lwd.ticks=1)
points(hat, logobs-hat, pch=16, cex=0.65)
lines(hat, citimes[,"lwr"]-hat, col = "red")
lines(hat, citimes[,"upr"]-hat, col = "red")
<- citimes2[,"fit"]
hat2 lines(hat, citimes2[,"lwr"]-hat2, col = "blue", lty=2, lwd=1.5)
lines(hat, citimes2[,"upr"]-hat2, col = "blue", lty=2, lwd=1.5)
}
<- update(timeClimb.lm, formula = . ~ . + I(logdist^2)) timeClimb2.lm
Subsection 3.3.5: Choosing the model — deaths from Atlantic hurricanes
<- par(fg='gray20',col.axis='gray20',lwd=0.5,col.lab='gray20')
oldpar <- DAAG::hurricNamed[,c("LF.PressureMB", "BaseDam2014", "deaths")]
hurric <- car::powerTransform(hurric, family="yjPower")
thurric <- car::yjPower(hurric, coef(thurric, round=TRUE))
transY <- list(col.smooth='red', lty.smooth=2, lwd.smooth=1, spread=0)
smoothPars ::spm(transY, lwd=0.5, regLine=FALSE, oma=rep(2.5,4), gap=0.5,
carcol="blue", smooth=smoothPars, cex.labels=1)
par(oldpar)
<- deaths ~ log(BaseDam2014) + LF.PressureMB
modelform <- car::powerTransform(modelform, data=as.data.frame(hurric),
powerT family="yjPower")
summary(powerT, digits=3)
<- with(hurric, car::yjPower(deaths, lambda=-0.2))
deathP <- MASS::rlm(deathP ~ log(BaseDam2014) + LF.PressureMB, data=hurric)
power.lm print(coef(summary(power.lm)),digits=2)
## Use (deaths+1)^(-0.2) as outcome variable
plot(power.lm, cex.caption=0.85, fg="gray",
caption=list('A: Resids vs Fitted', 'B: Normal Q-Q', 'C: Scale-Location', '',
'D: Resids vs Leverage'))
Subsection 3.3.6: Strategies for fitting models — suggested steps
Section 3.4 Robust regression, outliers, and influence
Subsection 3.4.1: Making outliers obvious — robust regression
<- DAAG::hills2000[,c("dist", "climb", "time")]
hills2000 <- c("\ndist\n(log miles)", "\nclimb\n(log feet)", "\ntime\n(log hours)")
varLabels <- list(col.smooth='red', lty.smooth=2, lwd.smooth=1, spread=0)
smoothPars <- DAAG::hills2000[,c("dist", "climb", "time")]
hills2000 <- c("\ndist\n(log miles)", "\nclimb\n(log feet)", "\ntime\n(log hours)")
varLabels ::spm(log(hills2000), smooth=smoothPars, regLine=FALSE, cex.labels=1.5,
carvar.labels = varLabels, lwd=0.5, gap=0.5, oma=c(1.95,1.95,1.95,1.95))
## Panel A
<- lm(log(time) ~ log(climb) + log(dist), data = hills2000)
lhills2k.lm plot(lhills2k.lm, caption="", which=1, fg="gray", col=adjustcolor("black", alpha=0.8))
mtext(side=3, line=0.75, "A: Least squares (lm) fit", adj=0, cex=1.1)
## Panel B
<- MASS::lqs(log(time) ~ log(climb) + log(dist), data = hills2000)
lhills2k.lqs <- residuals(lhills2k.lqs)
reres <- fitted(lhills2k.lqs)
refit <- which(abs(reres) >= sort(abs(reres), decreasing=TRUE)[3])
big3 plot(reres ~ refit, xlab="Fitted values (resistant fit)",
ylab="Residuals (resistant fit)", col=adjustcolor("black", alpha=0.8), fg="gray")
lines(lowess(reres ~ refit), col=2)
text(reres[big3] ~ refit[big3], labels=rownames(hills2000)[big3],
pos=4-2*(refit[big3] > mean(refit)), cex=0.8)
mtext(side=3, line=0.75, "B: Resistant (lqs) fit", adj=0, cex=1.1)
## Show only the 2nd diognostic plot, i.e., a normal Q-Q plot
## plot(lhills2k.lm, which=2)
Subsection 3.4.2: Leverage, influence, and Cook’s distance
\(^*\) Leverage and the hat matrix — technical details
round(unname(hatvalues(timeClimb.lm)),2)
Dynamic graphics
## Residuals versus leverages
<- DAAG::nihills
nihills <- lm(log(time) ~ log(dist) + log(climb), data = nihills)
timeClimb.lm plot(timeClimb.lm, which=5, add.smooth=FALSE, ps=9, sub.caption="",
cex.caption=1.1, fg="gray")
## The points can alternatively be plotted using
## plot(hatvalues(model.matrix(timeClimb.lm)), residuals(timeClimb.lm))
## Residuals versus leverages
plot(timeClimb.lm, which=5, add.smooth=FALSE)
## The points can alternatively be plotted using
## plot(hatvalues(model.matrix(timeClimb.lm)), residuals(timeClimb.lm))
## This code is designed to be evaluated separately from other chunks
with(nihills, scatter3d(x=log(dist), y=log(climb), z=log(time), grid=FALSE,
point.col="black", surface.col="gray60",
surface.alpha=0.2, axis.scales=FALSE))
with(nihills, Identify3d(x=log(dist), y=log(climb), z=log(time),
labels=row.names(DAAG::nihills), minlength=8), offset=0.05)
## To rotate display, hold down the left mouse button and move the mouse.
## To put labels on points, right-click and drag a box around them, perhaps
## repeatedly. Create an empty box to exit from point identification mode.
Influence on the regression coefficients
## Residuals versus leverages
<- DAAG::nihills
nihills <- lm(log(time) ~ log(dist) + log(climb), data = nihills)
timeClimb.lm plot(timeClimb.lm, which=5, add.smooth=FALSE, ps=9, sub.caption="",
cex.caption=1.1, fg="gray")
## The points can alternatively be plotted using
## plot(hatvalues(model.matrix(timeClimb.lm)), residuals(timeClimb.lm))
*Additional diagnostic plots
## As an indication of what is available, try
::influencePlot(allbacks.lm) car
Section 3.5 Assessment and comparison of regression models
Subsection 3.5.1: *AIC, AICc, BIC, and Bayes Factors for normal theory regression models
## Calculations using mouse brain weight data
<- lm(brainwt ~ lsize+bodywt, data=DAAG::litters)
mouse.lm <- update(mouse.lm, formula = . ~ . - lsize) mouse0.lm
<- sapply(list(mouse0.lm, mouse.lm), AICcmodavg::AICc)
aicc <- cbind(AIC(mouse0.lm, mouse.lm), AICc=aicc,
infstats BIC=BIC(mouse0.lm, mouse.lm)[,-1])
print(rbind(infstats, "Difference"=apply(infstats,2,diff)), digits=3)
library(lattice)
<- data.frame(n=5:35, AIC=rep(2,31), BIC=log(5:35))
df <- function(n,p,d) 2*(p+d)*n/(n-(p+d)-1) - 2*p*n/(n-p-1)
cfAICc <- cbind(df, AICc12=cfAICc(5:35,1,1), AICc34=cfAICc(5:35,3,1))
df <- sort(c(2^(0:6),2^(0:6)*1.5))
labs xyplot(AICc12+AICc34+AIC+BIC ~ n, data=df, type='l', auto.key=list(columns=4),
scales=list(y=list(log=T, at=labs, labels=paste(labs))),
par.settings=simpleTheme(lty=c(1,1:3), lwd=2, col=rep(c('gray','black'), c(1,3))))
The functions drop1() and add1()
## Obtain AIC or BIC using `drop1()` or `add1()`
<- nrow(DAAG::litters)
n drop1(mouse.lm, scope=~lsize) # AIC, with/without `lsize`
drop1(mouse.lm, scope=~lsize, k=log(n)) # BIC, w/wo `lsize`
add1(mouse0.lm, scope=~bodywt+lsize) # AIC, w/wo `lsize`, alternative
The use of Bayesfactor::lmBF to compare the two models
suppressPackageStartupMessages(library(BayesFactor))
<- lmBF(brainwt ~ bodywt, data=DAAG::litters)
bf1 <- lmBF(brainwt ~ bodywt+lsize, data=DAAG::litters)
bf2 /bf1 bf2
## Relative support statistics
setNames(exp(-apply(infstats[,-1],2,diff)/2), c("AIC","AICc","BIC"))
Subsection 3.5.2: Using anova() to compare models — the ihills data
<- log(DAAG::nihills)
lognihr <- setNames(log(nihr), paste0("log", names(nihr)))
lognihr <- lm(logtime ~ logdist + logclimb, data = lognihr)
timeClimb.lm <- update(timeClimb.lm, formula = . ~ . + I(logdist^2))
timeClimb2.lm print(anova(timeClimb.lm, timeClimb2.lm, test="F"), digits=4)
print(anova(timeClimb.lm, timeClimb2.lm, test="Cp"), digits=3)
## Compare with the AICc difference
sapply(list(timeClimb.lm, timeClimb2.lm), AICcmodavg::AICc)
<- update(formula(timeClimb.lm), ~ . + I(logdist^2) + logdist:logclimb)
form1 <- add1(timeClimb.lm, scope=form1, test="F")
addcheck print(addcheck, digits=4)
Subsection 3.5.3: Training/test approaches, and cross-validation
## Check how well timeClimb.lm model predicts for hills2000 data
<- lm(logtime ~ logdist + logclimb, data = lognihr)
timeClimb.lm <- log(subset(DAAG::hills2000,
logscot !row.names(DAAG::hills2000)=="Caerketton"))
names(logscot) <- paste0("log", names(hills2000))
<- predict(timeClimb.lm, newdata=logscot, se=TRUE)
scotpred <- summary(timeClimb.lm)[["sigma"]]^2
trainVar <- summary(timeClimb.lm)[["df"]][2]
trainDF <- mean((logscot[,'logtime']-scotpred[['fit']])^2)
mspe <- nrow(logscot) mspeDF
pf(mspe/trainVar, mspeDF, trainDF, lower.tail=FALSE)
<- lm(logtime ~ logdist+logclimb, data=logscot)
scot.lm signif(summary(scot.lm)[['sigma']]^2, 4)
Subsection 3.5.4: Further points and issues
Patterns in the diagnostic plots – are they more than hints?
{r 3_18, eval=F|
Section 3.6 Problems with many explanatory variables
Subsection 3.6.1: Variable selection issues
Variable selection – a simulation with random data
<- rnorm(100)
y ## Generate a 100 by 40 matrix of random normal data
<- matrix(rnorm(4000), ncol = 40)
xx dimnames(xx)<- list(NULL, paste("X",1:40, sep=""))
## ## Find the best fitting model. (The 'leaps' package must be installed.)
<- leaps::regsubsets(xx, y, method = "exhaustive", nvmax = 3, nbest = 1)
xx.subsets <- summary(xx.subsets)$which[3,-1]
subvar <- lm(y ~ -1+xx[, subvar])
best3.lm print(summary(best3.lm, corr = FALSE))
## DAAG::bestsetNoise(m=100, n=40)
<- capture.output(DAAG::bestsetNoise(m=100, n=40))
best3 cat(best3[9:14], sep='\n')
The extent of selection effects – a detailed simulation:
<- par(fg='gray20',col.axis='gray20',lwd=0.5,col.lab='gray20')
oldpar set.seed(41)
library(splines)
::bsnVaryNvar(nvmax=3, nvar = 3:35, xlab="")
DAAGmtext(side=1, line=1.75, "Number selected from")
Subsection 3.6.2: Multicollinearity
An example – compositional data
data(Coxite, package="compositions") # Places Coxite in the workspace
# NB: Proceed thus because `Coxite` is not exported from `compositions`
<- as.data.frame(Coxite) coxite
<- par(fg='gray20',col.axis='gray20',lwd=0.5,col.lab='gray20', tcl=-0.25)
oldpar <- function(x, y, digits = 3, prefix = "", cex.cor=0.8, ...)
panel.cor
{<- par(usr = c(0, 1, 0, 1)); on.exit(par(old.par))
old.par <- abs(cor(x, y))
r <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
txt if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * sqrt(r))
}pairs(coxite, gap=0.4, col=adjustcolor("blue", alpha=0.9), upper.panel=panel.cor)
par(oldpar)
<- lm(porosity ~ A+B+C+D+E+depth, data=coxite)
coxiteAll.lm print(coef(summary(coxiteAll.lm)), digits=2)
<- lm(porosity ~ A+B+C+D+E+depth, data=coxite)
coxiteAll.lm <- predict(coxiteAll.lm, interval="confidence")
coxite.hat <- coxite.hat[,"fit"]
hat plot(porosity ~ hat, data=coxite, fg="gray", type="n", xlab="Fitted values",
ylab="Fitted values, with 95% CIs\n(Points are observed porosities)",
tcl=-0.35)
with(coxite, points(porosity ~ hat, cex=0.75, col="gray45"))
lines(hat, hat, lwd=0.75)
<- order(hat)
ord <- function(x, y1, y2, eps=0.15, lwd=0.75){
sebar lines(rep(x,2), c(y1,y2), lwd=lwd)
lines(c(x-eps,x+eps), rep(y1,2), lwd=lwd)
lines(c(x-eps,x+eps), rep(y2,2), lwd=lwd)
}<- ord[round(quantile(1:length(hat), (1:9)/10))]
q for(i in q)sebar(hat[i], coxite.hat[i,"lwr"], coxite.hat[i,"upr"])
<- lm(porosity ~ A+B+C+D+E+depth, data=coxite)
coxiteAll.lm <- predict(coxiteAll.lm, interval="confidence")
coxite.hat <- coxite.hat[,"fit"] hat
## Pointwise confidence bounds can be obtained thus:
<- predict(coxiteAll.lm, interval="confidence", level=0.95) hat
Subsection 3.6.3: The variance inflation factor (VIF)
print(DAAG::vif(lm(porosity ~ A+B+C+D+depth, data=coxite)), digits=2)
<- leaps::regsubsets(porosity ~ ., data=coxite, nvmax=4, method='exhaustive')
b ## The calculation fails for nvmax=5
<- summary(b)[["which"]]
inOut ## Extract and print the coefficents for the four regressions
<- list(rep("",4),c("Intercept", colnames(coxite)[-7]))
dimnam <- matrix(nrow=4, ncol=7, dimnames=dimnam)
cmat for(i in 1:4)cmat[i,inOut[i,]] <- signif(coef(b,id=1:4)[[i]],3)
<- cbind(cmat," "=rep(NA,4),
outMat as.matrix(as.data.frame(summary(b)[c("adjr2", "cp", "bic")])))
print(signif(outMat,3),na.print="")
<- lm(porosity ~ B+C, data=coxite)
BC.lm print(signif(coef(summary(BC.lm)), digits=3))
::vif(BC.lm) car
## Diagnostic plots can be checked thus:
plot(BC.lm, eval=xtras)
Numbers that do not quite add up
<- coxite
coxiteR 1:5] <- round(coxiteR[, 1:5])
coxiteR[, <- lm(porosity ~ ., data=coxiteR)
coxiteR.lm print(coef(summary(coxiteR.lm)), digits=2)
print(DAAG::vif(lm(porosity ~ .-E, data=coxiteR)), digits=2)
Section 3.7 Errors in x
Simulations of the effect of measurement error
<- DAAG::errorsINx(gpdiff=0, plotit=FALSE, timesSDx=(1:4)/2,
gph layout=c(5,1), print.summary=FALSE)[["gph"]]
<- DAAG::DAAGtheme(color=FALSE, alpha=0.6, lwd=2,
parset col.points=c("gray50","black"),
col.line=c("gray50","black"), lty=1:2)
update(gph, par.settings=parset)
Two explanatory variables, one measured without error – a simulation
<- DAAG::errorsINx(gpdiff=1.5, timesSDx=(1:2)*0.8, layout=c(3,1),
gph print.summary=FALSE, plotit=FALSE)[["gph"]]
<- DAAG::DAAGtheme(color=FALSE, alpha=0.6, lwd=2,
parset col.points=c("gray50","black"),
col.line=c("gray50","black"), lty=1:2)
update(gph, par.settings=parset)
Section 3.8 Multiple regression models – additional points
coef(lm(area ~ volume + weight, data=allbacks))
<- as.vector(coef(lm(weight ~ volume + area, data=allbacks)))
b c("_Intercept_"=-b[1]/b[3], volume=-b[2]/b[3], weight=1/b[3])
Subsection 3.8.2: Missing explanatory variables
<- DAAG::gaba
gaba <- stack(gaba["30", -match('min', colnames(gaba))])
gabalong $sex <- factor(rep(c("male", "female","all"), rep(2,3)),
gabalonglevels=c("female","male","all"))
$treatment <- factor(rep(c("Baclofen","No baclofen"), 3),
gabalonglevels=c("No baclofen","Baclofen"))
<- lattice::stripplot(sex~values, groups=treatment, data=gabalong,
gph panel=function(x,y,...){
::panel.stripplot(x,y,...)
lattice::ltext(x,y,paste(c(3,9,15,7,22,12)), pos=1, cex=0.8)
latticeauto.key=list(space="right", points=TRUE, cex=0.8))
}, <- list(fontsize=list(text=9, points=5),
bw9 cex=c(1.5,1.5), pch=c(1,16))
update(gph, par.settings=parset,
xlab=list("Average reduction: 30 min vs 0 min", cex=1.0),
scales=list(cex=1.0, tck=0.35))
Subsection 3.8.3: Added variable plots
<- lm(logtime ~ logclimb, data=lognihr)
yONx.lm <- resid(yONx.lm)
e_yONx print(coef(yONx.lm), digits=4)
<- lm(logdist ~ logclimb, data=lognihr)
zONx.lm <- resid(zONx.lm)
e_zONx print(coef(yONx.lm), digits=4)
<- lm(e_yONx ~ 0+e_zONx)
ey_xONez_x.lm <- resid(ey_xONez_x.lm)
e_yONxz print(coef(ey_xONez_x.lm), digits=4)
<- par(fg='gray')
oldpar ## Code for added variable plots
<- lm(logtime ~ logclimb+logdist, data=lognihr)
logtime.lm ::avPlots(logtime.lm, lwd=1, terms="logdist", fg="gray")
carmtext(side=3, line=0.5, "A: Added var: 'logdist'", col="black", adj=0, cex=1.15)
::avPlots(logtime.lm, lwd=1, terms="logclimb", fg="gray")
carmtext(side=3, line=0.5, "B: Added var: 'logclimb'", col="black", adj=0, cex=1.15)
par(oldpar)
## One call to show both plots
::avPlots(timeClimb.lm, terms=~.) car
## Alternative code for first plot
plot(e_yONx ~ e_zONx)
plot(yONx.lm, which=1, caption="", fg="gray")
mtext(side=3, line=0.5, "A: From 'logtime' on 'logclimb'", adj=0, cex=0.85)
plot(zONx.lm, which=1, caption="", fg="gray")
mtext(side=3, line=0.5, "B: From 'logdist' on 'logclimb'", adj=0, cex=0.85)
plot(ey_xONez_x.lm, which=1, caption="", fg="gray")
mtext(side=3, line=0.5, "C: From AVP", adj=-0, cex=0.85)
*Algebraic details
<- coef(yONx.lm)
ab1 <- coef(zONx.lm)
ab2 <- coef(ey_xONez_x.lm)
b2 <- ab1[2] - b2*ab2[2]
b1 <- ab1[1] - b2*ab2[1] a
coef(lm(logtime ~ logclimb + logdist, data=lognihr))
Subsection 3.8.4: Nonlinear methods – an alternative to transformation?
$climb.mi <- nihr$climb/5280
nihr<- nls(time ~ (dist^alpha)*(climb.mi^beta), start =
nihr.nls0 c(alpha = 0.68, beta = 0.465), data = nihr)
## plot(residuals(nihr.nls0) ~ log(predict(nihr.nls0)))
signif(coef(summary(nihr.nls0)),3)
<- nls(time ~ gamma + delta1*dist^alpha + delta2*climb.mi^beta,
nihr.nls start=c(gamma = .045, delta1 = .09, alpha = 1,
delta2=.9, beta = 1.65), data=nihr)
## plot(residuals(nihr.nls) ~ log(predict(nihr.nls)))
signif(coef(summary(nihr.nls)),3)
Section 3.9: Recap
Section 3.10: Further reading
Exercises (3.11)
3.1
## ## Set up factor that identifies the `have' cities
<- DAAG::cities
cities $have <- with(cities, factor(REGION %in% c("ON","WEST"),
citieslabels=c("Have-not","Have")))
<- lattice::xyplot(POP1996~POP1992, groups=have, data=cities,
gphA auto.key=list(columns=2))
<-lattice::xyplot(log(POP1996)~log(POP1992), groups=have, data=cities,
gphBauto.key=list(columns=2))
print(gphA, split=c(1,1,2,1), more=TRUE)
print(gphB, split=c(2,1,2,1))
<- lm(POP1996 ~ have+POP1992, data=cities)
cities.lm1 <- lm(log(POP1996) ~ have+log(POP1992), data=cities) cities.lm2
3.8a
<- lm(time ~ dist+climb, data=DAAG::nihills)
nihills.lm <- lm(time ~ dist+climb+dist:climb, data=DAAG::nihills)
nihillsX.lm anova(nihills.lm, nihillsX.lm) # Use `anova()` to make the comparison
coef(summary(nihillsX.lm)) # Check coefficient for interaction term
drop1(nihillsX.lm)
3.11
log(time) ~ log(dist) + log(climb) ## lm model
~ alpha*dist + beta*I(climb^2) ## nls model time
3.13
<- runif(10) # predictor which will be missing
x1 <- rbinom(10, 1, 1-x1)
x2 ## observed predictor, depends on missing predictor
<- 5*x1 + x2 + rnorm(10,sd=.1) # simulated model; coef of x2 is positive
y <- lm(y ~ factor(x2)) # model fitted to observed data
y.lm coef(y.lm)
<- lm(y ~ x1 + factor(x2)) # correct model
y.lm2 coef(y.lm2)
3.16
<- DAAG::bomregions2021
bomData <- MASS::lqs(northRain ~ SOI + CO2, data=bomData)
nraw.lqs <- MASS::lqs(I(northRain^(1/3)) ~ SOI + CO2, data=bomData)
north.lqs plot(residuals(nraw.lqs) ~ Year, data=bomData)
plot(residuals(north.lqs) ~ Year, data=bomData)
3.17f
<- subset(DAAG::repPsych, Discipline=='Social')
socpsych with(socpsych, scatter.smooth(T_r.R~T_r.O))
abline(v=.5)
<- MASS::rlm(T_r.R~T_r.O, data=subset(socpsych, T_r.O<=0.5))
soc.rlm ## Look at summary statistics
termplot(soc.rlm, partial.resid=T, se=T)
plot(soc.rlm)
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
<- knitr::knit_code$get()
code <- paste0("\n## ", names(code),"\n", sapply(code, paste, collapse='\n'))
txt writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch3.R")
}