4 Exploiting the linear model framework
Packages required (with dependencies)
DAAG effects mgcv splines scam MASS latticeExtra car WDI AICcmodavg ggplot2 kableExtra qgam patchwork
Additionally, Hmisc and knitr are required in order to process the Rmd source file.
Note the use of the ‘patchwork’ package to make it easy to place two ggplot2 plots side by side.
::knitrSet(basename="exploit", lang='markdown', fig.path="figs/g", w=7, h=7)
Hmisc<- options(digits=4, width=70, scipen=999)
oldopt library(knitr)
## knitr::render_listings()
'set']](cache.path='cache-', out.width="80%", fig.align="center",
opts_chunk[[fig.show='hold', formatR.arrow=FALSE, ps=10,
strip.white = TRUE, comment=NA, width=70,
tidy.opts = list(replace.assign=FALSE))
Section 4.1 Levels of a factor – using indicator variables
Subsection 4.1.1: Example – sugar weight
<- DAAG::sugar # Copy dataset 'sugar' into the workspace
sugar ## Ensure that "Control" is the first level
"trt"]] <- relevel(sugar[["trt"]], ref="Control")
sugar[[options()[["contrasts"]] # Check the default factor contrasts
## If your output does not agree with the above, then enter
## options(contrasts=c("contr.treatment", "contr.poly"))
<- aov(weight ~ trt, data=sugar)
sugar.aov ## To display the model matrix, enter: model.matrix(sugar.aov)
## Note the use of summary.lm(), not summary() or summary.aov()
round(signif(coef(summary.lm(sugar.aov)), 3), 4)
<- summary.lm(sugar.aov)$sigma/sqrt(3) # 3 results/trt
sem # Alternatively, sem <- 6.33/sqrt(2)
qtukey(p=.95, nmeans=4, df=8) * sem
Subsection 4.1.2: Different choices for the model matrix when there are factors
contrasts(sugar$trt) <- 'contr.sum'
<- aov(weight ~ trt, data = sugar)
sugarSum.aov round(signif(coef(summary.lm(sugarSum.aov)), 3),4)
dummy.coef(sugarSum.aov)
Factor contrasts – further details
contrasts(sugar$trt) <- "contr.sum"
<- factor(1:3, labels=c("Trout","Cod","Perch")) fish
contr.treatment(fish)
# Base is "Trout"
contr.SAS(fish)
# Base is "Perch"
contr.sum(fish)
# Base is mean of levels
Section 4.2 Block designs and balanced incomplete block designs
Subsection 4.2.1: Analysis of the rice data, allowing for block effects
<- DAAG::rice
rice <- aov(ShootDryMass ~ Block + variety * fert, data=rice)
ricebl.aov print(summary(ricebl.aov), digits=3)
round(signif(coef(summary.lm(ricebl.aov)), 3), 5)
with(summary.lm(ricebl.aov),
cat("Residual standard error: ", sigma, "on", df[2], "degrees of freedom"))
## AOV calculations, ignoring block effects
<- aov(ShootDryMass ~ variety * fert, data=rice)
rice.aov summary.lm(rice.aov)$sigma
<- aov(ShootDryMass ~ factor(Block) + variety * fert, data=rice) ricebl.aov
model.tables(ricebl.aov, type="means", se=TRUE, cterms="variety:fert")
Subsection 4.2.2: A balanced incomplete block design
<- DAAG::appletaste
appletaste with(appletaste, table(product, panelist))
sapply(appletaste, is.factor) # panelist & product are factors
<- aov(aftertaste ~ product + panelist, data=appletaste)
appletaste.aov summary(appletaste.aov)
as.data.frame(effects::Effect("product", appletaste.aov, confidence.level=0.95))
## NB that 'product' was first term in the model formula
## Thus, the 1st 4 coefficients have the information required
coef(summary.lm(appletaste.aov))[1:4, ]
Section 4.3 Fitting multiple lines
## Fit various models to columns of data frame leaftemp (DAAG)
<- DAAG::leaftemp
leaftemp <- lm(tempDiff ~ 1 , data = leaftemp)
leaf.lm1 <- lm(tempDiff ~ vapPress, data = leaftemp)
leaf.lm2 <- lm(tempDiff ~ CO2level + vapPress, data = leaftemp)
leaf.lm3 <- lm(tempDiff ~ CO2level + vapPress +
leaf.lm4 :CO2level, data = leaftemp) vapPress
anova(leaf.lm1, leaf.lm2, leaf.lm3, leaf.lm4)
print(coef(summary(leaf.lm3)), digits=2)
Section 4.4 Methods for fitting smooth curves
Subsection 4.4.1: Polynomial Regression
<- DAAG::seedrates
seedrates <- grain ~ rate + I(rate^2)
form2 # Without the wrapper function I(), rate^2 would be interpreted
# as the model formula term rate:rate, and hence as rate.
<- lm(form2, data = seedrates)
quad.lm2 ## Alternative, using gam()
## quad.gam <- mgcv::gam(form2, data = seedrates)
suppressPackageStartupMessages(library(ggplot2))
## Use ggplot2 functions to plot points, line, curve, & 95% CIs
## library(ggplot2)
<- ggplot(DAAG::seedrates, aes(rate,grain))+
gph geom_point(aes(size=3), color='magenta')+xlim(c(25,185))
<- c("Linear"="blue", "Quadratic"="red")
colors <- ggplot_build(gph+geom_smooth(aes(rate,grain,color="Linear"),
ggdat method=lm, formula=y~poly(x,2),fullrange=TRUE))$data[[2]]
<- gph+geom_smooth(aes(color="Linear"), method=lm, formula=y~x, fullrange=TRUE, fill='dodgerblue')
gph1 + geom_line(data = ggdat, aes(x = x, y = y, color="Quadratic"),
gph1 linewidth=0.75)+
geom_ribbon(data=ggdat, aes(x=x,y=y, ymin=ymin, ymax=ymax,
color="Quadratic"), linewidth=0.75,
fill=NA, linetype=2, outline.type='both', show.legend=FALSE) +
scale_color_manual(values=colors, aesthetics = "color")+
theme(legend.position=c(.8,.78)) +
coord_cartesian(expand=FALSE) + xlab("Seeding rate (kg/ha)") +
ylab("Grains per head") + labs(color="Model") +
guides(size='none',
color = guide_legend(override.aes = list(fill="transparent") ) )
## detach("package:ggplot2")
<- lm(grain ~ rate + I(rate^2), data = DAAG::seedrates)
quad.lm2 print(coef(summary(quad.lm2)), digits=2)
cat("\nCorrelation matrix\n")
print(summary(quad.lm2, corr=TRUE)$correlation, digits=2)
*An alternative formulation using orthogonal polynomials
<- lm(grain ~ poly(rate,2), data = seedrates)
seedratesP.lm2 print(coef(summary(seedratesP.lm2)), digits=2)
## Alternative, using mgcv::gam()
<- mgcv::gam(grain ~ poly(rate,2), data = seedrates) seedratesP.gam
<- lm(log(grain) ~ log(rate), data=DAAG::seedrates)
logseed.lm coef(summary(logseed.lm))
## Use ggplot2 functions to plot points, line, curve, & 95% CIs
## library(ggplot2)
<- ggplot(DAAG::seedrates, aes(rate,grain)) +
gph geom_point(size=3, color="magenta")+xlim(c(25,185))
<- c("Loglinear"="gray40", "Quadratic"="red")
colors <- ggplot_build(gph+geom_smooth(method=lm, formula=y~poly(x,2),
ggdat fullrange=TRUE))$data[[2]]
<- ggplot_build(gph+geom_smooth(method=lm,
ggln formula=log(y)~log(x),fullrange=TRUE))$data[[2]]
## Assign to gphA rather than (as in text) plotting at this point
<- gph + geom_line(data = ggdat, aes(x = x, y = y, color="Quadratic"),
gphA linewidth=0.75) +
geom_ribbon(data=ggdat, aes(x=x,y=y, ymin=ymin, ymax=ymax, color="Quadratic"),
linewidth=0.75, fill=NA, linetype=2, outline.type='both',
show.legend=FALSE) +
geom_line(data = ggln, aes(x = x, y = exp(y), color="Loglinear"),
linewidth = 0.75) +
geom_ribbon(data=ggln, aes(x=x,y=exp(y), ymin=exp(ymin), ymax=exp(ymax),
color="Loglinear"), fill=NA, linewidth=0.75, linetype=3,
outline.type='both', show.legend=FALSE)+
scale_color_manual(values=colors, aesthetics = "color")+
coord_cartesian(expand=FALSE) +
xlab("Seeding rate (kg/ha)") + ylab("Grains per head") +
labs(title="A: Loglinear fit vs quadratic fit", color="Model") +
guides(size='none',
color = guide_legend(override.aes = list(fill="transparent") ) ) +
theme(legend.position=c(.8,.78))
<- data.frame(rate=rep(DAAG::seedrates$rate,2), res=c(resid(logseed.lm),
df log(DAAG::seedrates$grain)-log(fitted(quad.lm2))),
Model=rep(c("Loglinear","Quadratic"),rep(nrow(DAAG::seedrates),2)))
## Assign to gphB rather than (as in text) plotting at this point
<- ggplot(df, aes(x=rate, y=res, shape=Model,color=Model))+
gphB geom_point(size=2.5) + scale_color_manual(values=colors) +
xlab("Seeding rate (kg/ha)") + ylab("Residuals on log scale") +
labs(title="B: Residuals") +
guides(size='none',
color = guide_legend(override.aes = list(fill="transparent") ) ) +
theme(legend.position=c(.8,.78))
## Now take advantage of the magic of the 'patchwork' package
library(patchwork)
+gphB
gphA## detach("package:ggplot2")
<- AIC(quad.lm2, logseed.lm)
aic "logseed.lm",2] <- aic["logseed.lm",2] + sum(2*log(seedrates$grain))
aic[round(aic,1)
<-DAAG::seedrates
seedrates<- lm(grain ~ poly(rate,degree=2), data=seedrates)
quad.lm2 <- lm(grain ~ splines::ns(rate,df=2), data=seedrates)
ns.lm2 <- mgcv::gam(grain ~ s(rate, k=3, fx=T), data=seedrates) tps.gam2
<- lapply(list(quad=quad.lm2, nsplines=ns.lm2, tps=tps.gam2), model.matrix)
mflist <- with(mflist, cbind(quad, nsplines, tps))
mftab colnames(mftab) <- c("(Int)", "poly2.1", "poly2.2", "(Int)", "ns2.1", "ns2.2", "(Int)", "s3.1", "s3.2")
library(kableExtra)
= c('', '', '', '\\addlinespace')
linesep kbl(mftab, booktabs=TRUE, format='latex', toprule=FALSE,
format.args=list(justify="right", width=8)) |>
kable_styling(latex_options = c("scale_down",latex_options = "hold_position"), position='center') |>
add_header_above(c('poly(rate,2)' = 3, 'splines::ns(rate,df=2)' = 3, 's(rate, k=3, fx=T)' = 3),
align='c', monospace=rep(T,3))|>
add_header_above(c('lm: grain~' = 3, 'lm: grain~'=3, 'gam: grain~'=3),
align='c', monospace=rep(T,3), line=F)
Alternative fits – what is the best choice?
## Load required packages
suppressPackageStartupMessages(library(splines))
suppressPackageStartupMessages(library(mgcv))
<- gam(kohms~s(juice, bs="tp"), data=fruitohms)
ohms.tp <- gam(kohms~s(juice, bs="cs"), data=fruitohms)
ohms.cs range(fitted(ohms.tp)-fitted(ohms.cs))
summary(ohms.tp)
summary(ohms.tpBIC)
Subsection 4.4.3: The contributions of basis curves to the fit
Subsection 4.4.4: Checks on the fitted model
## Printed output from `gam.check(ohms.tpBIC)`
cat(out, sep="\n")
Subsection 4.3.5: Monotone curves
<- scam::scam(kohms ~ s(juice,bs="mpd"), data=fruitohms)
ohms.scam summary(ohms.scam)
AIC(ohms.scam, ohms.tp)
BIC(ohms.scam, ohms.tp)
Subsection 4.4.6: Different smooths for different levels of a factor
<- MASS::whiteside
whiteside <- gam(Gas ~ Insul+s(Temp, by=Insul), data=whiteside) gas.gam
summary(gas.gam)
Box.test(resid(gas.gam)[whiteside$Insul=='Before'], lag=1)
Box.test(resid(gas.gam)[whiteside$Insul=='After'], lag=1)
Subsection 4.4.8: Multiple spline smoothing terms — dewpoint data
## GAM model -- `dewpoint` data
<- DAAG::dewpoint
dewpoint <- gam(dewpt ~ s(mintemp) + s(maxtemp), data=dewpoint)
ds.gam plot(ds.gam, resid=TRUE, pch=".", se=2, cex=2, fg="gray")
Using residuals as a check for non-additive effects
library(lattice)
## Residuals vs maxtemp, for different mintemp ranges
<- equal.count(dewpoint$mintemp, number=3)
mintempRange <- xyplot(residuals(ds.gam) ~ maxtemp|mintempRange, data=dewpoint,
ds.xy layout=c(3,1), scales=list(tck=0.5), aspect=1, cex=0.65,
par.strip.text=list(cex=0.75), type=c("p","smooth"),
xlab="Maximum temperature", ylab="Residual")
ds.xy
*A smooth surface
## Fit surface
<- gam(dewpt ~ s(mintemp, maxtemp), data=DAAG::dewpoint)
ds.tp vis.gam(ds.tp, plot.type="contour") # gives a contour plot of the
# fitted regression surface
vis.gam(ds.gam, plot.type="contour") # cf, model with 2 smooth terms
Subsection 4.4.9: Atlantic hurricanes that made landfall in the US
<- DAAG::hurricNamed
hurricNamed <- gam(car::yjPower(deaths, lambda=-0.2) ~
hurricS.gam s(log(BaseDam2014)) + s(LF.PressureMB),
data=hurricNamed, method="ML")
anova(hurricS.gam)
plot(hurricS.gam, resid=TRUE, pch=16, cex=0.5, select=1, fg="gray")
mtext(side=3, line=1, "A: Term in log(BaseDam2014)", cex=1.0, adj=0, at=-3.75)
plot(hurricS.gam, resid=TRUE, pch=16, cex=0.5, select=2, fg="gray")
mtext(side=3, line=1, "B: Term in LF.PressureMB", cex=1.0, adj=0, at=878)
qqnorm(resid(hurricS.gam), main="", fg="gray")
mtext(side=3, line=1, "C: Q-Q plot of residuals", cex=1.0, adj=0, at=-4.25)
An explanatory variable with an overly long-tailed distribution
<- gam(log(deaths+1) ~ s(log(BaseDam2014)), data=hurricNamed)
hurricSlog1.gam <- gam(log(deaths+1) ~ s(BaseDam2014), data=hurricNamed) hurricSlog2.gam
plot(hurricSlog1.gam, resid=TRUE, pch=16, cex=0.5, adj=0, fg="gray")
mtext(side=3, "A: Use log(BaseDam2014)", cex=1.4, adj=0, line=1, at=-3.15)
plot(hurricSlog2.gam, resid=TRUE, pch=16, cex=0.5, fg="gray")
mtext(side=3, "B: Use BaseDam2014", cex=1.4, adj=0, line=1, at=-28500)
Subsection 4.4.10: Other smoothing methods
Section 4.5 Quantile regression
## If necessary, install the 'WDI' package & download data
if(!file.exists("wdi.RData")){
if(!is.element("WDI", installed.packages()[,1]) )install.packages("WDI")
<- c('SP.DYN.TFRT.IN','SP.DYN.LE00.IN', 'SP.POP.TOTL')
inds <- c("FertilityRate", "LifeExpectancy", "population")
indnams <- WDI::WDI(country="all", indicator=inds, start=2020, end=2020,
wdi2020 extra=TRUE)
<- na.omit(droplevels(subset(wdi2020, !region %in% "Aggregates")))
wdi2020 <- setNames(wdi2020[order(wdi2020[, inds[1]]),inds], indnams)
wdi save(wdi, file="wdi.RData")
}
2020 World Bank data on fertility and life expectancy
load("wdi.RData") # Needs `wdi.RData` in working directory; see footnote
library(qgam)
"ppop"] <- with(wdi, population/sum(population))
wdi[, "logFert"] <- log(wdi[,"FertilityRate"])
wdi[,<- LifeExpectancy ~ s(logFert)
form ## Panel A model
<- qgam(form, data=wdi, qu=.5)
fit.qgam ## Panel B: Multiple (10%, 90% quantiles; unweighted, then weighted
<- mqgam(form, data=wdi, qu=c(.1,.9))
fit19.mqgam <- mqgam(form, data=wdi, qu=c(.1,.9),
wtd19.mqgam argGam=list(weights=wdi[["ppop"]]))
<- cbind(LifeExpectancy=wdi[, "LifeExpectancy"], logFert=wdi[,"logFert"],
hat50 as.data.frame(predict(fit.qgam, se=T)))
<- within(hat50, {lo <- fit-2*se.fit; hi <- fit+2*se.fit})
hat50 <- as.data.frame(matrix(nrow=nrow(wdi), ncol=4))
hat19 for(i in 1:2){hat19[[i]] <- qdo(fit19.mqgam, c(.1,.9)[i], predict)
+2]] <- qdo(wtd19.mqgam, c(.1,.9)[i], predict) }
hat19[[i## NB, can replace `predict` by `plot`, or `summary`
colnames(hat19) <- c(paste0(rep(c('q','qwt'),c(2,2)), rep(c('10','90'),2)))
<- cbind(hat19, logFert=wdi[,"logFert"]) hat19
## Panel A: Fit with SE limits, 50% quantile
<- xyplot(lo+fit+hi~logFert, data=hat50, lty=c(2,1,2),lwd=1.5,type='l') +
gphA ::as.layer(xyplot(LifeExpectancy~logFert,
latticeExtradata=hat50, pch='.', cex=2))
## Panel B: Multiple quantiles; unweighted and weighted fits
<- xyplot(q10+q90+qwt10+qwt90 ~ logFert, type="l",
gph19 data=hat19, lty=rep(1:2,c(2,2)),lwd=1.5)
<- xyplot(LifeExpectancy ~ logFert, data=wdi) + as.layer(gph19)
gphB update(c("A: 50% curve, 2 SE limits"=gphA, "B: 0.1, 0.9 quantiles"=gphB,
x.same=T, y.same=T), between=list(x=0.5),
xlab="Fertility Rate", ylab="Life Expectancy",
scales=list(x=list(at=log(2^((0:5)/2)), labels=round(2^((0:5)/2),1)),
alternating=F),
par.settings=DAAG::DAAGtheme(color=F, col='gray50', cex=2, pch='.'))
## Plots for the individual quantiles can be obtained thus:
## ## Panel A
plot(fit.qgam, shift=mean(predict(fit.qgam)))
## Panel B, 10% quantile
<- qdo(fit19.mqgam, qu=0.1)
fitm10 plot(fitm10, resid=T, shift=mean(predict(fitm10)),
ylim=range(wdi$LifeExpectancy), cex=2)
<- qdo(wtd19.mqgam, qu=0.1)
wfitm10 plot(wfitm10, resid=T, shift=mean(predict(wfitm10)),
ylim=range(wdi$LifeExpectancy), cex=2)
Section 4.6: Further reading and remarks
Exercises (4.7)
4.2
<- lm(depression~weight, data=DAAG::roller)
roller.lm <- lm(depression~weight+I(weight^2), data=DAAG::roller) roller.lm2
4.4
<- DAAG::toycars
toycars ::xyplot(distance ~ angle, groups=factor(car), type=c('p','r'),
latticedata=toycars, auto.key=list(columns=3))
4.4a
<- lm(distance ~ 0+factor(car)+angle, data=toycars)
parLines.lm <- lm(distance ~ factor(car)/angle, data=toycars) sepLines.lm
4.4b
<- lm(distance ~ factor(car)/angle+poly(angle,3)[,2:3], data=toycars) sepPol3.lm
4.4c
sapply(list(parLines.lm, sepLines.lm, sepPol3.lm), AICcmodavg::AICc)
4.4e
setNames(sapply(list(parLines.lm, sepLines.lm, sepPol3.lm),
function(x)summary(x)$adj.r.squared), c("parLines","sepLines","sepPol3"))
4,7
<- lm(grain ~ rate + I(rate^2), data=seedrates)
seedrates.lm <- lm(grain ~ poly(rate,2), data=seedrates) seedrates.pol
4.10a
<- gam(thickness ~ s(distance), data=DAAG::geophones) geo.gam
4.11
plot(DAAG::geophones$distance, acf(resid(geo.gam), lag.max=55)$acf)
Box.test(resid(geo.gam), lag=10)
Box.test(resid(geo.gam), lag=20)
Box.test(resid(geo.gam), lag=20, type="Ljung")
4.15
library(mgcv)
<- data.frame(x=1:200, y=arima.sim(list(ar=0.75), n=200))
xy <- gam(y ~ s(x), data=xy)
df.gam plot(df.gam, residuals=TRUE)
4.16
library(mgcViz)
<- gam(kohms ~ s(juice, bs="tp"), data=fruitohms,
ohms.tpBIC gamma=log(nrow(fruitohms))/2, method="REML")
<- mgcViz::getViz(ohms.tpBIC) # Convert to a `gamViz` object
ohms.gamViz <- plot(sm(ohms.gamViz, 1)) # Graphics object for term 1 (of 1)
g1 + l_fitLine(colour = "red") + l_rug(mapping = aes(x=x, y=y), alpha = 0.4) +
g1 l_ciLine(mul = 2, colour = "blue", linetype = 2) + # Multiply SE by `mul`
l_points(shape = 19, size = 1, alpha = 0.5)
4.16a
plot(sm(ohms.gamViz, 1), nsim = 20) + l_ciLine() + l_fitLine() + l_simLine()
4.16b
gam(Gas ~ Insul+s(Temp, by=Insul), data=whiteside) |>
getViz() -> gas.gamViz
plot(sm(gas.gamViz,1), nsim = 20) + l_ciLine() + l_fitLine() + l_simLine()
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
<- knitr::knit_code$get()
code <- paste0("\n## ", names(code),"\n", sapply(code, paste, collapse='\n'))
txt writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch4.R")
}