2 Generalizing from models
Packages required (plus any dependencies)
DAAG MASS qra investr HistData BHH2 xtable BayesFactor boot zoo MCMCpack
Additionally, knitr and Hmisc are required in order to process the Rmd source file.
Section 2.1 Model assumptions
Subsection 2.1.1: Inferences are never assumption free
Subsection 2.1.2: Has account been taken of all relevant effects?
## Tabulate by Admit and Gender
byGender <- 100*prop.table(margin.table(UCBAdmissions, margin=1:2), margin=2)
round(byGender, 1)
## Admission rates, by department
pcAdmit <- 100*prop.table(UCBAdmissions, margin=2:3)["Admitted", , ]
round(pcAdmit, 1)
applied <- margin.table(UCBAdmissions, margin=2:3)
pcAdmit <- 100*prop.table(UCBAdmissions, margin=2:3)["Admitted", , ]
byGender <- 100*prop.table(margin.table(UCBAdmissions, margin=1:2), margin=2)
dimnam <- dimnames(UCBAdmissions)
mfStats <- data.frame(Admit=c(pcAdmit[1,], pcAdmit[2,]),
                      Applicants=c(applied[1,], applied[2,]),
                      mf=factor(rep(dimnam[['Gender']], c(6,6)),
                                levels=dimnam[['Gender']]),
                      Department=rep(dimnam[["Dept"]], 2))
xlim <- c(0, max(mfStats$Admit)*1.025)
ylim <- c(0, max(mfStats$Applicants)*1.075)
plot(Applicants~Admit, data=mfStats, type="h", lwd=2, xlim=xlim, ylim=ylim,
     fg="gray", cex.lab=1.2, col=rep(c("blue","red"), rep(6,2)),
     xlab="UCB Admission rates (%), 1973", ylab="Number of applicants")
pcA <- rbind(pcAdmit[1,], apply(pcAdmit, 2, mean)+2, pcAdmit[2,], rep(NA,6))
pcA[2,3] <- pcA[2,3]+1
appA <- rbind(applied[1,], apply(applied, 2, mean)+80,
              applied[2,], rep(NA,6))
deptNam <- dimnam[[3]]
for(j in 1:ncol(appA)) lines(pcA[,j], appA[,j], col="gray", lwd=0.8)
points(pcA[2,],appA[2,], pch=16, cex=1.1, col="white")
text(pcA[2,],appA[2,],deptNam, cex=0.85)
##
par(xpd=TRUE)
text(byGender[1,1:2], rep(par()$usr[4],2)+0.5*strheight("^"),
labels=c("^","^"), col=c("blue","red"),cex=1.2,srt=180)
text(byGender[1,], par()$usr[4]+1.4*strheight("A"),
labels=paste(round(byGender[1,],1)),cex=0.85)
text(byGender[1,1:2]+c(-3.5,3.5), rep(par()$usr[4],2)+2.65*strheight("A"),
labels=c("All males","All females"), pos=c(4,2), cex=1.2)
par(xpd=FALSE)
abline(h=200*(0:4),col="lightgray",lty="dotted")
abline(v=20*(0:4),col="lightgray",lty="dotted")
legend("topleft", col=c('blue','red'),lty=c(1,1), lwd=0.75, cex=0.9,
y.intersp=0.65, legend=c("Males","Females"),bty="n")
## Calculate totals, by department, of males & females applying
margin.table(UCBAdmissions, margin=2:3)
Subsection 2.1.3: The limitations of models
Subsection 2.1.4: Use the methodology that best suits the task in hand?
Section 2.2: t-statistics, binomial proportions, and correlations
Subsection 2.2.1: One- and two-sample t-tests
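No code appears under this heading in the listing. A minimal illustration, using the paired elastic band data; the choice of DAAG::pair65 for this purpose is an assumption:
## One-sample t-test on the paired differences (assumed illustration)
with(DAAG::pair65, t.test(heated - ambient))
## Two-sample (unpaired) t-test on the same columns, for comparison
with(DAAG::pair65, t.test(heated, ambient, var.equal=TRUE))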
Subsection 2.2.2: A two-sample comparison
stats2 <- sapply(DAAG::two65,
                 function(x) c(av=mean(x), sd=sd(x), n=length(x)))
pooledsd <- sqrt(sum(stats2['n',]*stats2['sd',]^2)/sum(stats2['n',]-1))
stats2 <- setNames(c(as.vector(stats2), pooledsd),
                   c('av1','sd1','n1','av2','sd2','n2','pooledsd'))
print(stats2, digits=4)
with(DAAG::two65, t.test(heated, ambient, var.equal=TRUE))
When is pairing helpful?
<- paste("Second versus first member, for each pair. The first",
titl "\npanel is for the elastic band data. The second (from",
"\nDarwin) is for plants of the species Reseda lutea")
<- par(pty="s")
oldpar on.exit(par(oldpar))
::onesamp(dset = DAAG::pair65, x = "ambient", y = "heated",
DAAGxlab = "Amount of stretch (ambient)",
ylab = "Amount of stretch (heated)", fg='gray')
## Data set mignonette holds the Darwin (1877) data on Reseda lutea.
## Data were in 5 pots, holding 5,5,5,5,4 pairs of plants respectively.
::onesamp(dset = DAAG::mignonette, x = "self", y = "cross",
DAAGxlab = "Height of self-fertilised plant", ylab =
"Height of cross-fertilised plant", dubious = 0, cex=0.7, fg='gray')
Subsection 2.2.3: The normal approximation to the binomial
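No code appears under this heading in the listing. A small numerical check of the approximation; the numbers used are illustrative assumptions:
## P(X >= 32) for X ~ Binomial(n=50, p=0.5), exactly and via the normal approximation
n <- 50; p <- 0.5
pbinom(31, size=n, prob=p, lower.tail=FALSE)                  # exact tail probability
pnorm(31.5, mean=n*p, sd=sqrt(n*p*(1-p)), lower.tail=FALSE)   # normal, with continuity correction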
Subsection 2.2.4: The Pearson or product–moment correlation
## Pearson correlation between `body` and `brain`: Animals
Animals <- MASS::Animals
rho <- with(Animals, cor(body, brain))
## Pearson correlation, after log transformation
rhoLogged <- with(log(Animals), cor(body, brain))
## Spearman rank correlation
rhoSpearman <- with(Animals, cor(body, brain, method="spearman"))
c(Pearson=round(rho,2), " Pearson:log values"=round(rhoLogged,2),
  Spearman=round(rhoSpearman,2))
Section 2.3 Extra-binomial and extra-Poisson variation
maleDF <- data.frame(number=0:12, freq=unname(qra::malesINfirst12[["freq"]]))
N <- sum(maleDF$freq)
pihat <- with(maleDF, weighted.mean(number, freq))/12
probBin <- dbinom(0:12, size=12, prob=pihat)
rbind(Frequency=setNames(maleDF$freq, nm=0:12),
      binomialFit=setNames(probBin*N, nm=0:12),
      rawResiduals=maleDF$freq-probBin*N,
      SDbinomial=sqrt(probBin*(1-probBin)*N)) |>
  formatC(digits=2, format="fg") |> print(digits=2, quote=F, right=T)
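The residual plots below use binomial and beta-binomial fits doBI and doBB that are not defined anywhere in this listing. A minimal sketch of one way they might be fitted, using the gamlss package (the package choice, model formulas, and use of the frequencies as prior weights are all assumptions, not taken from the text):
library(gamlss)
## Assumed fits: binomial (BI) and beta-binomial (BB) models for the
## numbers of males in sibships of size 12, with freq as frequency weights
doBI <- gamlss(cbind(number, 12-number) ~ 1, weights=freq, family=BI,
               data=subset(maleDF, freq>0))
doBB <- gamlss(cbind(number, 12-number) ~ 1, weights=freq, family=BB,
               data=subset(maleDF, freq>0))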
set.seed(29)
rqres.plot(doBI, plot.type='all', type="QQ", main=""); box(col='white')
mtext(side=3, line=0.5, "A: Binomial model, Q-Q", adj=0, cex=1.25)
rqres.plot(doBI, plot.type='all', type="wp", main=""); box(col='white')
## Plots C, D, E, F: Set object name; set type="wp" (C, E, F) or type="QQ" (D)
mtext(side=3, line=0.5, "B: Binomial, worm plot 1", adj=-0.05, cex=1.25)
rqres.plot(doBI, plot.type='all', type="wp", main=""); box(col='white')
mtext(side=3, line=0.5, "C: Binomial, worm plot 2", adj=-0.05, cex=1.25)
rqres.plot(doBB, plot.type='all', type="QQ", main="", ylab=''); box(col='white')
mtext(side=3, line=0.5, "D: BB model, Q-Q", adj=0, cex=1.25)
rqres.plot(doBB, plot.type='all', type="wp", main="", ylab=''); box(col='white')
mtext(side=3, line=0.5, "E: BB, worm plot 1", adj=0, cex=1.25)
rqres.plot(doBB, plot.type='all', type="wp", main="", ylab=''); box(col='white')
mtext(side=3, line=0.5, "F: BB, worm plot 2", adj=0, cex=1.25)
aicStat <- AIC(doBI, doBB)
rownames(aicStat) <-
  c(doBI="Binomial", doBB="Betabinomial")[rownames(aicStat)]
aicStat$dAIC <- with(aicStat, round(AIC-AIC[1],1))
aicStat
## Numbers of accidents in three months, with Poisson fit
machinists <- data.frame(number=0:8, freq=c(296, 74, 26, 8, 4, 4, 1, 0, 1))
N <- sum(machinists[['freq']])
lambda <- with(machinists, weighted.mean(number, freq))
fitPoisson <- dpois(0:8, lambda)*sum(machinists[['freq']])
rbind(Frequency=with(machinists, setNames(freq, number)),
      poissonFit=fitPoisson) |>
  formatC(digits=2, format="fg") |> print(quote=F, digits=2, right=T)
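As with doBI and doBB above, the Poisson and negative binomial fits doPO and doNBI used below are not defined in this listing. A possible gamlss-based sketch, on the same assumptions as before (formulas and settings are not from the text):
## Assumed fits: Poisson (PO) and negative binomial type I (NBI) models
doPO  <- gamlss(number ~ 1, weights=freq, family=PO,
                data=subset(machinists, freq>0))
doNBI <- gamlss(number ~ 1, weights=freq, family=NBI,
                data=subset(machinists, freq>0))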
set.seed(23)
rqres.plot(doPO, plot.type='all', type="QQ", main=""); box(col='white')
## Repeat, changing the argument, for remaining plots
mtext(side=3, line=0.5, "A: Poisson, Q-Q plot", adj=0, cex=1.25)
rqres.plot(doPO, plot.type='all', type="wp", main="", ylab=''); box(col='white')
mtext(side=3, line=0.5, "B: Poisson, worm plot", adj=0, cex=1.25)
rqres.plot(doNBI, plot.type='all', type="wp", main="", ylab='')
mtext(side=3, line=0.5, "C: NBI, worm plot", adj=0, cex=1.25); box(col='white')
Subsection 2.3.2: *Technical details – extra-binomial or extra-Poisson variation
sigma <- exp(coef(doBB, "sigma"))
cat("Phi =", (1+12*sigma)/(1+sigma))
mu <- exp(coef(doNBI, "mu"))
sigma <- exp(coef(doNBI, "sigma"))
cat("Phi =", (1+sigma*mu))
Section 2.4 Contingency tables
## 'Untreated' rows (no training) from psid3, 'treated' rows from nswdemo
nswpsid3 <- rbind(DAAG::psid3, subset(DAAG::nswdemo, trt==1))
degTAB <- with(nswpsid3, table(trt, nodeg))
# Code 'Yes' if completed high school; 'No' if dropout
dimnames(degTAB) <- list(trt=c("PSID3_males","NSW_male_trainees"),
                         deg=c("Yes","No"))
degTAB
# To agree with hand calculation below, specify correct=FALSE
chisq.test(degTAB, correct=FALSE)
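The "hand calculation" that the comment above refers to is not included in this listing. A sketch of the standard calculation (expected counts under independence, then the Pearson chi-squared sum), added here for reference:
## Expected counts under independence, then sum((observed-expected)^2/expected)
E <- outer(rowSums(degTAB), colSums(degTAB))/sum(degTAB)
sum((degTAB - E)^2/E)   # should reproduce X-squared from chisq.test(degTAB, correct=FALSE)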
An example where a chi-squared test may not be valid
## Engine man data
engineman <- matrix(c(5,3,17,85), 2,2)
chisq.test(engineman)
fisher.test(engineman)
Rare and endangered plant species
## Enter the data thus:
rareplants <- matrix(c(37,190,94,23, 59,23,10,141, 28,15,58,16), ncol=3, byrow=TRUE,
                     dimnames=list(c("CC","CR","RC","RR"), c("D","W","WD")))
(x2 <- chisq.test(rareplants))
Examination of departures from a consistent overall row pattern
## Expected values
x2$expected
options(digits=2)
## Standardized residuals
residuals(x2)
Section 2.5 Issues for Regression with a single explanatory variable
Subsection 2.5.1: Iron slag example — check residuals with care!
<- c("A: Fitted line", "B: Residuals from line", "C: Variance check")
leg <- order(DAAG::ironslag[["magnetic"]])
ord <- DAAG::ironslag[ord,]
ironslag <- lm(chemical~magnetic, data=ironslag)
slagAlpha.lm <- residuals(slagAlpha.lm)
resval <- fitted(slagAlpha.lm)
fitchem <- sqrt(abs(resval))
sqrtabs2 plot(chemical~magnetic, xlab = "Magnetic", ylab = "Chemical",
pch = 1, data=ironslag, fg="gray")
lines(fitchem~ironslag[["magnetic"]])
mtext(side = 3, line = 0.25, leg[1], adj=-0.1, cex=0.925)
scatter.smooth(resval~ironslag[["magnetic"]], lpars=list(col="red"), span=0.8,
xlab = "Magnetic", ylab = "Residual", fg="gray")
mtext(side = 3, line = 0.25, leg[2], adj = -0.1, cex=0.925)
scatter.smooth(sqrtabs2 ~ fitchem, lpars=list(col="red"), span=0.8,
xlab = "Predicted chemical", fg="gray",
ylab = expression(sqrt(abs(residual))))
mtext(side = 3, line = 0.25, leg[3], adj = -0.1, cex=0.8)
## Diagnostics from fit using loess()
<- c("D: Smooth, using loess()",
leg2 "E: Residuals from smooth",
"F: Variance check")
<- loess(chemical~magnetic, data=ironslag, span=0.8)
slag.loess <- slag.loess[["residuals"]]
resval2 <- slag.loess[["fitted"]]
fitchem2 <- sqrt(abs(resval2))
sqrtabs2 plot(chemical~magnetic, xlab = "Magnetic", ylab = "Chemical",
pch = 1, data=ironslag, fg="gray")
lines(fitchem2 ~ ironslag[["magnetic"]], col="red")
mtext(side = 3, line = 0.25, leg2[1], adj=-0.1, cex=0.925)
scatter.smooth(resval2~ironslag[["magnetic"]], span=0.8,
lpars=list(col="red"),
xlab = "Magnetic", ylab = "Residual", fg="gray")
mtext(side = 3, line = 0.25, leg2[2], adj = -0.1, cex=0.925)
scatter.smooth(sqrtabs2 ~ fitchem2, lpars=list(col="red"),
span=0.8, xlab = "Predicted chemical", fg="gray",
ylab = expression(sqrt(abs(residual))))
mtext(side = 3, line = 0.25, leg2[3], adj = -0.1, cex=0.925)
Subsection 2.5.2: The analysis of variance table
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
anova(roller.lm)
Subsection 2.5.3: Outliers, influence, and robust regression
softbacks <- DAAG::softbacks
x <- softbacks[,"volume"]
y <- softbacks[,"weight"]
u <- lm(y ~ x)
yhat <- predict(u)
res <- resid(u)
r <- with(softbacks, cor(x, y))
xlim <- with(softbacks, range(volume))
xlim[2] <- xlim[2]+diff(xlim)*0.08
plot(y ~ x, xlab = "Volume (cc)", xlim=xlim,
data=softbacks, ylab = "Weight (g)", pch = 4,
ylim = range(c(y, yhat)), cex.lab=0.9, fg="gray")
abline(u$coef[1], u$coef[2], lty = 1)
bottomright <- par()$usr[c(2, 3)]
chw <- par()$cxy[1]
chh <- par()$cxy[2]
z <- summary(u)$coef
btxt <- c(paste("a =", format(round(z[1, 1], 1)),
                " SE =", format(round(z[1, 2], 1))),
          paste("b =", format(round(z[2, 1], 2)),
                " SE =", format(round(z[2, 2], 2))))
legend(bottomright[1], bottomright[2],
       legend=btxt, xjust=1, yjust=0, cex=0.8, bty="n")
softbacks.lm <- lm(weight ~ volume, data=DAAG::softbacks)
print(coef(summary(softbacks.lm)), digits=3)
plot(softbacks.lm, fg="gray",
caption = c("A: Residuals vs Fitted", "B: Normal Q-Q",
"C: Scale-Location", "", "D: Resids vs Leverage"))
Subsection 2.5.4: Standard errors and confidence intervals
Confidence intervals and tests for the slope
SEb <- coef(summary(roller.lm))[2, 2]
coef(roller.lm)[2] + qt(c(0.025,.975), 8)*SEb
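As a cross-check (an addition, not in the original listing), confint() should give the same interval for the slope, since it also uses the t distribution with the residual degrees of freedom:
confint(roller.lm)   # 95% intervals for intercept and slope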
SEs and confidence intervals for predicted values
## Code to obtain fitted values and standard errors (SE, then SE.OBS)
fit.with.se <- predict(roller.lm, se.fit=TRUE)
fit.with.se$se.fit # SE
sqrt(fit.with.se[["se.fit"]]^2+fit.with.se$residual.scale^2) # SE.OBS
predict(roller.lm, interval="confidence", level=0.95)
predict(roller.lm, interval="prediction", level=0.95) # CI for a new observation
## Depression vs weight, with 95% pointwise bounds for both
## the fitted line and predicted values
investr::plotFit(roller.lm, interval="both", col.conf="red", fg="gray")
mtext(side=3, line=0.75, "A: Lawn roller data", cex=1.2, adj=-0.25)
## Male child vs father height, Galton's data
galtonMales <- subset(HistData::GaltonFamilies, gender=="male")
galton.lm <- lm(childHeight~father, data=galtonMales)
investr::plotFit(galton.lm, interval="both", col.conf="red", hide=FALSE,
                 col=adjustcolor('black', alpha=0.5), fg="gray")
mtext(side=3,line=0.75, "B: Son vs father heights", cex=1.2, adj=-0.25)
Implications for design
panelci <- function(data, ...)
{
  nrows <- list(...)$nrows
  ncols <- list(...)$ncols
  if(ncols==1) axis(2, lwd=0, lwd.ticks=1)
  if(ncols==1) axis(1, lwd=0, lwd.ticks=1) else
    axis(3, lwd=0, lwd.ticks=1)
  x <- data$stretch; y <- data$distance
  u <- lm(y ~ x)
  upred <- predict(u, interval="confidence")
  ci <- data.frame(fit=upred[,"fit"], lower=upred[,"lwr"], upper=upred[,"upr"])
  ord <- order(x)
  lines(x[ord], ci[["fit"]][ord], lty=1, lwd=2)
  lines(lowess(x[ord], ci[["upper"]][ord]), lty=2, lwd=2, col="grey")
  lines(lowess(x[ord], ci[["lower"]][ord]), lty=2, lwd=2, col="grey")
}
elastic1 <- DAAG::elastic1
elastic2 <- DAAG::elastic2
xy <- rbind(elastic2, elastic1)
nam <- c("Range of stretch 30-65 mm", "Range of stretch 42-54 mm")
trial <- rep(nam, c(dim(elastic2)[1], dim(elastic1)[1]))
xlim <- range(elastic2$stretch)
ylim <- range(elastic2$distance)
xy <- split(xy, trial)
xy <- lapply(1:length(xy), function(i){c(as.list(xy[[i]]),
                                         list(xlim=xlim, ylim=ylim))})
names(xy) <- nam
DAAG::panelplot(xy, panel=panelci, totrows=1, totcols=2,
                par.strip.text=list(cex=.9), oma=c(4,4,2.5,2), fg='gray')
mtext(side = 2, line = 3.35, "Distance moved (cm)", cex=1.1, las=0)
mtext(side=1,line=3,"Amount of stretch (mm)", cex=1.1)
Subsection 2.5.5: There are two regression lines!
## There are two regression lines!
pair65 <- DAAG::pair65
bothregs <- function(x=pair65[, "ambient"], y=pair65[, "heated"],
                     xlab="Stretch (band at ambient)",
                     ylab="Stretch (heated band)", pch=16){
  plot(y ~ x, xlab = xlab, ylab = ylab, pch = pch, fg="gray")
  topleft <- par()$usr[c(1, 4)] + c(0.5, -0.5) * par()$cxy
  text(topleft[1], topleft[2], paste("r =", round(cor(x, y), 2)), adj = 0)
  u1 <- lm(y ~ x)
  abline(u1$coef[1], u1$coef[2])
  u2 <- lm(x ~ y)
  abline(-coef(u2)[1]/coef(u2)[2], 1/coef(u2)[2], lty = 2)
}
bothregs()
mtext(side = 3, line = 0.5, "A", adj = 0)
bothregs(x=trees[, "Girth"], y=trees[, "Height"],
         xlab="Girth (in)", ylab="Height (ft)", pch=16)
mtext(side = 3, line = 0.5, "B", adj = 0)
Subsection 2.5.6: Logarithmic and Power Transformations
## Logarithmic and Power Transformations
::powerplot(expr="sqrt(x)", xlab="")
DAAG::powerplot(expr="x^0.25", xlab="", ylab="")
DAAG::powerplot(expr="log(x)", xlab="", ylab="")
DAAG::powerplot(expr="x^2")
DAAG::powerplot(expr="x^4", ylab="")
DAAG::powerplot(expr="exp(x)", ylab="") DAAG
Subsection 2.5.7: General forms of nonlinear response
Subsection 2.5.8: Size and shape data – allometric growth
## Heart weight versus body weight, for 30 Cape fur seals.
g2.12 <- function()
{
  cfseal <- DAAG::cfseal
  x <- log(cfseal[,"weight"])
  y <- log(cfseal[,"heart"])
  ylim <- log(c(82.5,1100))
  xlim <- log(c(17,180))
  ylab <- "Heart weight (g, log scale)"
  xlab <- "Body weight (kg, log scale)"
  xtik <- c(20,40,80,160)
  ytik <- c(100,200,400,800)
  plot(x, y, xlab = xlab, ylab = ylab, axes = F, xlim = xlim,
       ylim = ylim, pch = 16, cex=0.85, fg="gray", cex.lab=1.1)
  axis(1, at = log(xtik), labels = paste(xtik), lwd=0, lwd.ticks=1)
  axis(2, at = log(ytik), labels = paste(ytik), lwd=0, lwd.ticks=1)
  box(col="gray")
  form1 <- formula(y ~ x)
  u <- lm(form1, data = cfseal)
  abline(u$coef[1], u$coef[2])
  usum <- summary(u)$coef
  options(digits=3)
  print(usum)
  cwh <- par()$cxy
  eqn <- paste("log y =", round(usum[1, 1], 2), " [",
               round(usum[1, 2], 2), "] +", round(usum[2, 1], 3),
               " [", round(usum[2, 2], 3), "] log x")
  mtext(side=3, line=1.15, eqn, adj = 0.4, cex = 0.8)
  mtext(side=3, line=0.25, "(Values in square brackets are SEs)", adj = 0.4, cex = 0.8)
}
g2.12()
The allometric growth equation
options(scipen=4)
cfseal.lm <- lm(log(heart) ~ log(weight), data=DAAG::cfseal)
print(coef(summary(cfseal.lm)), digits=4)
Section 2.6 Empirical assessment of predictive accuracy
Subsection 2.6.1: The training/test approach, and cross-validation
Cross-validation – a tutorial example
houseprices <- DAAG::houseprices
df <- DAAG::CVlm(houseprices, form.lm = formula(sale.price ~ area),
                 m=3, printit=F, plotit=FALSE)
panelfun <- function(x, y, subscripts, groups, ...){
  lattice::panel.superpose(x, y, subscripts, groups, ...)
  lattice::panel.superpose(x, df[["cvpred"]], subscripts, groups, type="b", cex=0.5, ...)
}
gph <- lattice::xyplot(sale.price ~ area, groups=fold, data=df, pch=1:3, panel=panelfun)
parset <- DAAG::DAAGtheme(color=T, lty=1:3, pch=1:3, lwd=2)
keylist <- list(lines=TRUE, columns=3, between.columns=1.5, between=1, cex=0.85)
update(gph, par.settings=parset, auto.key=keylist)
set.seed(29) # Generate results shown
rand <- sample(rep(1:3, length=15))
## sample() randomly permutes the vector of values 1:3
for(i in 1:3) cat(paste0(i,":"), (1:15)[rand == i],"\n")
houseprices <- DAAG::houseprices
row.names(houseprices) <- (1:nrow(houseprices))
DAAG::CVlm(houseprices, form.lm = formula(sale.price ~ area), plotit=FALSE)
## Estimate of sigma^2 from regression output
houseprices <- DAAG::houseprices
houseprices.lm <- lm(sale.price ~ area, houseprices)
summary(houseprices.lm)[["sigma"]]^2
Subsection 2.6.2: Bootstrapping in regression
houseprices <- DAAG::houseprices
houseprices.lm <- lm(sale.price ~ area, houseprices)
print(coef(summary(houseprices.lm)), digits=2)
houseprices.fn <-
  function (houseprices, index,
            statfun=function(obj)coef(obj)[2]){
    house.resample <- houseprices[index, ]
    house.lm <- lm(sale.price ~ area, data=house.resample)
    statfun(house.lm) # slope estimate for resampled data
  }
set.seed(1028) # use to replicate the exact results below
library(boot) # ensure that the boot package is loaded
## requires the data frame houseprices (DAAG)
(houseprices.boot <- boot(houseprices, R=999, statistic=houseprices.fn))
statfun1200 <- function(obj)predict(obj, newdata=data.frame(area=1200))
price1200.boot <- boot(houseprices, R=999, statistic=houseprices.fn,
                       statfun=statfun1200)
boot.ci(price1200.boot, type="perc") # "basic" is an alternative to "perc"
set.seed(1111)
library(boot)
par(las=0)
houseprices2.fn <- function (houseprices, index){
  house.resample <- houseprices[index,]
  house.lm <- lm(sale.price ~ area, data=house.resample)
  houseprices$sale.price - predict(house.lm, houseprices) # resampled prediction errors
}
houseprices <- DAAG::houseprices
n <- nrow(houseprices)
R <- 199 ## Will obtain 199 estimates of prediction error
houseprices.lm <- lm(sale.price ~ area, data=houseprices)
houseprices2.boot <- boot(houseprices, R=R, statistic=houseprices2.fn)
house.fac <- factor(rep(1:n, rep(R, n)))
plot(house.fac, as.vector(houseprices2.boot$t),
ylab="", xlab="House", fg="gray")
mtext(side=2, line=2, "Prediction Errors")
mtext(side = 3, line = 0.5, "A", adj = 0)
boot.se <- apply(houseprices2.boot$t, 2, sd)
model.se <- predict.lm(houseprices.lm, se.fit=TRUE)$se.fit
plot(boot.se/model.se, ylab="", xlab="House", pch=16, fg="gray")
mtext(side=2, line=2.0, "Ratio of SEs\nBootstrap to Model-Based", cex=0.9)
mtext(side = 3, line = 0.5, "B", adj = 0)
abline(1,0)
Section 2.7 One- and two-way comparisons
Subsection 2.7.1: One-way comparisons
tomato <- data.frame(Weight = c(1.5, 1.9, 1.3, 1.5, 2.4, 1.5,   # Water
                                1.5, 1.2, 1.2, 2.1, 2.9, 1.6,   # Nutrient
                                1.9, 1.6, 0.8, 1.15, 0.9, 1.6), # Nutrient+24D
                     trt = factor(rep(c("Water", "Nutrient", "Nutrient+24D"), c(6, 6, 6))))
## Make `Water` the first level of trt. In aov or lm calculations, it is
## then taken as the baseline or reference level.
tomato$trt <- relevel(tomato$trt, ref="Water")
## A: Weights of tomato plants (g)
library(lattice, quietly=TRUE)
gph <- stripplot(trt~Weight, aspect=0.35, scale=list(tck=0.6), data=tomato)
update(gph, scales=list(tck=0.4), cex=0.9, col="black", xlab="",
main=list('A: Weights of tomato plants (g)', y=0, cex=1.1))
## B: Summarize comparison between LSD and Tukey's HSD graphically
tomato.aov <- aov(Weight ~ trt, data=tomato)
DAAG::onewayPlot(obj=tomato.aov)
title(main="B: LSD, compared with Tukey HSD", adj=0.1, outer=T,
      line=-1.0, font.main=1, cex.main=1.25)
BHH2::anovaPlot(tomato.aov)
The analysis of variance table
## Do analysis of variance calculations
anova(tomato.aov)
Subsection 2.7.2: Regression versus qualitative comparisons – issues of power
gph <- DAAG::simulateLinear(alpha=0.6, seed=17, aspect='iso')
update(gph, par.settings=DAAG::DAAGtheme(color=FALSE, alpha=0.4))
Subsection 2.7.3: *Severe multiplicity — the false discovery rate
The false discovery rate (FDR)
coralPval <- DAAG::coralPval
pcrit <- c(0.05, 0.02, 0.01, 0.001)
under <- sapply(pcrit, function(x)sum(coralPval<=x))
expected <- pcrit*length(coralPval)
fdrtab <- data.frame(Threshold=pcrit, Expected=expected,
                     Discoveries=under, FDR=round(expected/under, 4))
print(xtable::xtable(fdrtab), include.rownames=FALSE, hline.after=FALSE)
fdr <- p.adjust(coralPval, method="BH")
fdrcrit <- c(0.05, 0.04, 0.02, 0.01)
under <- sapply(fdrcrit, function(x)sum(fdr<=x))
setNames(under, paste(fdrcrit))
Subsection 2.7.4: Data with a two-way structure, i.e., two factors
par(fig=c(0.525,1,0,1), mgp=c(1.5,0.4,0))
<- c("F10", "NH4Cl", "NH4NO3", "F10 +ANU843",
lev "NH4Cl +ANU843", "NH4NO3 +ANU843")
<- within(DAAG::rice, trt <- factor(trt, levels=lev))
rice with(rice, interaction.plot(fert, variety, ShootDryMass, fg="gray",
legend = FALSE, xlab="Fertiliser", cex.lab=0.95, mex=0.65))
<- par()$usr[2]
xleg <- par()$usr[4] - 0.72 * diff(par()$usr[3:4])
yleg <- legend(xleg, yleg, bty = "n", legend = levels(rice$variety),
leginfo col = 1, lty = 2:1, lwd=1, xjust = 1, cex = 0.8,
y.intersp=0.8)$rect
text(leginfo$left + 0.5 * leginfo$w, leginfo$top, " variety",
adj = 0.5, cex = 0.8)
mtext(side=3, line=0.65, cex=0.9, adj=-0.15, "B")
<- dotplot(trt ~ ShootDryMass, pch=1, cex=0.9, las=2,
gph xlab="Shoot dry mass (g)", data=rice,
panel=function(x,y,...){panel.dotplot(x,y,...)
<- sapply(split(x,y),mean);
av <- unique(y)
ypos lpoints(ypos~av, pch=3, col="gray40", cex=1.25)},
main=list("A", cex=0.88, just="left", x=0.1, y=-0.7, font=1))
<- DAAG::DAAGtheme(fontsize=list(text=9, points=6), color=FALSE)
pars print(update(gph, scales=list(tck=0.5), par.settings=pars, aspect=0.9),
position=c(-0.065,0.0,0.6,1), newpage=FALSE)
Subsection 2.7.5: Presentation issues
Section 2.8 Data with a nested variation structure
Subsection 2.8.1: Degrees of freedom considerations
Subsection 2.8.2: General multi-way analysis of variance designs
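No code accompanies Section 2.8 in this listing. One possible illustration of a nested (block/plot) error structure uses the kiwifruit shading data; the choice of DAAG::kiwishade and of this particular model are assumptions, not taken from the text:
## Assumed illustration: shade treatments applied to plots nested within blocks
kiwishade.aov <- aov(yield ~ shade + Error(block/shade), data=DAAG::kiwishade)
summary(kiwishade.aov)   # shade is assessed against the block:shade stratum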
Section 2.9 Bayesian estimation – further commentary and approaches
Subsection 2.9.1: Bayesian estimation with normal priors and likelihood
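No code accompanies this subsection in the listing. A minimal sketch of the standard normal-prior, normal-likelihood updating formula; all numerical values here are illustrative assumptions:
## Posterior for a normal mean: prior N(mu0, tau0^2), known data SD sigma
mu0 <- 0; tau0 <- 10            # prior mean and SD (illustrative)
ybar <- 5; n <- 9; sigma <- 6   # sample mean, sample size, known SD (illustrative)
w <- (n/sigma^2)/(n/sigma^2 + 1/tau0^2)   # weight given to the data
postMean <- w*ybar + (1-w)*mu0
postSD <- sqrt(1/(n/sigma^2 + 1/tau0^2))
c(postMean=postMean, postSD=postSD)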
Subsection 2.9.2: Further comments on Bayes Factors
A note on the Bayesian Information Criterion
pval <- c(.05,.01,.001); np <- length(pval)
Nval <- c(4,6,10,20,40,80,160); nlen <- length(Nval)
## Difference in BIC statistics, interpreted as Bayes factor
t2BFbic <- function(p,N){t <- qt(p/2, df=N-1, lower.tail=FALSE)
  exp((N*log(1+t^2/(N-1))-log(N))/2)}
bicVal <- outer(pval, Nval, t2BFbic)
## Bayes factor, calculated using BayesFactor::ttest.tstat()
t2BF <- function(p, N){t <- qt(p/2, df=N-1, lower.tail=FALSE)
  BayesFactor::ttest.tstat(t=t, n1=N, simple=TRUE, rscale="medium")}
BFval <- matrix(nrow=np, ncol=nlen)
for(i in 1:np)for(j in 1:nlen) BFval[i,j] <- t2BF(pval[i], Nval[j])
cfVal <- rbind(BFval, bicVal)[c(1,4,2,5,3,6),]
dimnames(cfVal) <- list(
  paste(rep(pval, rep(2,np)), rep(c("- from ttest.tstat", "- from BIC"), np)),
  paste0(c("n=", rep("", nlen-1)), Nval))
round(cfVal, 1)
Subsection 2.9.3: Bayesian regression estimation using the MCMCpack package
suppressPackageStartupMessages(library(MCMCpack))
roller.mcmc <- MCMCregress(depression ~ weight, data=DAAG::roller)
summary(roller.mcmc)
mat <- matrix(c(1:6), byrow=TRUE, ncol=2)
layout(mat, widths=rep(c(2,1.1),3), heights=rep(0.9,8))
# NB: widths & heights are relative
plot(roller.mcmc, auto.layout=FALSE, ask=FALSE, col="gray", fg="gray")
Section 2.10: Recap
Section 2.11: Further reading
Exercises (2.12)
2.2
## UCBAdmissions is in the datasets package
## For each combination of margins 1 and 2, calculate the sum
UCBtotal <- apply(UCBAdmissions, c(1,2), sum)
2.2b
apply(UCBAdmissions, 3, function(x)(x[1,1]*x[2,2])/(x[1,2]*x[2,1]))
2.3
tabA <- array(c(30,30,10,10,15,5,30,10), dim=c(2,2,2))
tabB <- array(c(30,30,20,10,10,5,20,25), dim=c(2,2,2))
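The odds-ratio calculation from Exercise 2.2b can be re-used with these tables; applying it slice by slice is an assumption about what the comparison requires:
## Odds ratios within each 2x2 slice of tabA and tabB
apply(tabA, 3, function(x)(x[1,1]*x[2,2])/(x[1,2]*x[2,1]))
apply(tabB, 3, function(x)(x[1,1]*x[2,2])/(x[1,2]*x[2,1]))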
2.5
z.transform <- function(r) .5*log((1+r)/(1-r))
z.inverse <- function(z) (exp(2*z)-1)/(exp(2*z)+1)
possum.fun <- function(data, indices) {
  chest <- data$chest[indices]
  belly <- data$belly[indices]
  z.transform(cor(belly, chest))}
possum.boot <- boot::boot(DAAG::possum, possum.fun, R=999)
z.inverse(boot.ci(possum.boot, type="perc")$percent[4:5])
# The 4th and 5th elements of the percent list element
# hold the interval endpoints. See ?boot.ci
2.11
with(pressure, MASS::boxcox(pressure ~ I(1/(temperature+273))))
2.14
"funRel" <-
function(x=leafshape$logpet, y=leafshape$loglen, scale=c(1,1)){
## Find principal components rotation; see Subsection 9.1.2
## Here (unlike 9.1.2) the interest is in the final component
<- prcomp(cbind(x,y), scale=scale)
xy.prc <- xy.prc$rotation[,2]/scale
b c(bxy = -b[1]/b[2]) # slope of functional equation line
}## Try the following:
<- DAAG::leafshape
leafshape funRel(scale=c(1,1)) # Take x and y errors as equally important
# Note that all lines pass through (mean(x), mean(y))
2.15
P <- rbind(
c(1 , 0 , 0 , 0 , 0 , 0),
c(.5, 0 , .5, 0 , 0 , 0),
c(0 , .5, 0 , .5, 0 , 0),
c(0 , 0 , .5, 0 , .5, 0),
c(0 , 0 , 0 , .5, 0 , .5),
c(0 , 0 , 0 , 0 , 0 , 1))
dimnames(P) <- list(0:5,0:5)
P
Markov <- function(N=15, initial.value=1, transition=P, stopval=NULL)
{
  X <- numeric(N)
  X[1] <- initial.value + 1 # States 0:(n-1); subscripts 1:n
  n <- nrow(transition)
  for (i in 2:N){
    X[i] <- sample(1:n, size=1, prob=transition[X[i-1], ])
    if(length(stopval)>0)if(X[i] %in% (stopval+1)){X <- X[1:i]; break}}
  X - 1
}
# Set `stopval=c(0,5)` to stop when the player's fortune is $0 or $5
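An illustrative call, added here; the seed and settings are assumptions, not part of the exercise code:
set.seed(41)   # illustrative seed only
Markov(N=50, initial.value=2, transition=P, stopval=c(0,5))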
2.16
Pb <- rbind(
Sun = c(Sun=0.6, Cloud=0.2, Rain=0.2),
Cloud= c(0.2, 0.4, 0.4),
Rain= c(0.4, 0.3, 0.3))
Pb
2.16b
plotmarkov <-
  function(n=1000, width=101, start=0, transition=Pb, npanels=5){
    xc2 <- Markov(n, initial.value=start, transition)
    mav0 <- zoo::rollmean(as.integer(xc2==0), k=width)
    mav1 <- zoo::rollmean(as.integer(xc2==1), k=width)
    npanel <- cut(1:length(mav0), breaks=seq(from=1, to=length(mav0),
                                             length=npanels+1), include.lowest=TRUE)
    df <- data.frame(av0=mav0, av1=mav1, x=1:length(mav0), gp=npanel)
    print(xyplot(av0+av1 ~ x | gp, data=df, layout=c(1,npanels), type="l",
                 par.strip.text=list(cex=0.65), auto.key=list(columns=2),
                 scales=list(x=list(relation="free"))))
  }
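A call such as the following will generate the plot; the parameter values are illustrative, and lattice is assumed to be attached (it is loaded earlier in the chapter code):
plotmarkov(n=10000, width=101, start=0, transition=Pb, npanels=5)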
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
  code <- knitr::knit_code$get()
  txt <- paste0("\n## ", names(code), "\n", sapply(code, paste, collapse='\n'))
  writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch2.R")
}