5 Generalized linear models and survival analysis
Packages required (with dependencies)
DAAG car mgcv colorspace HistData gamlss dplyr tidyr MASS ggplot2 latticeExtra qgam VGAM survival HistData
Additionally, knitr is required in order to process the Rmd source file.
Section 5.1 Generalized linear models
Subsection 5.1.1: Linking the expected value to the covariate
## Simplified plot showing the logit link function
<- (1:39)/40
p <- log(p/(1 - p))
logitp plot(p, logitp, xlab = "Proportion", ylab = "logit(p)", type = "l", pch = 1)
par(las=0)
<- seq(from=1, to=99, by=1)/100; n<- 150; eps=0.001
p <- log(p/(1 - p))
gitp plot(p, gitp, xlab = "", ylab = "", type = "l", pch = 1,
las=1, xlim=0:1, xaxs="i", fg="gray")
mtext(side = 1, line = 1.75, expression("Proportion "*pi))
mtext(side = 2, line = 1.75,
expression("logit("*pi*") = log(Odds)"))
mtext(side = 3, line = 0.5, "A: Logit link", adj=0, cex=1.0)
<- c(0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999)
pval par(mgp = c(2.5, 0.5, 0))
## axis(1, at=c(0,1), lwd=0, labels=c(0,1), xpd=TRUE)
axis(4, adj=0.075, at = log(pval/(1 - pval)), las=1,
col="gray", labels = paste(pval), lwd=0, lwd.ticks=1)
<- sqrt(p*(1-p)/100)
seP plot(p, seP, xlab = "", ylab = "", type = "l", pch = 1,
las=1, xlim=0:1, xaxs="i", fg="gray")
## axis(1, at=c(0,1), lwd=0, labels=c(0,1), xpd=TRUE)
mtext(side = 1, line = 1.75, expression("Proportion "*pi))
mtext(side = 2, line = 2.25, expression("SD["*p*"], "*n*"=100"))
mtext(side = 3, line = 0.5,
expression("B: SD["*p*"], "*n*"=100"), adj=0, cex=1.0)
<- (p*(1-p)/n)*((p+eps)*(1-p+eps))^-2
seLP plot(p, seLP, xlab = "", ylab = "", type = "l", pch = 1,
las=1, xlim=0:1, xaxs="i", fg="gray")
## axis(1, at=c(0,1), lwd=0, labels=c(0,1), xpd=TRUE)
mtext(side = 1, line = 1.75, expression("Proportion "*pi))
mtext(side = 2, line = 1.75, expression("SD[logit("*p*")], "*n*"=100"))
mtext(side = 3, line = 0.5, "C: SD[logit(p)]", adj=0, cex=1.0)
Subsection 5.1.2: Noise terms need not be normal
Subsection 5.1.3: Variation that is greater than binomial or Poisson
Subsection 5.1.4: Log odds in contingency tables
Subsection 5.1.5: Logistic regression with a continuous explanatory variable
<- aggregate(DAAG::anesthetic[, c("move","nomove")],
anestot by=list(conc=DAAG::anesthetic$conc), FUN=sum)
## The column 'conc', because from the 'by' list, is then a factor.
## The next line recovers the numeric values
$conc <- as.numeric(as.character(anestot[["conc"]]))
anestot$total <- apply(anestot[, c("move","nomove")], 1 , sum)
anestot$prop <- anestot$nomove/anestot$total anestot
par(mgp=c(2.5,.5,0))
<- DAAG::anesthetic
anesthetic <- table(anesthetic$nomove, anesthetic$conc)
z <- apply(z, 2, sum)
tot <- z[2, ]/(tot)
prop <- sum(z[2, ])/sum(tot)
oprop <- as.numeric(dimnames(z)[[2]])
conc par(las=0)
plot(conc, prop, xlab = "Concentration", ylab = "Proportion",
xlim=c(0.5, 2.5), ylim = c(0, 1), pch = 16, axes=F)
axis(1, cex=0.9, lwd=0, lwd.ticks=1)
axis(2, at=c(0, 0.5, 1.0), cex=0.9, lwd=0, lwd.ticks=1)
axis(2, at=c(0.25, 0.75), cex=0.9, lwd=0, lwd.ticks=1)
box(col="gray")
<- par()$cxy[2]
chh <- par()$cxy[1]
chw text(conc - 0.3 * chw, prop-sign(prop-0.5)*chh/4, paste(tot),
adj = 1, cex=0.65)
abline(h = oprop, lty = 2)
## Fit model directly to the 0/1 data in nomove
<- glm(nomove ~ conc, family=binomial(link="logit"),
anes.glm data=DAAG::anesthetic)
## Fit model to the proportions; supply total numbers as weights
<- glm(prop ~ conc, family=binomial(link="logit"),
anes1.logit weights=total, data=anestot)
::sumry(anes.glm, digits=2) DAAG
A note on model output
## Tp get coefficients, SEs, and associated statistics, specify:
print(coef(summary(anes.glm)), digits=2)
## Get full default output
summary(anes.glm, digits=2)
Section 5.2 Logistic multiple regression
<- DAAG::frogs frogs
## Presence/absence information: data frame frogs (DAAGS)
suppressMessages(library(ggplot2))
<- ggplot(frogs, aes(easting, northing)) +
p geom_point(size=3, alpha=0.25) + coord_fixed() +
xlab("Meters east of reference point")+ylab("Meters north") +
theme(axis.title=element_text(size=11), axis.text=element_text(size=8))
+ geom_point(data=subset(frogs, pres.abs==1),
p aes(easting, northing), alpha=1, shape=3, col="white", size=1.5)
<- within(frogs, {maxSubmin <- meanmax-meanmin
frogs <- meanmax+meanmin}) maxAddmin
Subsection 5.2.1: Choose explanatory terms, and fit model
## Find power transformations
<- c('distance','NoOfPools','NoOfSites','avrain','maxAddmin','maxSubmin')
useCols <- car::powerTransform(frogs[,useCols], family="yjPower")
tfrogs ## Create, for later use, a matrix with variables transformed as suggested
<- car::yjPower(frogs[,useCols], coef(tfrogs, round=TRUE))
transY summary(tfrogs, digits=2)
<- glm(formula = pres.abs ~ log(distance) + log(NoOfPools)+
frogs0.glm sqrt(NoOfSites) + avrain + maxAddmin + maxSubmin,
family = binomial, data = frogs)
::sumry(frogs0.glm, digits=1) DAAG
## Check effect of omitting sqrt(NoOfSites) and avrain from the model
## ~ . takes the existing formula. Precede terms to be
## omitted by '-'. (For additions, precede with '+')
<- update(frogs0.glm, ~ . -sqrt(NoOfSites)-avrain)
frogs.glm <- update(frogs.glm, ~ . -maxAddmin+altitude)
frogsAlt.glm AIC(frogs0.glm, frogs.glm,frogsAlt.glm)
rbind(
'frogs0.glm'=coef(frogs0.glm)[c('log(distance)','log(NoOfPools)','maxAddmin','maxSubmin')],
'frogs.glm'=coef(frogs.glm)[c('log(distance)','log(NoOfPools)','maxAddmin','maxSubmin')]
)coef(frogsAlt.glm)[c('log(distance)','log(NoOfPools)','altitude','maxSubmin')]
coef(summary(frogs.glm))
Subsection 5.2.2: Fitted values
## Use of `predict()` and `fitted()` --- examples
fitted(frogs.glm) # Fitted values' scale of response
predict(frogs.glm, type="response") # Same as fitted(frogs.glm)
predict(frogs.glm, type="link") # Scale of linear predictor
## For approximate SEs, specify
predict(frogs.glm, type="link", se.fit=TRUE)
library(ggplot2)
$Prob. <- fitted(frogs.glm)
frogs$presAbs <- factor(frogs$pres.abs)
frogs<- ggplot(frogs, aes(easting, northing, color=Prob.)) +
p geom_point(size=2, alpha=0.5) + coord_fixed() +
xlab("Meters east of reference point")+ylab("Meters north") +
theme(axis.title=element_text(size=9), axis.text=element_text(size=6))
<- p+scale_color_gradientn(colours=colorspace::heat_hcl(10,h=c(0,-100),
p2 l=c(75,40), c=c(40,80), power=1)) +
guides(fill=guide_legend(title=NULL))
+ geom_point(data=subset(frogs, presAbs==1),
p2 aes(easting, northing), alpha=1, shape=3, col="white", size=1)
Subsection 5.2.3: Plots that show the contributions of explanatory variables
<- par(mgp=c(2.1,.4,0), mfrow=c(1,3))
opar <- HistData::Cholera
Cholera <- glm(cholera_deaths ~ offset(log(popn)) + water +
fitP2.glm log(elevation+3) + poly(poor_rate,2) +I(elevation==350),
data=Cholera, family=quasipoisson)
"water"]] <- factor(Cholera[["water"]], labels=c("Battersea",
Cholera[["NewRiver","Kew"))
termplot(fitP2.glm, partial=T, se=TRUE, pch =1,
ylabs=rep("Partial residual",3), terms='water', fg="gray")
axis(1, at=2, labels="NewRiver", lwd=0, line=0.75)
termplot(fitP2.glm, partial=T, se=TRUE, pch =1,
ylabs=rep("Partial residual",3), terms='log(elevation + 3)', fg="gray")
termplot(fitP2.glm, partial=T, se=TRUE, pch =1,
ylabs=rep("Partial residual",3), terms='poly(poor_rate, 2)', fg="gray")
par(opar)
Subsection 5.2.4: Cross-validation estimates of predictive accuracy
::CVbinary(frogs.glm) DAAG
set.seed(19)
<- frogs0.acc <- numeric(6)
frogs.acc for (j in 1:6){
<- sample(1:10, 212, replace=TRUE)
randsam ## Sample 212 values (one per pbservation) from 1:10
<- DAAG::CVbinary(frogs.glm, rand=randsam,
frogs.acc[j] print.details=FALSE)$acc.cv
<- DAAG::CVbinary(frogs0.glm, rand=randsam,
frogs0.acc[j] print.details=FALSE)$acc.cv
}print(rbind("frogs (all variables)" = frogs.acc,
"frogs0 (selected variables)" = frogs0.acc), digits=3)
Subsection 5.2.5: Cholera deaths in London — 1849 to 1855
By air, or by water — the 1849 epidemic
<- par(mgp=c(2.1,.4,0), mfrow=c(1,3))
opar <- HistData::Cholera
Cholera <- glm(cholera_deaths ~ offset(log(popn)) + water +
fitP2.glm log(elevation+3) + poly(poor_rate,2) +I(elevation==350),
data=Cholera, family=quasipoisson)
"water"]] <- factor(Cholera[["water"]], labels=c("Battersea",
Cholera[["NewRiver","Kew"))
termplot(fitP2.glm, partial=T, se=TRUE, pch =1,
ylabs=rep("Partial residual",3), terms='water', fg="gray")
axis(1, at=2, labels="NewRiver", lwd=0, line=0.75)
termplot(fitP2.glm, partial=T, se=TRUE, pch =1,
ylabs=rep("Partial residual",3), terms='log(elevation + 3)', fg="gray")
termplot(fitP2.glm, partial=T, se=TRUE, pch =1,
ylabs=rep("Partial residual",3), terms='poly(poor_rate, 2)', fg="gray")
par(opar)
The 1854 epidemic — a natural experiment
Section 5.3 Logistic models for categorical data – an example
## Create data frame from multi-way table UCBAdmissions (datasets)
## dimnames(UCBAdmissions) # Check levels of table margins
<- as.data.frame.table(UCBAdmissions["Admitted", , ], responseName="admit")
UCB $reject <- as.data.frame.table(UCBAdmissions["Rejected", , ])$Freq
UCB$Gender <- relevel(UCB$Gender, ref="Male")
UCB## Add further columns total and p (proportion admitted)
$total <- UCB$admit + UCB$reject
UCB$pAdmit <- UCB$admit/UCB$total UCB
<- glm(pAdmit ~ Dept*Gender, family=binomial, data=UCB, weights=total)
UCB.glm ## Abbreviated `anova()` output:
anova(UCB.glm, test="Chisq") |>
capture.output() |> tail(8) |> (\(x)x[-c(2,3)])() |> cat(sep='\n')
round(signif(coef(summary(UCB.glm)),4), 3)
Section 5.4 Models for counts — poisson, quasipoisson, and negative binomial
Subsection 5.4.1: Data on aberrant crypt foci
par(pty="s")
plot(count ~ endtime, data=DAAG::ACF1, pch=16, fg="gray")
<- glm(formula = count ~ endtime + I(endtime^2),
ACF.glm family = poisson(link="identity"), data = DAAG::ACF1)
::sumry(ACF.glm, digits=2) DAAG
unique(round(predict(ACF.glm),2))
sum(resid(ACF.glm, type="pearson")^2)/19
<- glm(formula = count ~ endtime + I(endtime^2),
ACFq.glm family = quasipoisson, data = DAAG::ACF1)
print(coef(summary(ACFq.glm)), digits=2)
sapply(split(residuals(ACFq.glm), DAAG::ACF1$endtime), var)
fligner.test(resid(ACFq.glm) ~ factor(DAAG::ACF1$endtime))
Subsection 5.4.2: Moth habitat example
## Number of moths by habitat: data frame DAAG::moths
<- DAAG::moths
moths <- rbind(Number=table(moths[, 4]),
tab sapply(split(moths[, -4], moths$habitat), apply, 2, sum))
## Number of zero counts, by habitats
with(droplevels(subset(moths, A==0)), table(habitat))
library(lattice)
<- dotplot(habitat ~ A+P, data=DAAG::moths, xlab="Number of moths", outer=TRUE,
gph strip=strip.custom(factor.levels=paste("Number of species",c("A","B"))),
panel=function(x, y, ...){
panel.dotplot(x,y, pch=1, ...)
<- sapply(split(x,y),mean)
av <- factor(names(av), levels=names(av))
ypos lpoints(ypos~av, pch=3, col="gray45", cex=1.25)
},key=list(text=list(c("Individual transects", "Mean")),
points=list(pch=c(1,3), cex=c(1,1.25), col=c("black","gray45")),
columns=2), scales=list(tck=0.5, alternating=1))
<- list(fontsize=list(text=9, points=5))
bw9 update(gph, par.settings=bw9)
<- with(DAAG::moths, sapply(split(A, habitat),
Astats function(x)c(Amean=mean(x),Avar=var(x))))
<- with(DAAG::moths, sapply(split(meters, habitat), mean))
avlength round(rbind(Astats, avlen=avlength),1)
A quasipoisson model
<- glm(A ~ habitat + log(meters), family=quasipoisson,
A.glm data=DAAG::moths)
::sumry(A.glm, digits=1) DAAG
subset(DAAG::moths, habitat=="Bank")
## Analysis with tighter convergence criterion
<- update(A.glm, epsilon=1e-10)
A.glm print(coef(summary(A.glm)), digits=2)
<- predict(A.glm, se=TRUE)$se.fit
AfitSE <- with(DAAG::moths, c(AfitSE[habitat=="Bank"],
cfSE range(AfitSE[habitat!="Bank"])))
round(setNames(cfSE, c("SEbank", "SEotherMIN", "SEotherMAX")), digits=2)
A more satisfactory choice of reference level
<- DAAG::moths
moths $habitat <- relevel(moths$habitat, ref="Lowerside")
moths<- glm(A ~ habitat + log(meters),
Alower.glm family = quasipoisson, data = moths)
print(coef(summary(Alower.glm)), digits=1)
Subsection 5.4.3: Models with negative binomial errors
<- data.frame(sigma1A =(Astats[2,]-Astats[1,])/Astats[1,]^2,
dframe sigma2A =(Astats[2,]-Astats[1,])/Astats[1,]^1,
mu = Astats[1,], habitat=colnames(Astats))
<- list(fontsize=list(text=9, points=5), pch=1:7)
bw9 xyplot(sigma1A+sigma2A ~ mu, groups=habitat, outer=TRUE,
data=subset(dframe,habitat!="Bank"),
par.settings=bw9, auto.key=list(columns=4),
strip=strip.custom(factor.levels=paste("Model",c("NBI","NBII"))),
xlab="Mean number of species A moths",
ylab=expression("Estimate of "*sigma))
library(gamlss, quietly=TRUE)
<- subset(moths, habitat!='Bank')
noBank <- gamlss(A ~ log(meters)+habitat, family=NBI(), data=noBank,
mothsCon.lss trace=F)
<- gamlss(A ~ log(meters)+habitat, family=NBI(),
mothsVary.lss sigma.formula=~habitat, trace=FALSE, data=noBank)
LR.test(mothsCon.lss, mothsVary.lss)
## mothsCon.lss <- gamlss(A ~ log(meters)+habitat,family=NBI(),data=noBank)
## summary(mothsCon.lss, type="qr") ## Main part of output
Diagnostic plots
plot(mothsCon.lss, panel=panel.smooth)
Use of the square root link function
<- gamlss(A ~ habitat + sqrt(meters), trace=FALSE,
Asqrt.lss family = NBI(mu.link='sqrt'), data = moths)
## Asqrt.lss <- gamlss(A ~ habitat + sqrt(meters),
## family = NBI(mu.link='sqrt'), data = moths)
## summary(Asqrt.lss, type="qr") ## Main part of output
<- capture.output(summary(Asqrt.lss, digits=1))[-(3:10)]
out cat(out, sep="\n")
Subsection 5.4.4: Negative binomial versus alternatives — hurricane deaths
Aside – a quasibinomial binomial fit
<- with(DAAG::hurricNamed, order(BaseDam2014))
ordx <- DAAG::hurricNamed[ordx,]
hurric # Ordering a/c values of BaseDam2014 simplifies later code
<- glm(deaths ~ log(BaseDam2014), family=quasipoisson, data=hurric)
hurr.glm plot(hurr.glm, col=adjustcolor('black', alpha=0.4),
cex.caption=0.95, sub.caption=rep("",4), fg="gray")
Negative binomial versus power transformed scale
Fit a negative binomial (NBI) model
library(gamlss)
<- gamlss::gamlss(deaths ~ log(BaseDam2014), family=NBI(),
hurrNB.gamlss data=hurric[-56,])
<- resid(hurrNB.gamlss, what="mu")
mures <- resid(hurrNB.gamlss, what="z-scores") ## equivalent normal quantiles zres
table(sign(mures))
Fit linear model to power transformed response
<- lm(car::yjPower(deaths,-0.2) ~ log(BaseDam2014), data=hurric[-56,])
hurr.lm ## Use the following function to transform from power scale to log scale
<- function(z, lambda)log(lambda*z+1)/lambda
powerTOlog ## Calculate fitted values, and transform to log(deaths+1) scale
<- powerTOlog(predict(hurr.lm), lambda=-0.2)
hatPower <- log(hurric[-56,"deaths"]+1) - hatPower resPower
table(sign(resPower))
Compare NBI and power transform fits with smoothed quantiles
library(qgam, quietly=TRUE)
.8 <- predict(qgam(log(deaths+1) ~ s(log(BaseDam2014)), qu=.648,
hat68data=hurric[-56,]))
.9 <- predict(qgam(log(deaths+1) ~ s(log(BaseDam2014)), qu=.409,
hat40data=hurric[-56,]))
<- log(hurric$BaseDam2014)[-56]
xvar plot(log(deaths+1) ~ log(BaseDam2014), data=hurric, xaxt="n", yaxt="n",
cex=4, pch=".", fg="gray", col=adjustcolor("black",alpha.f=0.65),
xlab="Damage, millions of US$ in 2014", ylab="Deaths")
axis(1, at=log(c(1,10,1000, 100000)),
labels=paste(c(1,10,1000, 100000)), lwd=0, lwd.ticks=1)
axis(2, at=log(c(0,10,100,1000)+1),
labels=paste(c(0,10,100,1000)), lwd=0, lwd.ticks=1)
## Negative binomial regression fitted values
<- fitted(hurrNB.gamlss)
hatNB lines(xvar, log(hatNB+1), col="blue", lty=2)
with(hurric, text(log(BaseDam2014)[56], log(deaths+1)[56], "Audrey", pos=3),
cex=0.72)
## Show fit from power transform model
lines(xvar, hatPower, col="blue", lty=1)
## Show 68.8\% and 40.1\% fits from regression smooths
lines(hat68.8 ~ xvar, lty=2, col='red')
lines(hat40.9 ~ xvar, lty=1, col='red')
legend("topleft", col=rep(c('blue','red'),c(2,2)), lty=rep(2:1,2), cex=0.8,
y.intersp=0.75, legend=c("Negative binomial fit","Power transform fit",
"68.8% quantile", "40.9% quantile"), bty="n")
mtext(side=3, "A: Deaths vs damage", line=0.5, cex=1.15, adj=0)
## Quantile-quantile plot -- negative binomial model
qqnorm(zres, main="", fg="gray", cex=0.5,
col=adjustcolor("black",alpha.f=0.65)); qqline(zres, col=2)
mtext(side=3, "B: Q-Q plot", line=0.5, cex=1.15, adj=0)
## a) Fitted and empirical centiles from hurrNB.gamlss
<- t(centiles.split(hurrNB.gamlss, xvar=log(hurric$BaseDam2014)[-56],
pc cent=c(5,10,25,50,75,90,95), xcut.points=log(c(150, 1500)),
plot=FALSE))
rownames(pc) <- c("up to 150M", "150M to 1500M", "above 1500M")
round(pc,2)
<- gamlss(car::yjPower(deaths, -0.2) ~ log(BaseDam2014), data=hurric) hurrP.gamlss
## Fitted and empirical centiles from hurrP.gamlss
<- t(centiles.split(hurrP.gamlss, xvar=log(hurric$BaseDam2014),
pc cent=c(5,10,25,50,75,90,95),
xcut.points=log(c(150, 1500)), plot=FALSE))
rownames(pc) <- c("up to 150M", "150M to 1500M", "above 1500M")
round(pc,2)
Section 5.5 Fitting smooths
Subsection 5.5.1: Handedness of first-class cricketers in the UK
<- with(DAAG::cricketer, table(left,dead))
tab colnames(tab) <- c('live','dead')
<- cbind(addmargins(tab, margin=2), prop.table(tab, margin=1))
tab tab
library(mgcv)
library(latticeExtra)
::cricketer |> dplyr::count(year, left, name="Freq") -> handYear
DAAGnames(handYear)[2] <- "hand"
<- tidyr::pivot_wider(handYear, names_from='hand', values_from="Freq")
byYear <- gam(cbind(left,right) ~ s(year), data=byYear, family=binomial)
hand.gam <- attr(predict(hand.gam, type='terms'), "constant")
const ## `const` is the mean on the scale of the linear predictor
plot(hand.gam, shift=const, trans=function(x)exp(x)/(1+exp(x)), ylim=c(.05,.4),
xlab="", ylab="Proportion lefhanded", rug=FALSE, fg="gray",
main=list("Proportion lefthanded, with 2SE limits",font=1,cex=1.2))
## Add `const`, then apply inverse link function.
## Plots estimated proportions (i.e., on the scale of the response)
with(byYear, points(year, I(left/(left+right)), cex=0.8, col="gray50"))
<- gam(Freq ~ hand + s(year, by=factor(hand)), data=handYear,
leftrt.gam family=poisson)
<- predict(leftrt.gam, se=T, type='response')
leftrt.pred <- cbind(handYear, as.data.frame(leftrt.pred))
handYear <- DAAG::DAAGtheme(color=T)$superpose.symbol$col[c(2,2,1)]
col2 <- list(space="top", columns=3, lines=list(lty=c(1,2,1), lwd=2, col=col2),
gph.key text=list(c("left",expression(4.4%*%"left"),"right")), cex=1.2)
<- xyplot(leftrt.pred$fit ~ year, groups=hand, ylab=list("Number born", cex=1.2),
gph type="l", xlab="", data=handYear, key=gph.key, col=col2[c(3,1)], lwd=2)
<- xyplot(Freq~year, groups=hand, data=handYear, col=col2[c(3,1)])
gph1 <- xyplot(I(4.4*fit) ~ year, data=subset(handYear, hand=="left"),
gph2 type="l", lty=2, lwd=2, col=col2[2])
update(gph+as.layer(gph1)+as.layer(gph2), par.settings=DAAG::DAAGtheme(color=TRUE),
scales=list(cex=1.2))
Section 5.6 Additional notes on generalized linear models
Subsection 5.6.1: Residuals, and estimating the dispersion
Subsection 5.6.2: Standard errors and \(z\)- or \(t\)-statistics for binomial models
<- factor(LETTERS[1:4])
fac <- c(73, 30, 11, 2)/500
p <- rep(500,4)
n round(signif(coef(summary(glm(p ~ fac, family=binomial, weights=n))), 6), 6)
<- c(0.001,0.002,(1:99)/100,0.998,0.999)
p for(i in 1:3){
<- c("logit", "probit", "cloglog")[i]
link <- make.link(link)$linkfun
fun <- fun(p)
x <- glm(p ~ x, family=binomial(link=link), weights=rep(1000,103))
u if (i==1)
plot(x, hatvalues(u), type="l", ylab="Leverage", xaxt="n", fg='gray',
yaxt="n",
ylim=c(0, 0.0425), yaxs="i", xlab="Fitted proportion") else {
<- predict(u, type="response")
phat lines(log(phat/(1-phat)), hatvalues(u), type="l",
col=c("black","black","gray")[i], lwd=0.75,
lty=c(1,2,1)[i])
}
}=c(0.001,0.002, 0.005, 0.01,0.02,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.98,0.99, 0.995, 0.998, 0.999)
pos<- seq(from=1,to=17, by=2)
sub1 <- seq(from=2,to=16, by=2)
sub3 axis(1, at=log(pos/(1-pos))[sub1], labels=paste(pos)[sub1],
cex.axis=0.7, lwd=0, lwd.ticks=1)
axis(3, at=log(pos/(1-pos))[sub3], labels=paste(pos)[sub3],
cex.axis=0.7, lwd=0, lwd.ticks=1)
axis(2, at=c(0,.01,.02,.03), cex.axis=.7, lwd=0, lwd.ticks=1)
legend("topleft", lty=c(1,2,1),
legend=c("logit link", "probit link", "cloglog link"),
col=c("black","black","gray"), bty="n", cex=0.8)
Section 5.7 Models with an ordered categorical or categorical response
library(VGAM)
<- data.frame(freq=c(99,76,41,55,2,13),
inhaler choice=rep(c("inh1","inh2"), 3),
ease=ordered(rep(c("easy","re-read","unclear"), rep(2,3))))
<- vglm(ease ~ 1, weights=freq, data=inhaler,
inhaler1.vglm cumulative(link="logitlink"), subset=inhaler$choice=="inh1")
<- vglm(ease ~ 1, weights=freq, data=inhaler,
inhaler2.vglm cumulative(link="logitlink"), subset=inhaler$choice=="inh2")
## Inhaler 1
round(coef(summary(inhaler1.vglm)),3)
## Inhaler 2
round(coef(summary(inhaler2.vglm)),3)
<- vglm(ease ~ choice, weights=freq, data=inhaler,
inhaler.vglm cumulative(link="logitlink", parallel=FALSE))
round(coef(summary(inhaler.vglm)),3)
<- vglm(ease ~ choice, weights=freq, data=inhaler,
inhalerP.vglm cumulative(link="logitlink", parallel=TRUE))
round(coef(summary(inhalerP.vglm)),3)
<- predict(inhalerP.vglm, se.fit=TRUE, newdata=inhaler[1:2,])
pred colnames(pred$se.fit) <- paste("SE", colnames(pred$se.fit))
<- with(pred, cbind(fitted.values, se.fit))
fitvals colnames(fitvals) <- gsub('link', '', colnames(fitvals))
round(fitvals, 2)
<- deviance(inhalerP.vglm) - deviance(inhaler.vglm)
d ## Refer to chi-squared distribution with 1 degree of freedom
c(Difference=d, "p-Value"=pchisq(3.416, df=1, lower.tail=FALSE))
Subsection 5.7.2: Loglinear Models
Section 5.8 Survival analysis
suppressMessages(library(survival))
<- data.frame(x0 = c(1, 5, 1, 2, 14, 10, 12, 19)*30,
df x1 = c(46, 58, 85, 67, 17, 85, 18, 42)*30,
fail = c(1, 0, 0, 1, 1, 0, 0, 1))
plot(c(0, 2610), c(0.65, 8.15), type = "n",
xlab = "Days from beginning of study",
ylab = "Subject number", axes = F)
## mtext(side = 1, line = 2.5, "Days from beginning of study", adj = 0.5)
<- dim(df)[1]
m par(las=2)
axis(2, at = (1:m), labels = paste((m:1)), lwd=0, lwd.ticks=1)
par(las=1)
abline(v = 600, lty = 4, col="gray40")
abline(v = 2550)
mtext(side = 3, line = 0.5, at = c(600, 2550),
text = c("\nEnd of recruitment",
"\nEnd of study"), cex = 0.9)
lines(rep((0:8) * 300, rep(3, 9)), rep(c(-0.4, -0.2, NA), 9),
xpd = T)
mtext(side = 1, line = 1.0, at = (0:8) * 300,
text = paste((0:8) * 300), adj = 0.5)
<- par()$cxy[1]
chw <- as.vector(t(cbind(df[, 1], df[, 2] - 0.25 * chw,
xx rep(NA, m))))
<- as.vector(t(cbind(matrix(rep(m:1, 2), ncol = 2),
yy rep(NA, m))))
lines(as.numeric(xx), as.numeric(yy))
points(df[, 1], m:1, pch = 16)
text(df[, 1]-0.25*chw, m:1, paste(df[,1]), pos=1, cex=0.75)
<- as.logical(df$fail)
fail points(df[fail, 2], (m:1)[fail], pch = 15)
points(df[!fail, 2], (m:1)[!fail], pch = 0)
text(df[, 2]+0.25*chw, m:1, paste(df[,2]), pos=1, cex=0.75)
par(xpd=TRUE)
legend(0, 11.5, pch = 16, legend = "Entry", y.intersp=0.15)
legend(1230, 11.5, pch = c(15, 0),
legend = c("Dead", "Censored"), ncol=2, y.intersp=0.15)
Subsection 5.8.1: Analysis of the Aids2 data
str(MASS::Aids2, vec.len=2)
<- subset(MASS::Aids2, T.categ=="blood")
bloodAids $days <- bloodAids$death-bloodAids$diag
bloodAids$dead <- as.integer(bloodAids$status=="D") bloodAids
<- subset(MASS::Aids2, T.categ=="blood")
bloodAids $days <- bloodAids$death-bloodAids$diag
bloodAids$dead <- as.integer(bloodAids$status=="D")
bloodAidsplot(survfit(Surv(days, dead) ~ sex, data=bloodAids),
col=c(2,4), conf.int=TRUE, lty=1, fg="gray",
xlab="Days from diagnosis", ylab="Survival probability")
legend("top", legend=levels(bloodAids$sex), lty=c(1,1),
col=c(2,4), horiz=TRUE, bty="n")
## Pattern of censoring for male homosexuals
<- subset(MASS::Aids2, sex=="M" & T.categ=="hs")
hsaids $days <- hsaids$death-hsaids$diag
hsaids$dead <- as.integer(hsaids$status=="D")
hsaidstable(hsaids$status,hsaids$death==11504)
<- subset(MASS::Aids2, sex=="M" & T.categ=="hs")
hsaids $days <- hsaids$death-hsaids$diag
hsaids$dead <- as.integer(hsaids$status=="D")
hsaids<- survfit(Surv(days, dead) ~ 1, data=hsaids)
hsaids.surv plot(hsaids.surv, col="gray", conf.int=F, tcl=-0.4, fg="gray")
par(new=TRUE)
plot(hsaids.surv,col=1, conf.int=F,mark.time=F, fg="gray",
xlab="Days from diagnosis", ylab="Estimated survival probabality")
<- par()$cxy[1]
chw <- par()$cxy[2]
chh <- hsaids.surv$surv
surv <- c(200,700,1400,1900)
xval <- approx(hsaids.surv$time, surv, xout=xval)$y
hat for(i in 1:2) arrows(xval[i], hat[i], 0, hat[i],
length=0.05, col="gray")
lines(rep(xval[1],2), hat[1:2], col="gray")
## lines(rep(xval[3],2), hat[3:4], col="gray")
## Offset triangle 1
<- par()$cxy[1]
chw lines(xval[c(1,2,1,1)]+650, hat[c(2,2,1,2)]+0.2,col="gray40")
<- c(mean(xval[c(1,1,2)]), mean(hat[c(1,2,2)]))
xy1 arrows(xy1[1], xy1[2], xy1[1]+650, xy1[2]+0.2, col="gray40", length=0.1)
text(xval[1]-0.1*chw+650, hat[1]+0.2,
paste(round(hat[1],2)), col="gray20",cex=0.75, adj=1)
text(xval[1]+650-0.1*chw, hat[2]+0.2,
paste(round(hat[2],2)), col="gray20",cex=0.75, adj=1)
text(mean(xval[1:2])+650, hat[2]+0.2-0.5*chh,
paste(round(diff(xval[1:2]))), col="gray20", cex=0.75)
text(xval[1]+650-0.5*chw, mean(hat[1:2]+0.2), paste(round(hat[1]-hat[2],3)),
srt=90, adj=0.5, col="gray20", cex=0.75)
Subsection 5.8.4: Hazard rates
Subsection 5.8.5: The Cox proportional hazards model
<- coxph(Surv(days, dead) ~ sex, data=bloodAids)
bloodAids.coxph print(summary(bloodAids.coxph), digits=6)
## Add `age` as explanatory variable
<- coxph(Surv(days, dead) ~ sex+age, data=bloodAids) bloodAids.coxph1
<- subset(MASS::Aids2,T.categ=="blood")
bloodAids <- within(bloodAids, {days <- death-diag
bloodAids <- as.integer(status=="D")})
dead <- coxph(Surv(days, dead) ~ sex, data = bloodAids)
bloodAids.coxph plot(cox.zph(bloodAids.coxph), cex=0.75, bty="n")
box(col="gray")
cox.zph(bloodAids.coxph)
<- DAAG::cricketer
cricketer <- coxph(Surv(life, kia) ~ left/poly(year,4),
kia4.coxph data = cricketer, model=T)
<- update(kia4.coxph, . ~ left/poly(year,6),
kia6.coxph data = cricketer, model=T)
# Type `plot(cox.zph(kia6.coxph)` to plot the two graphs
# Perhaps check also `AIC(kia4.coxph, kia6.coxph)`
cox.zph(kia6.coxph)
plot(cox.zph(kia6.coxph), cex=0.75, bty="n")
box(col="gray")
Section 5.9: Transformations for proportions and counts
Section 5.10: Further reading
Exercises (5.11)
5.1
<- rbind(
inhibition conc =c(0.1,0.5, 1,10,20,30,50,70,80,100,150),
no = c(7, 1, 10, 9, 2, 9, 13, 1, 1, 4, 3),
yes = c(0, 0, 3, 4, 0, 6, 7, 0, 0, 1, 7)
)colnames(inhibition) <- rep("", ncol(inhibition))
inhibition
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
<- knitr::knit_code$get()
code <- paste0("\n## ", names(code),"\n", sapply(code, paste, collapse='\n'))
txt writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch5.R")
}