8 Tree-based Classification and Regression
Packages required (plus any dependencies)
DAAG latticeExtra rpart rpart.plot MASS ggplot2 car randomForest
Additionally, knitr and Hmisc are required in order to process the Rmd source file.
Hmisc::knitrSet(basename="treebased", lang='markdown', fig.path="figs/g", w=7, h=7)
oldopt <- options(digits=4, formatR.arrow=FALSE, width=70, scipen=999)
library(knitr)
## knitr::render_listings()
opts_chunk[['set']](cache.path='cache-', out.width="80%", fig.align="center",
                    fig.show='hold', size="small", ps=10, strip.white = TRUE,
                    comment=NA, width=70, tidy.opts = list(replace.assign=FALSE))
suppressPackageStartupMessages(library(latticeExtra))
Section 8.1: Tree-based methods — uses and basic notions
Subsection 8.1.1: Detecting email spam – an initial look
<- c("crl.tot", "dollar", "bang", "money", "n000", "make")
nam <- sample(1:dim(DAAG::spam7)[1],500)
nr <-DAAG::spam7$yesno[nr]
yesno<- DAAG::spam7[nr,c(nam,"yesno")]
spam7a <- as.formula(paste(paste(nam, collapse='+'), '~ yesno'))
formab <- DAAG::DAAGtheme(color = TRUE, pch=3)
spamtheme ::bwplot(formab, data=spam7a, outer=T, horizontal=F, layout=c(7,1),
latticescales=list(relation='free'), ylab="", par.settings=spamtheme,
between=list(x=0.5),
main=list("A: Raw data values", y=1.0, font=1, cex=1.25))
spam7b <- cbind(log(spam7a[,-7]+0.001), yesno=spam7a[,7])
yval <- c(0.001, 0.001,0.01,0.1,1,10,100,1000,10000)
lattice::bwplot(formab, data=spam7b, outer=T, horizontal=F, layout=c(7,1),
                scales=list(relation='free',
                            y=list(at=log(yval+0.001), labels=yval, rot=90)),
                ylab="", par.settings=spamtheme, between=list(x=0.5),
                main=list(expression("B: Boxplots, using "*log(x+0.001)*" scale"),
                          y=1.0, font=1, cex=1.25))
## Obtain 500-row sample; repeat the first plot (of crl.tot)
spam.sample <- DAAG::spam7[sample(seq(1,4601), 500, replace=FALSE), ]
boxplot(split(spam.sample$crl.tot, spam.sample$yesno))
suppressMessages(library(rpart))
set.seed(31) ## Reproduce tree shown in text
spam.rpart <- rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 +
                    make, method="class", model=TRUE, data=DAAG::spam7)
rpart.plot::rpart.plot(spam.rpart, type=0, under=TRUE, branch.lwd=0.4,
                       nn.lwd=0.4, box.palette="auto", tweak=1.25)
printcp(spam.rpart, digits=3)
Subsection 8.1.2: Choosing the number of splits
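The code below is not in the text; it is a minimal sketch of how a lower `cp` setting allows more candidate splits to be examined, after which `plotcp()` and `printcp()` show the cross-validated error for each tree size. The object name `spam.rpart2` and the value cp=0.001 are illustrative choices only.
spam.rpart2 <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make,
                     method="class", cp=0.001, data=DAAG::spam7)
plotcp(spam.rpart2)            ## Cross-validated relative error versus tree size
printcp(spam.rpart2, digits=3)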
Section 8.2: Splitting criteria, with illustrative examples
tree.df <- data.frame(fac = factor(rep(c('f1','f2'), 3)),
                      x = rep(1:3, rep(2, 3)), Node = 1:6)
u.tree <- rpart(Node ~ fac + x, data = tree.df,
                control = list(minsplit = 2, minbucket = 1, cp = 1e-009))
rpart.plot::rpart.plot(u.tree, type=0, under=TRUE, branch.lwd=0.25,
                       nn.lwd=0.25, box.palette="Grays", tweak=1.6)
Subsection 8.2.1: Within and between sums of squares
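Not from the text: the sketch below uses the toy `tree.df` data (above) to show the total, within-group, and between-group sums of squares for a candidate split; the split point x < 2.5 is chosen purely for illustration.
with(tree.df, {
  tot <- sum((Node - mean(Node))^2)              ## Total sum of squares
  left <- Node[x < 2.5]; right <- Node[x >= 2.5]
  within <- sum((left - mean(left))^2) + sum((right - mean(right))^2)
  c(total=tot, within=within, between=tot-within)
})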
Subsection 8.2.2: Choosing the split – classification trees
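As an informal supplement (not from the text), the sketch below calculates the Gini impurity, rpart's default splitting criterion for classification, at the root of the spam data and in the two nodes formed by a split on `dollar`. The threshold 0.1 is an arbitrary illustrative value, not the split point that rpart chooses.
gini <- function(p) 1 - sum(p^2)   ## Impurity for a vector of class proportions
with(DAAG::spam7, {
  left <- yesno[dollar < 0.1]; right <- yesno[dollar >= 0.1]
  c(root  = gini(prop.table(table(yesno))),
    left  = gini(prop.table(table(left))),
    right = gini(prop.table(table(right))))
})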
Subsection 8.2.3: Tree-based regression versus loess regression smoothing
u.lo <- loess(Mileage~Weight, data = car.test.frame, span = 2/3)
plot(Mileage~Weight, data=car.test.frame, xlab = "Weight",
     ylab = "Miles per gallon", sub = "", fg="gray")
xy <- with(car.test.frame, loess.smooth(Weight, Mileage))
ord <- order(xy$x)
lines(xy$x[ord],xy$y[ord])
## loess fit to Mileage vs Weight: data frame car.test.frame (rpart)
with(rpart::car.test.frame, scatter.smooth(Mileage ~ Weight))
par(fig=c(0, 0.32, 0,1))
set.seed(37)
car.tree <- rpart(Mileage ~ Weight, data = car.test.frame)
rpart.plot::rpart.plot(car.tree, type=0, under=TRUE,
                       box.palette="Grays", tweak=1.05)
par(fig=c(0.3,1, 0,1), new=TRUE)
set.seed(37)
car2.tree <- rpart(Mileage~Weight, data=car.test.frame, control =
                   list(minsplit = 10, minbucket = 5, cp = 0.0001))
rpart.plot::rpart.plot(car2.tree, type=0, under=TRUE,
                       box.palette="auto", tweak=1.05)
## Panel A: Split criteria were left at their defaults
car.tree <- rpart(Mileage ~ Weight, data = car.test.frame)
rpart.plot::rpart.plot(car.tree, type=0, under=TRUE)
## Panel B: Finer-grained splits
car2.tree <- rpart(Mileage ~ Weight, data=car.test.frame, method="anova",
                   control = list(minsplit = 10, minbucket = 5, cp = 0.0001))
## See `?rpart::rpart.control` for details of control options.
dat <- data.frame(Weight=seq(from=min(car.test.frame$Weight),
                             to=max(car.test.frame$Weight)))
pred <- predict(car.tree, newdata=dat)
pred2 <- predict(car2.tree, newdata=dat)
lwr <- dat$Weight[c(1,diff(pred)) != 0]
upr <- dat$Weight[c(diff(pred),1) != 0]
xy2 <- with(car.test.frame, loess.smooth(Weight, Mileage, evaluation=2011))
lwrLO <- xy2$y[c(1,diff(pred)) != 0]
uprLO <- xy2$y[c(diff(pred),1) != 0]
round(rbind(lwr,upr,lwrLO,uprLO,
            pred[c(diff(pred),1)!=0], pred2[c(diff(pred),1)!=0]),1)
Subsection 8.2.4: Predictive accuracy, and the cost-complexity tradeoff
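A sketch, not from the text: fit a tree to a random half of the spam data and compare the resubstitution (training) error with the error on the held-out half. The seed, the 50/50 split, and the object names are illustrative.
set.seed(17)
train <- sample(nrow(DAAG::spam7), nrow(DAAG::spam7) %/% 2)
half.rpart <- rpart(yesno ~ ., data=DAAG::spam7[train, ], method="class")
trainErr <- mean(predict(half.rpart, type="class") != DAAG::spam7$yesno[train])
testErr <- mean(predict(half.rpart, newdata=DAAG::spam7[-train, ],
                        type="class") != DAAG::spam7$yesno[-train])
c(train=trainErr, test=testErr)   ## Training error understates the test error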
Subsection 8.2.5: Cross-validation
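A sketch (not from the text) of 10-fold cross-validation carried out explicitly; rpart performs an equivalent calculation internally to obtain the xerror column of the cptable. The fold assignment and seed are arbitrary.
set.seed(23)
fold <- sample(rep(1:10, length.out=nrow(DAAG::spam7)))
cvErr <- sapply(1:10, function(k){
  fit <- rpart(yesno ~ ., data=DAAG::spam7[fold != k, ], method="class")
  pred <- predict(fit, newdata=DAAG::spam7[fold == k, ], type="class")
  mean(pred != DAAG::spam7$yesno[fold == k])
})
mean(cvErr)   ## Overall cross-validated error rate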
Subsection 8.2.6: The cost-complexity parameter
vignette("longintro", package="rpart")
Subsection 8.2.7: Prediction error versus tree size
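Not in the text: a quick plot of the cross-validated relative error against the number of splits, taken directly from the cptable of `spam.rpart`.
with(as.data.frame(spam.rpart$cptable),
     plot(nsplit, xerror, type="b", xlab="Number of splits",
          ylab="Cross-validated relative error"))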
Section 8.3: The practicalities of tree construction – two examples
Subsection 8.3.1: Data for female heart attack patients
mifem <- DAAG::mifem
summary(mifem) # data frame mifem (DAAG)
set.seed(29) # Make results reproducible
mifem.rpart <- rpart(outcome ~ ., method="class", data = mifem, cp = 0.0025)
## Tabular equivalent of Panel A from `plotcp(mifem.rpart)`
printcp(mifem.rpart, digits=3)
cat(c(". . .", capture.output(printcp(mifem.rpart, digits=3))[-(1:9)]),
sep="\n")
plotcp(mifem.rpart, fg="gray", tcl=-0.25)
mifemb.rpart <- prune(mifem.rpart, cp=0.01) ## Prune tree back to 2 leaves
rpart.plot::rpart.plot(mifemb.rpart, under=TRUE, type=4,
                       box.palette=0, tweak=1.0)
Subsection 8.3.2: The one-standard-deviation rule
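The following sketch (not code from the text) applies the one-standard-error rule to the cptable of `mifem.rpart`, fitted above: find the minimum cross-validated error, add one standard error, and take the smallest tree whose error falls below that threshold. The object names are illustrative.
cptab <- as.data.frame(mifem.rpart$cptable)
minRow <- which.min(cptab$xerror)
cutoff <- cptab$xerror[minRow] + cptab$xstd[minRow]
cptab[min(which(cptab$xerror < cutoff)), ]   ## Smallest tree within one SE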
Subsection 8.3.3: Printed Information on Each Split
print(mifemb.rpart)
set.seed(59)
spam7a.rpart <- rpart(formula = yesno ~ crl.tot + dollar + bang +
                      money + n000 + make, method="class", cp = 0.002,
                      model=TRUE, data = DAAG::spam7)
printcp(spam7a.rpart, digits=3)
cpdf <- signif(as.data.frame(spam7a.rpart$cptable),3)
minRow <- which.min(cpdf[,"xerror"])
upr <- sum(cpdf[minRow, c("xerror","xstd")])
takeRow <- min((1:minRow)[cpdf[1:minRow,"xerror"]<upr])
newNsplit <- cpdf[takeRow, 'nsplit']
cpval <- mean(cpdf[c(takeRow-1,takeRow),"CP"])
spam7b.rpart <- prune(spam7a.rpart, cp=cpval)
rpart.plot::rpart.plot(spam7b.rpart, under=TRUE, box.palette="Grays", tweak=1.65)
How does the one standard error rule affect accuracy of estimates?
requireNamespace('randomForest', quietly=TRUE)
DAAG::compareTreecalcs(data=DAAG::spam7, fun="rpart")
acctree.mat <- matrix(0, nrow=100, ncol=6)
spam7 <- DAAG::spam7
for(i in 1:100)
  acctree.mat[i,] <- DAAG::compareTreecalcs(data=spam7, fun="rpart")
Section 8.4: From one tree to a forest – a more global optimality
suppressPackageStartupMessages(library(randomForest))
spam7.rf <- randomForest(yesno ~ ., data=spam7, importance=TRUE)
spam7.rf
z <- tuneRF(x=spam7[, -7], y=spam7$yesno, plot=FALSE)
zdash <- t(z[,2,drop=F])
colnames(zdash) <- paste0(c("mtry=",rep("",2)), z[,1])
round(zdash,3)
importance(spam7.rf)
Subsection 8.4.1: Prior probabilities
Pima.tr <- MASS::Pima.tr
table(Pima.tr$type)
set.seed(41) # This seed should reproduce the result given here
Pima.rf <- randomForest(type ~ ., data=Pima.tr)
## The above is equivalent to:
## Pima.rf <- randomForest(type ~ ., data=Pima.tr, sampsize=200)
round(Pima.rf$confusion,3)
tab <- prop.table(table(Pima.tr$type))
Pima.rf <- randomForest(type ~ ., data=Pima.tr, sampsize=c(68,68))
Pima.rf <- randomForest(type ~ ., data=Pima.tr, sampsize=c(132,68))
Subsection 8.4.2: A low-dimensional representation of observations
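As a supplement (not code from the text), the sketch below refits a forest to a 500-row sample of the spam data with `proximity=TRUE`, then uses `randomForest::MDSplot()` to obtain a two-dimensional layout of the observations from the proximity matrix. The sample size, seed, and object names are illustrative.
set.seed(61)
samp <- sample(nrow(spam7), 500)
spamSamp.rf <- randomForest(yesno ~ ., data=spam7[samp, ], proximity=TRUE)
MDSplot(spamSamp.rf, spam7$yesno[samp], k=2)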
Subsection 8.4.3: Models with a complex error structure
Section 8.5: Additional notes – one tree, or many trees?
Subsection 8.5.1: Differences between rpart() and randomForest()
Error rates – rpart() versus randomForest()
## Accuracy comparisons
acctree.mat <- matrix(0, nrow=100, ncol=8)
colnames(acctree.mat) <- c("rpSEcvI", "rpcvI", "rpSEtest", "rptest",
"n.SErule", "nre.min.12", "rfOOBI", "rftest")
for(i in 1:100)acctree.mat[i,] <- DAAG::compareTreecalcs(data=spam7, cp=0.0004,
fun=c("rpart", "randomForest"))
acctree.df <- data.frame(acctree.mat)
lims <- range(acctree.mat[, c(4,7,8)], na.rm=TRUE)
cthrublack <- adjustcolor("black", alpha.f=0.75)
# Panel A
plot(rfOOBI ~ rftest, data=acctree.df, xlab="Error rate - subset II", xlim=lims,
ylim=lims, ylab="OOB Error - fit to subset I", col=cthrublack, fg="gray")
abline(0,1)
mtext(side=3, line=0.5, "A", adj=0)
# Panel B
plot(rptest ~ rftest, data=acctree.df, xlab="Error rate - subset II",
ylab="rpart Error rate, subset II", xlim=lims, ylim=lims,
col=cthrublack, fg="gray")
abline(0,1)
mtext(side=3, line=0.5, "B", adj=0)
acctree.mat <- matrix(0, nrow=100, ncol=8)
colnames(acctree.mat) <- c("rpSEcvI", "rpcvI", "rpSEtest", "rptest",
"n.SErule", "nre.min.12", "rfcvI", "rftest")
for(i in 1:100)acctree.mat[i,] <- DAAG::compareTreecalcs(data=spam7,
fun=c("rpart", "randomForest"))
## Panel A: Plot `rfOOBI` against `rftest`
## Panel B: Plot `rptest` against `rftest`
Subsection 8.5.2: Tree-based methods, versus other approaches
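For comparison (not code from the text), a logistic regression fit to the same six explanatory variables gives the following resubstitution error rate; the 0.5 classification cutoff and the object names are arbitrary illustrative choices.
spam.glm <- glm(yesno ~ crl.tot + dollar + bang + money + n000 + make,
                family=binomial, data=DAAG::spam7)
glmClass <- ifelse(predict(spam.glm, type="response") > 0.5, "y", "n")
mean(glmClass != DAAG::spam7$yesno)   ## Resubstitution (training) error rate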
Subsection 8.5.3: Further notes
Section 8.6: Further reading and extensions
Exercises (8.7)
8.5
sapply(MASS::biopsy, function(x)sum(is.na(x))) ## Will omit rows with NAs
biops <- na.omit(MASS::biopsy)[,-1] ## Omit also column 1 (IDs)
## Examine list element names in randomForest object
names(randomForest(class ~ ., data=biops))
8.5a
## Repeated runs, note variation in OOB accuracy.
for(i in 1:10) {
  biops.rf <- randomForest(class ~ ., data=biops)
  OOBerr <- mean(biops.rf$err.rate[,"OOB"])
  print(paste(i, ": ", round(OOBerr, 4), sep=""))
  print(round(biops.rf$confusion,4))
}
8.5b
## Repeated train/test splits: OOB accuracy vs test set accuracy.
for(i in 1:10){
  trRows <- sample(1:dim(biops)[1], size=round(dim(biops)[1]/2))
  biops.rf <- randomForest(class ~ ., data=biops[trRows, ],
                           xtest=biops[-trRows,-10], ytest=biops[-trRows,10])
  oobErr <- mean(biops.rf$err.rate[,"OOB"])
  testErr <- mean(biops.rf$test$err.rate[,"Test"])
  print(round(c(oobErr,testErr),4))
}
8.5c
randomForest(class ~ ., data=biops, xtest=biops[,-10], ytest=biops[,10])
8.7
## Run model on total data
randomForest(as.factor(type) ~ ., data=Pima.tr)
rowsamp <- sample(dim(Pima.tr)[1], replace=TRUE)
randomForest(as.factor(type) ~ ., data=Pima.tr[rowsamp, ])
8.8a
d500 <- ggplot2::diamonds[sample(1:nrow(ggplot2::diamonds), 500),]
unlist(sapply(d500, class)) # Check the class of the 10 columns
car::spm(d500) # If screen space is limited do two plots, thus:
# 1) variables 1 to 5 and 7 (`price`); 2) variables 6 to 10
plot(density(d500[, "price", drop = T])) # Distribution is highly skewed
MASS::boxcox(price~., data=ggplot2::diamonds) # Suggests log transformation
8.8b
diamonds <- ggplot2::diamonds; Y <- diamonds[,"price", drop=T]
library(rpart)
d7.rpart <- rpart(log(Y) ~ ., data=diamonds[,-7], cp=5e-7) # Complex tree
d.rpart <- prune(d7.rpart, cp=0.0025)
printcp(d.rpart) # Relative to `d7.rpart`, simpler and less accurate
nmin <- which.min(d7.rpart$cptable[,'xerror'])
dOpt.rpart <- prune(d7.rpart, cp=d7.rpart$cptable[nmin,'CP'])
print(dOpt.rpart$cptable[nmin])
(xerror12 <- dOpt.rpart$cptable[c(nrow(d.rpart$cptable),nmin), "xerror"])
## Subtract from 1.0 to obtain R-squared statistics
rbind("d.rpart"=d.rpart[['variable.importance']],
      "dOpt.rpart"=dOpt.rpart[['variable.importance']]) |>
  (\(x) 100*apply(x,1,function(x)x/sum(x)))() |> round(1) |> t()
8.9
<- ggplot2::diamonds[,"price", drop=T]
Y <- sample(1:nrow(diamonds), size=5000)
samp5K <- randomForest(x=diamonds[samp5K,-7], y=log(Y[samp5K]),
(diamond5K.rf xtest=diamonds[-samp5K,-7], ytest=log(Y[-samp5K])))
## Omit arguments `xtest` and `ytest` if calculations take too long
sort(importance(diamond5K.rf)[,1], decreasing=T) |>
100*x/sum(x))() |> round(1) |> t() (\(x)
8.9a
(diamond5KU.rf <- randomForest(x=diamonds[samp5K,-7], y=Y[samp5K],
                               xtest=diamonds[-samp5K,-7], ytest=Y[-samp5K]))
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
  code <- knitr::knit_code$get()
  txt <- paste0("\n## ", names(code),"\n", sapply(code, paste, collapse='\n'))
  writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch8.R")
}