8 Tree-based Classification and Regression
Packages required (plus any dependencies)
DAAG latticeExtra rpart rpart.plot MASS ggplot2 car randomForest
Additionally, knitr and Hmisc are required to process the Rmd source file.
Hmisc::knitrSet(basename="treebased", lang='markdown', fig.path="figs/g", w=7, h=7)
oldopt <- options(digits=4, formatR.arrow=FALSE, width=70, scipen=999)
library(knitr)
## knitr::render_listings()
opts_chunk[['set']](cache.path='cache-', out.width="80%", fig.align="center",
fig.show='hold', size="small", ps=10, strip.white = TRUE,
comment=NA, width=70, tidy.opts = list(replace.assign=FALSE))
suppressPackageStartupMessages(library(latticeExtra))
Section 8.1: Tree-based methods – uses and basic notions
Subsection 8.1.1: Detecting email spam – an initial look
nam <- c("crl.tot", "dollar", "bang", "money", "n000", "make")
nr <- sample(1:dim(DAAG::spam7)[1],500)
yesno<-DAAG::spam7$yesno[nr]
spam7a <- DAAG::spam7[nr,c(nam,"yesno")]
formab <- as.formula(paste(paste(nam, collapse='+'), '~ yesno'))
spamtheme <- DAAG::DAAGtheme(color = TRUE, pch=3)
lattice::bwplot(formab, data=spam7a, outer=T, horizontal=F, layout=c(7,1),
scales=list(relation='free'), ylab="", par.settings=spamtheme,
between=list(x=0.5),
main=list("A: Raw data values", y=1.0, font=1, cex=1.25))
spam7b <- cbind(log(spam7a[,-7]+0.001), yesno=spam7a[,7])
yval <-c(0.001, 0.001,0.01,0.1,1,10,100,1000,10000)
lattice::bwplot(formab, data=spam7b, outer=T, horizontal=F, layout=c(7,1),
scales=list(relation='free',
y=list(at=log(yval+0.001), labels=yval, rot=90)),
ylab="", par.settings=spamtheme, between=list(x=0.5),
main=list(expression("B: Boxplots, using "*log(x+0.001)*" scale"),
          y=1.0, font=1, cex=1.25))
## Obtain 500-row sample; repeat the first plot (of crl.tot)
spam.sample <- DAAG::spam7[sample(seq(1,4601), 500, replace=FALSE), ]
boxplot(split(spam.sample$crl.tot, spam.sample$yesno))
suppressMessages(library(rpart))
set.seed(31) ## Reproduce tree shown in text
spam.rpart <- rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 +
make, method="class", model=TRUE, data=DAAG::spam7)
rpart.plot::rpart.plot(spam.rpart, type=0, under=TRUE, branch.lwd=0.4,
nn.lwd=0.4, box.palette="auto", tweak=1.25)
printcp(spam.rpart, digits=3)
Subsection 8.1.2: Choosing the number of splits
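As an added illustration (a minimal sketch, not from the text), refitting the spam model with a smaller complexity parameter cp allows more splits to be considered; printcp() then shows the cross-validated error at each candidate tree size. The object name spam.rpart2 and the value cp=0.001 are illustrative choices.
## Sketch only: more splits become available with a smaller cp
spam.rpart2 <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make,
                     method="class", cp=0.001, data=DAAG::spam7)
printcp(spam.rpart2, digits=3)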
Section 8.2: Splitting criteria, with illustrative examples
tree.df <- data.frame(fac = factor(rep(c('f1','f2'), 3)),
x = rep(1:3, rep(2, 3)), Node = 1:6)
u.tree <- rpart(Node ~ fac + x, data = tree.df,
control = list(minsplit = 2, minbucket = 1, cp = 1e-009))
rpart.plot::rpart.plot(u.tree, type=0, under=TRUE, branch.lwd=0.25,
nn.lwd=0.25, box.palette="Grays", tweak=1.6)
Subsection 8.2.1: Within and between sums of squares
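As a check on the splitting criterion (a minimal sketch, not from the text), the within- and between-group sums of squares for the candidate split of the toy data on fac can be computed directly; they add to the total sum of squares.
## Sketch only: within- and between-group sums of squares for a split on fac
y <- tree.df$Node
grp <- tree.df$fac                 # candidate split: f1 versus f2
ssTot <- sum((y - mean(y))^2)
ssWithin <- sum(tapply(y, grp, function(z) sum((z - mean(z))^2)))
c(within = ssWithin, between = ssTot - ssWithin, total = ssTot)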
Subsection 8.2.2: Choosing the split – classification trees
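A minimal sketch (not from the text) of the default splitting criterion for classification trees: the Gini impurity of a node with class proportions p is sum(p*(1-p)), and a split is judged by the size-weighted impurity of the two daughter nodes. The cutpoint on dollar used here is an illustrative value only.
## Sketch only: Gini impurity before and after an illustrative split on dollar
gini <- function(y) {p <- prop.table(table(y)); sum(p*(1-p))}
yn <- DAAG::spam7$yesno
left <- DAAG::spam7$dollar < 0.055      # illustrative cutpoint only
n <- length(yn); nL <- sum(left)
c(parent = gini(yn),
  split = (nL*gini(yn[left]) + (n - nL)*gini(yn[!left]))/n)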
Subsection 8.2.3: Tree-based regression versus loess regression smoothing
u.lo <- loess(Mileage~Weight, data = car.test.frame, span = 2/3)
plot(Mileage~Weight, data=car.test.frame, xlab = "Weight",
ylab = "Miles per gallon", sub = "", fg="gray")
xy <- with(car.test.frame, loess.smooth(Weight, Mileage))
ord<-order(xy$x)
lines(xy$x[ord], xy$y[ord])
## loess fit to Mileage vs Weight: data frame car.test.frame (rpart)
with(rpart::car.test.frame, scatter.smooth(Mileage ~ Weight))
par(fig=c(0, 0.32, 0, 1))
set.seed(37)
car.tree <- rpart(Mileage ~ Weight, data = car.test.frame)
rpart.plot::rpart.plot(car.tree, type=0, under=TRUE,
box.palette="Grays", tweak=1.05)
par(fig=c(0.3,1, 0,1), new=TRUE)
set.seed(37)
car2.tree <- rpart(Mileage~Weight, data=car.test.frame, control =
list(minsplit = 10, minbucket = 5, cp = 0.0001))
rpart.plot::rpart.plot(car2.tree, type=0, under=TRUE,
box.palette="auto", tweak=1.05)
## Panel A: Split criteria were left at their defaults
car.tree <- rpart(Mileage ~ Weight, data = car.test.frame)
rpart.plot::rpart.plot(car.tree, type=0, under=TRUE)
## Panel B: Finer grained splits
car2.tree <- rpart(Mileage ~ Weight, data=car.test.frame, method="anova",
control = list(minsplit = 10, minbucket = 5, cp = 0.0001))
## See `?rpart::rpart.control` for details of control options.
dat <- data.frame(Weight=seq(from=min(car.test.frame$Weight),
to=max(car.test.frame$Weight)))
pred <- predict(car.tree, newdata=dat)
pred2 <- predict(car2.tree, newdata=dat)
lwr <- dat$Weight[c(1,diff(pred)) != 0]
upr <- dat$Weight[c(diff(pred),1) != 0]
xy2 <- with(car.test.frame, loess.smooth(Weight, Mileage, evaluation=2011))
lwrLO <- xy2$y[c(1,diff(pred)) != 0]
uprLO <- xy2$y[c(diff(pred),1) != 0]
round(rbind(lwr,upr,lwrLO,uprLO,
pred[c(diff(pred),1)!=0], pred2[c(diff(pred),1)!=0]), 1)
Subsection 8.2.4: Predictive accuracy, and the cost-complexity tradeoff
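A minimal sketch (not from the text): the cptable from the finer-grained fit records, for each candidate cp, the number of splits, the resubstitution error and the cross-validated error, both relative to the root node. The resubstitution error always decreases as splits are added; the cross-validated error need not.
## Sketch only: resubstitution error versus cross-validated error, by tree size
cptab <- car2.tree$cptable
round(cbind(nsplit = cptab[, "nsplit"], relerror = cptab[, "rel error"],
            xerror = cptab[, "xerror"]), 3)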
Subsection 8.2.5: Cross-validation
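A minimal sketch (not from the text): xpred.rpart() returns cross-validated predictions, one column per complexity-parameter value; comparing these with the observed values gives relative errors comparable (apart from the random choice of folds) to the xerror column that printcp() reports.
## Sketch only: cross-validated relative errors computed from xpred.rpart()
xvalpred <- xpred.rpart(car.tree, xval=10)
mse <- apply(xvalpred, 2, function(p) mean((car.test.frame$Mileage - p)^2))
round(mse/mean((car.test.frame$Mileage - mean(car.test.frame$Mileage))^2), 3)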
Subsection 8.2.6: The cost-complexity parameter
vignette("longintro", package="rpart")
Subsection 8.2.7: Prediction error versus tree size
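A minimal sketch (not from the text) that plots the cross-validated relative error against tree size, taken directly from the cptable of the spam fit; plotcp() produces a more polished version of the same plot.
## Sketch only: cross-validated relative error against number of leaves
cptab <- spam.rpart$cptable
plot(cptab[, "nsplit"] + 1, cptab[, "xerror"], type="b",
     xlab="Number of leaves", ylab="Cross-validated relative error")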
Section 8.3: The practicalities of tree construction – two examples
Subsection 8.3.1: Data for female heart attack patients
mifem <- DAAG::mifem
summary(mifem)   # data frame mifem (DAAG)
set.seed(29)     # Make results reproducible
mifem.rpart <- rpart(outcome ~ ., method="class", data = mifem, cp = 0.0025)
## Tabular equivalent of Panel A from `plotcp(mifem.rpart)`
printcp(mifem.rpart, digits=3)
cat(c(". . .", capture.output(printcp(mifem.rpart, digits=3))[-(1:9)]),
    sep="\n")
plotcp(mifem.rpart, fg="gray", tcl=-0.25)
mifemb.rpart <- prune(mifem.rpart, cp=0.01) ## Prune tree back to 2 leaves
rpart.plot::rpart.plot(mifemb.rpart, under=TRUE, type=4,
box.palette=0, tweak=1.0)
Subsection 8.3.2: The one-standard-deviation rule
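A minimal sketch (not from the text), paralleling the calculation made for the spam data below: take the smallest tree whose cross-validated error lies within one standard error of the minimum.
## Sketch only: apply the one-standard-error rule to the cptable
cptab <- mifem.rpart$cptable
minRow <- which.min(cptab[, "xerror"])
cutoff <- cptab[minRow, "xerror"] + cptab[minRow, "xstd"]
cptab[min(which(cptab[, "xerror"] < cutoff)), ]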
Subsection 8.3.3: Printed Information on Each Split
print(mifemb.rpart)
set.seed(59)
spam7a.rpart <- rpart(formula = yesno ~ crl.tot + dollar + bang +
money + n000 + make, method="class", cp = 0.002,
model=TRUE, data = DAAG::spam7)
printcp(spam7a.rpart, digits=3)
cpdf <- signif(as.data.frame(spam7a.rpart$cptable), 3)
minRow <- which.min(cpdf[,"xerror"])
upr <- sum(cpdf[minRow, c("xerror","xstd")])
takeRow <- min((1:minRow)[cpdf[1:minRow,"xerror"]<upr])
newNsplit <- cpdf[takeRow, 'nsplit']
cpval <- mean(cpdf[c(takeRow-1,takeRow), "CP"])
spam7b.rpart <- prune(spam7a.rpart, cp=cpval)
rpart.plot::rpart.plot(spam7b.rpart, under=TRUE, box.palette="Grays", tweak=1.65)
How does the one-standard-error rule affect the accuracy of estimates?
requireNamespace('randomForest', quietly=TRUE)
DAAG::compareTreecalcs(data=DAAG::spam7, fun="rpart")
acctree.mat <- matrix(0, nrow=100, ncol=6)
spam7 <- DAAG::spam7
for(i in 1:100)
acctree.mat[i,] <- DAAG::compareTreecalcs(data=spam7, fun="rpart")
Section 8.4: From one tree to a forest – a more global optimality
suppressPackageStartupMessages(library(randomForest))
spam7.rf <- randomForest(yesno ~ ., data=spam7, importance=TRUE)
spam7.rfz <- tuneRF(x=spam7[, -7], y=spam7$yesno, plot=FALSE)
zdash <- t(spam7.rfz[, 2, drop=FALSE])
colnames(zdash) <- paste0(c("mtry=", rep("", 2)), spam7.rfz[, 1])
round(zdash, 3)
importance(spam7.rf)
Subsection 8.4.1: Prior probabilities
Pima.tr <- MASS::Pima.tr
table(Pima.tr$type)
set.seed(41)  # This seed should reproduce the result given here
Pima.rf <- randomForest(type ~ ., data=Pima.tr)
## The above is equivalent to:
## Pima.rf <- randomForest(type ~ ., data=Pima.tr, sampsize=200)
round(Pima.rf$confusion, 3)
tab <- prop.table(table(Pima.tr$type))
Pima.rf <- randomForest(type ~ ., data=Pima.tr, sampsize=c(68,68))
Pima.rf <- randomForest(type ~ ., data=Pima.tr, sampsize=c(132,68))
Subsection 8.4.2: A low-dimensional representation of observations
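A minimal sketch (not from the text): refitting with proximity=TRUE makes randomForest() record how often pairs of observations fall in the same terminal node, and MDSplot() then gives a two-dimensional scaling based on the resulting proximities. The object name Pima.prox.rf is an illustrative choice.
## Sketch only: proximity-based two-dimensional representation
Pima.prox.rf <- randomForest(type ~ ., data=Pima.tr, proximity=TRUE)
MDSplot(Pima.prox.rf, fac=Pima.tr$type, k=2)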
Subsection 8.4.3: Models with a complex error structure
Section 8.5: Additional notes – one tree, or many trees?
Subsection 8.5.1: Differences between rpart() and randomForest()
Error rates – rpart() versus randomForest()
## Accuracy comparisons
acctree.mat <- matrix(0, nrow=100, ncol=8)
colnames(acctree.mat) <- c("rpSEcvI", "rpcvI", "rpSEtest", "rptest",
"n.SErule", "nre.min.12", "rfOOBI", "rftest")
for(i in 1:100)acctree.mat[i,] <- DAAG::compareTreecalcs(data=spam7, cp=0.0004,
fun=c("rpart", "randomForest"))
acctree.df <- data.frame(acctree.mat)
lims <- range(acctree.mat[, c(4,7,8)], na.rm=TRUE)
cthrublack <- adjustcolor("black", alpha.f=0.75)
# Panel A
plot(rfOOBI ~ rftest, data=acctree.df, xlab="Error rate - subset II", xlim=lims,
ylim=lims, ylab="OOB Error - fit to subset I", col=cthrublack, fg="gray")
abline(0,1)
mtext(side=3, line=0.5, "A", adj=0)
# Panel B
plot(rptest ~ rftest, data=acctree.df, xlab="Error rate - subset II",
ylab="rpart Error rate, subset II", xlim=lims, ylim=lims,
col=cthrublack, fg="gray")
abline(0,1)
mtext(side=3, line=0.5, "B", adj=0)
acctree.mat <- matrix(0, nrow=100, ncol=8)
colnames(acctree.mat) <- c("rpSEcvI", "rpcvI", "rpSEtest", "rptest",
"n.SErule", "nre.min.12", "rfcvI", "rftest")
for(i in 1:100)acctree.mat[i,] <- DAAG::compareTreecalcs(data=spam7,
fun=c("rpart", "randomForest"))
## Panel A: Plot `rfOOBI` against `rftest`
## Panel B: Plot `rptest` against `rftest`
Subsection 8.5.2: Tree-based methods, versus other approaches
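A minimal sketch (not from the text): on a random half/half split of the spam data, compare the test-set error rate from a logistic regression with that from a random forest fitted to the same columns. The object names spam.glm and spamHalf.rf, and the 50/50 split, are illustrative choices.
## Sketch only: logistic regression versus random forest on a half/half split
trRows <- sample(1:nrow(spam7), size=round(nrow(spam7)/2))
spam.glm <- glm(yesno ~ ., family=binomial, data=spam7[trRows, ])
glmPred <- predict(spam.glm, newdata=spam7[-trRows, ], type="response") > 0.5
glmErr <- mean(glmPred != (spam7$yesno[-trRows] == "y"))
spamHalf.rf <- randomForest(yesno ~ ., data=spam7[trRows, ],
                            xtest=spam7[-trRows, -7], ytest=spam7$yesno[-trRows])
rfErr <- mean(spamHalf.rf$test$err.rate[, "Test"])
round(c(glm = glmErr, randomForest = rfErr), 3)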
Subsection 8.5.3: Further notes
Section 8.6: Further reading and extensions
Exercises (8.7)
8.5
sapply(MASS::biopsy, function(x)sum(is.na(x))) ## Will omit rows with NAs
biops <- na.omit(MASS::biopsy)[,-1] ## Omit also column 1 (IDs)
## Examine list element names in randomForest object
names(randomForest(class ~ ., data=biops))
8.5a
## Repeated runs, note variation in OOB accuracy.
for(i in 1:10) {
biops.rf <- randomForest(class ~ ., data=biops)
OOBerr <- mean(biops.rf$err.rate[,"OOB"])
print(paste(i, ": ", round(OOBerr, 4), sep=""))
print(round(biops.rf$confusion,4))
}
8.5b
## Repeated train/test splits: OOB accuracy vs test set accuracy.
for(i in 1:10){
trRows <- sample(1:dim(biops)[1], size=round(dim(biops)[1]/2))
biops.rf <- randomForest(class ~ ., data=biops[trRows, ],
xtest=biops[-trRows,-10], ytest=biops[-trRows,10])
oobErr <- mean(biops.rf$err.rate[,"OOB"])
testErr <- mean(biops.rf$test$err.rate[,"Test"])
print(round(c(oobErr,testErr),4))
}
8.5c
randomForest(class ~ ., data=biops, xtest=biops[,-10], ytest=biops[,10])
8.7
## Run model on total data
randomForest(as.factor(type) ~ ., data=Pima.tr)
rowsamp <- sample(dim(Pima.tr)[1], replace=TRUE)
randomForest(as.factor(type) ~ ., data=Pima.tr[rowsamp, ])
8.8a
d500 <- ggplot2::diamonds[sample(1:nrow(ggplot2::diamonds), 500),]
unlist(sapply(d500, class)) # Check the class of the 10 columns
car::spm(d500) # If screen space is limited do two plots, thus:
# 1) variables 1 to 5 and 7 (`price`); 2) variables 6 to 10
plot(density(d500[, "price", drop = T])) # Distribution is highly skew
MASS::boxcox(price ~ ., data=ggplot2::diamonds)  # Suggests log transformation
8.8b
diamonds <- ggplot2::diamonds; Y <- diamonds[,"price", drop=T]
library(rpart)
d7.rpart <- rpart(log(Y) ~ ., data=diamonds[,-7], cp=5e-7) # Complex tree
d.rpart <- prune(d7.rpart, cp=0.0025)
printcp(d.rpart) # Relative to `d7.rpart`, simpler and less accurate
nmin <- which.min(d7.rpart$cptable[,'xerror'])
dOpt.rpart <- prune(d7.rpart, cp=d7.rpart$cptable[nmin,'CP'])
print(dOpt.rpart$cptable[nmin])
(xerror12 <- dOpt.rpart$cptable[c(nrow(d.rpart$cptable),nmin), "xerror"])
## Subtract from 1.0 to obtain R-squared statistics
rbind("d.rpart"=d.rpart[['variable.importance']],
"dOpt.rpart"=dOpt.rpart[['variable.importance']]) |>
(\(x)100*apply(x,1,function(x)x/sum(x)))() |> round(1) |> t()
8.9
Y <- ggplot2::diamonds[,"price", drop=T]
samp5K <- sample(1:nrow(diamonds), size=5000)
(diamond5K.rf <- randomForest(x=diamonds[samp5K,-7], y=log(Y[samp5K]),
xtest=diamonds[-samp5K,-7], ytest=log(Y[-samp5K])))
## Omit arguments `xtest` and `ytest` if calculations take too long
sort(importance(diamond5K.rf)[,1], decreasing=T) |>
(\(x)100*x/sum(x))() |> round(1) |> t()
8.9a
(diamond5KU.rf <- randomForest(x=diamonds[samp5K,-7], y=Y[samp5K],
xtest=diamonds[-samp5K,-7], ytest=Y[-samp5K]))
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
code <- knitr::knit_code$get()
txt <- paste0("\n## ", names(code),"\n", sapply(code, paste, collapse='\n'))
writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch8.R")
}