9 Multivariate data exploration and discrimination
Packages required (plus any dependencies)
Packages used are: DAAG MASS RColorBrewer teigen BiocManager DAAGbio hddplot lmtest splines cobalt mice datasets car micemd oz randomForest ggplot2 latticeExtra mvtnorm limma mgcv MatchIt sandwich gridExtra mlbench ape gclus (exercises).
Additionally, knitr and Hmisc are required in order to process the Rmd source file.
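The check below is not part of the original source; it is one way to list any of the above packages that still need to be installed (DAAGbio and limma are Bioconductor packages; see the Bioconductor setup code in Section 9.5).
pkgs <- c("DAAG","MASS","RColorBrewer","teigen","BiocManager","hddplot","lmtest",
          "cobalt","mice","car","micemd","oz","randomForest","ggplot2",
          "latticeExtra","mvtnorm","mgcv","MatchIt","sandwich","gridExtra",
          "mlbench","ape","gclus","knitr","Hmisc")
setdiff(pkgs, rownames(installed.packages()))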
Hmisc::knitrSet(basename="mva", lang='markdown', fig.path="figs/g", w=7, h=7)
oldopt <- options(digits=4, formatR.arrow=FALSE, width=70, scipen=999)
library(knitr)
opts_chunk[['set']](cache.path='cache-', out.width="80%", fig.align="center",
                    fig.show='hold', ps=10, strip.white = TRUE,
                    comment=NA, width=70, tidy.opts = list(replace.assign=FALSE))
Section 9.1: Multivariate exploratory data analysis
## Make the lattice package and the possum dataset available
library(latticeExtra)
possum <- DAAG::possum
Subsection 9.1.1: Scatterplot matrices
## Colors distinguish sexes; symbols distinguish sites
sitenames <- row.names(DAAG::possumsites)[c(1,2,4:6,3,7)]
key <- list(points = list(pch=0:6), text=list(sitenames),
            columns=4, between=1, between.columns=2)
colr <- c("red","blue")
vnames <- c("tail\nlength","foot\nlength", "conch\nlength")
gphA <- with(possum, splom(~ possum[, 9:11], pch=(0:6)[site], col=colr[sex],
                           xlab="", varnames=vnames, key=key, axis.line.tck=0.6))
gphB <- with(possum, cloud(earconch~taill+footlgth, data=possum,
                           col=colr[sex], key=key, pch = (0:6)[site],
                           zlab=list("earconch", rot=90), zoom=0.925))
update(c("A: Scatterplot matrix"=gphA, "B: Cloud plot"=gphB),
between=list(x=1))
Subsection 9.1.2: Principal components analysis
Preliminary data scrutiny
## Ratios of largest to smallest values: possum[, 6:14] (DAAG)
possum <- DAAG::possum
sapply(na.omit(possum[, 6:14]), function(x)round(max(x)/min(x),2))
## Principal components calculations: possum[, 6:14] (DAAG)
here <- complete.cases(possum[, 6:14])
possum.prc <- prcomp(log(possum[here, 6:14]))
scores <- cbind(predict(possum.prc), possum[here, c('sex', 'site')])
## For parset, key and colr; see code for Fig 9.1
pchr <- c(3,4,0,8,2,10,1)
parset <- list(fontsize=list(text=10, points=6), cex=0.75, pch=pchr, alpha=0.8)
key <- modifyList(key, list(columns=1, space="right"))
gph <- with(scores, xyplot(PC2 ~ PC1, aspect="iso", key = key,
                           col = colr[sex], pch = (0:6)[site]))
update(gph, scales=list(tck=0.5), par.settings=parset,
xlab="1st Principal Component", ylab="2nd Principal Component")
print(summary(possum.prc),digits=2)
cat("\nRotations (otherwise called Loadings)\n")
print(possum.prc$rotation, digits=2)
## By default, blanks are shown for loadings < 0.1 in magnitude
The stability of the principal components plot
suppressPackageStartupMessages(library(ggplot2))
theme_set(theme_gray(base_size=8))
## Bootstrap principal components calculations: possum (DAAG)
## Sample from rows where there are no missing values
rowsfrom <- (1:nrow(possum))[complete.cases(possum[, 6:14])]
logpossum6to14 <- log(possum[rowsfrom, 6:14])
sexPop <- possum[rowsfrom, c("sex","Pop")]
n <- length(rowsfrom); ntimes <- 3
bootscores <- data.frame(scores1=numeric(ntimes*n), scores2=numeric(ntimes*n))
for (i in 1:ntimes){
  samprows <- sample(1:n, n, replace=TRUE)
  bootscores[n*(i-1)+(1:n), 1:2] <-
    prcomp(logpossum6to14[samprows, ])$x[, 1:2]
}
bootscores[, c("sex","Pop")] <- sexPop[samprows, ]
bootscores$sampleID <- factor(rep(1:ntimes, rep(n,ntimes)))
gph <- quickplot(x=scores1, y=scores2, colour=sex, size=I(1.0),
                 asp=1, shape=Pop, facets=.~sampleID, data=bootscores) +
  scale_shape_discrete(solid=F)
gph + scale_colour_manual(values=c("m"="blue","f"="red")) +
  xlab("First Principal Component") + ylab("Second Principal Component")
Subsection 9.1.3: Multi-dimensional scaling
Ordination
## Code that will display individual graphs
d.possum <- dist(possum[,6:14])   # Euclidean distance matrix
MASS::sammon(d.possum, k=2, trace=FALSE)$points |> as.data.frame() |>
  setNames(paste0("ord",1:2)) |> cbind(Pop=DAAG::possum$Pop) -> sammon.possum
MASS::isoMDS(d.possum, k=2, trace=FALSE)$points |> as.data.frame() |>
  setNames(paste0("ord",1:2)) |> cbind(Pop=DAAG::possum$Pop) -> mds.possum
gph1 <- xyplot(ord2~ord1, groups=Pop, aspect="iso", data=sammon.possum)
gph2 <- xyplot(ord2~ord1, groups=Pop, aspect="iso", data=mds.possum)
update(c(gph1, gph2, layout=c(2,1)),
par.settings=simpleTheme(pch=c(1,3)),
between=list(x=0.5), auto.key=list(columns=2),
strip=strip.custom(factor.levels=c("A: Sammon","B: ISOmds")))
Section 9.2: Principal component scores in regression
## Principal components: data frame socsupport (DAAG)
socsupport <- DAAG::socsupport
ss.pr1 <- prcomp(as.matrix(na.omit(socsupport[, 9:19])), retx=TRUE, scale=TRUE)
oldpar <- par(fg='gray40', col.axis='gray20', lwd=0.5, col.lab='gray20')
pairs(ss.pr1$x[, 1:3], col='gray40', gap=0.2)
par(oldpar)
summary(sort(ss.pr1$rotation[,1]))
## Note the very large maximum value
which.max(ss.pr1$x[,1])
## Try also boxplot(ss.pr1$x[,1])
## ss.pr1$x["36",1] ## Check that this returns 42
use <- complete.cases(socsupport[, 9:19])
use[36] <- FALSE
ss.pr <- prcomp(as.matrix(socsupport[use, 9:19]))
## Output from summary()
print(summary(ss.pr), digits=1) # Compare contributions
comp <- as.data.frame(ss.pr$x[,1:6])
ss.lm <- lm(socsupport[use, "BDI"] ~ ., data=comp)
signif(round(coef(summary(ss.lm)),5), digits=3)
print(ss.pr$rotation[, 1], digits=2)
## Plot BDI against first principal component score
gph <- xyplot(BDI ~ ss.pr$x[, 1], groups=gender, data=socsupport[use, ],
              par.settings=simpleTheme(pch=1:2), auto.key=list(columns=2))
bw9 <- list(pch=c(1,3), fontsize=list(text=9, points=5))
update(gph, scales=list(tck=0.5), par.settings=bw9,
xlab ="1st principal component")
Section 9.3: Cluster analysis
Subsection 9.3.1: Hierarchical Clustering
library(mvtnorm)
makeClust <- function(n=6, d1=4, d2=4, sigs=c(1, 1, 1, 1), seed=NULL){
  if(!is.null(seed))set.seed(seed)
  g1 <- rmvnorm(n, mean = c(-d1,d2), sigma=sigs[1]*diag(2))
  g2 <- rmvnorm(n, mean = c(d1,d2), sigma=sigs[2]*diag(2))
  g3 <- rmvnorm(n, mean = c(-d1,-d2), sigma=sigs[3]*diag(2))
  g4 <- rmvnorm(n, mean = c(d1,-d2), sigma=sigs[4]*diag(2))
  rbind(g1,g2,g3,g4)
}
## Code for the plots
datA <- makeClust(seed=35151)
datB <- makeClust(d2=16, seed=35151)
datC <- makeClust(d1=2, d2=2, seed=35151)
plot(datA, xlab="X1", ylab="X2", fg="gray")
title(main="A: 4blobsA", adj=0, line=0.5, font.main=1)
## Repeat previous two lines for datB and datC
plot(datB, xlab="X1", ylab="X2", fg="gray")
title(main="B: 4blobsB", adj=0, line=0.5, font.main=1)
plot(datC, xlab="X1", ylab="X2", fg="gray")
title(main="C: 4blobsC", adj=0, line=0.5, font.main=1)
## Possible alternative
config <- c('Equidistant blobs', 'Pulled vertically', 'Closer centers')
dat123 <- cbind(as.data.frame(rbind(datA, datB, datC)),
                gp=factor(rep(1:3, rep(6*4,3)), labels=config))
xyplot(V2 ~ V1 | gp, data=dat123, scales=list(relation='free'),
strip=strip.custom(factor.levels=config), between=list(x=0.5),
par.settings=DAAG::DAAGtheme(color=F))
## Code for single linkage plots: `?plot.hclust` gives help for the plot method
clusres_sing <- hclust(dist(datA), method="single")
par(fig=c(0,0.75,0,1))
plot(clusres_sing, sub="", xlab="", ylab="Distance joined", adj=0.5,
main="", fg="gray")
mtext('A: Single linkage cluster dendrogram, for 4blobsA layout', side=3, adj=0,
font=1, line=1, cex=1.15)
par(fig=c(0.72,1,0,1), new=TRUE)
membs <- cutree(clusres_sing, 4)
col4 <- RColorBrewer::brewer.pal(4, 'Set1')
plot(datA, xlab="X1", ylab="X2", col=col4[membs], fg='gray', pch=membs+1)
mtext('B: 4blobsA, by color', side=3, adj=1.0, font=1, cex=1.15, line=1)
## To see plots from 'average' and 'complete' linkage methods, do:
# plot(hclust(dist(datB), method="average"))
# plot(hclust(dist(datC), method="complete"))
## Dendrograms from data where blobs were pulled vertically
## Follow each use of `hclust()` with a `plot()` command
sclusres_sing <- hclust(dist(datB), method="single")
plot(sclusres_sing, sub="", xlab="", ylab="Distance Joined", main="")
title(main='A: Single linkage, blobs pulled vertically (4blobsB)',
adj=0, font.main=1)
sclusres_sing_s <- hclust(dist(scale(datB)), method="single")
plot(sclusres_sing_s, sub="", xlab="", ylab="Distance Joined", main="")
title(main='B: Single linkage, (4blobsB, rescaled to variance 1)',
adj=0, font.main=1)
# sclusres_avg_s <- hclust(dist(scale(datB)), method="average")
# #plot(sclusres_avg_s, sub="", xlab="", ylab="")
# sclusres_comp_s <- hclust(dist(scale(datB)), method="complete")
# #plot(sclusres_comp_s, sub="", xlab="", ylab="")
## Code. Follow each use of `hclust()` with a `plot()` command
clusres_sing2 <- hclust(dist(datC), method="single")
plot(clusres_sing2, sub="", xlab="", ylab="", cex=1.25, cex.main=1.65,
     main="A: Single linkage, closer clusters (4blobsC)", adj=0, font.main=1)
clusres_avg2 <- hclust(dist(datC), method="average")
plot(clusres_avg2, sub="", xlab="", ylab="", cex=1.25, cex.main=1.65,
     main="B: Average linkage, closer clusters (4blobsC)", adj=0, font.main=1)
clusres_comp2 <- hclust(dist(datC), method="complete")
plot(clusres_comp2, sub="", xlab="", ylab="", cex=1.25, cex.main=1.65,
main="C: Complete linkage, closer clusters (4blobsC)", adj=0, font.main=1)
Subsection 9.3.2: \(k\)-Means Clustering
set.seed(35151)
kdat <- makeClust(n=100, d1=5, d2=5, sigs=c(.5, .5, 6, 6))
plot(kdat, xlab="X1", ylab="X2", fg="gray")
kmres <- kmeans(kdat, 4, nstart=30)
plot(kdat, col=rainbow(4)[kmres$cluster], pch=kmres$cluster+1,
xlab="X1", ylab="X2", fg="gray")
Subsection 9.3.3: Mixture model-based clustering
## Code
plotMix2 <- function(taus=c(.5, .5), means=c(10,15), sds=c(3,1), xlims=c(0,20)){
  curve(taus[1]*dnorm(x, mean=means[1], sd=sds[1]) +
        taus[2]*dnorm(x, mean=means[2], sd=sds[2]),
        from=xlims[1], to=xlims[2], ylab="Density", fg="gray")
curve(taus[1]*dnorm(x, mean=means[1], sd=sds[1]),
from=xlims[1], to=xlims[2], col="red", lty=2, add=TRUE, fg="gray")
curve(taus[2]*dnorm(x, mean=means[2], sd=sds[2]),
from=xlims[1], to=xlims[2], col="blue", lty=3, add=TRUE, fg="gray")
}
plotMix2(taus=c(.2, .8))
plotMix2(taus=c(.5, .5))
plotMix2(taus=c(.9, .1))
library(teigen)
possml <- na.omit(DAAG::possum[, c(3,9:11)])
set.seed(513451)
gaus_fit <- teigen(possml[,2:4], models="UUUU", gauss=TRUE, verbose=FALSE,
                   scale=FALSE)
## BIC values are plotted against number of groups
gaus_fit$allbic
plot(gaus_fit$allbic, type="b", ylab="", xlab="Number of Groups", fg="gray")
mtext(side=2, line=3.5, "BIC", las=0)
axis(1, at=1:9, fg="gray")
table(possml$Pop, gaus_fit$classification)
par(fig=c(0, 0.5, 0.5, 1))
plot(gaus_fit, what="contour", xmarg=1, ymarg=2, draw.legend=FALSE, fg="gray")
## See ?teigen::plot.teigen for details of the plot command used here.
par(fig=c(0, 0.5, 0, 0.5), new=TRUE)
plot(gaus_fit, what="contour", xmarg=1, ymarg=3, draw.legend=FALSE, fg="gray")
par(fig=c(0.5, 1, 0, 0.5), new=TRUE)
plot(gaus_fit, what="contour", xmarg=2, ymarg=3, draw.legend=FALSE, fg="gray")
par(fig=c(0,1,0,1))
Subsection 9.3.4: Relationship between \(k\)-means and mixture models
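The sketch below is not from the original source; it is a minimal illustration of the connection, using the kdat data simulated above. Assigning each point to its nearest k-means center is equivalent to assigning it to the component with the highest density in a mixture of spherical Gaussians that share a common variance and have equal mixing proportions.
## Illustrative sketch only (assumes kdat, and the mvtnorm package, from above)
km <- kmeans(kdat, centers=4, nstart=30)
## Squared distance of each point from each center
dist2 <- sapply(1:4, function(k) colSums((t(kdat) - km$centers[k,])^2))
## Density of each point under a spherical Gaussian centered at each center
dens <- sapply(1:4, function(k)
  mvtnorm::dmvnorm(kdat, mean=km$centers[k,], sigma=diag(2)))
## The nearest-center and maximum-density assignments agree
table(nearest=apply(dist2, 1, which.min), maxdensity=apply(dens, 1, which.max))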
Section 9.4: Discriminant analysis
Subsection 9.4.1: Example – plant architecture
leafshape17 <- DAAG::leafshape17
plot(bladelen ~ bladewid, data=leafshape17, pch=c(1,3)[arch+1])
## For panel B, specify log="xy" in the call to plot()
Subsection 9.4.2: Logistic regression
## Fit logistic regression model
leafshape17 <- DAAG::leafshape17
leaf17.glm <- glm(arch ~ logwid + loglen, family=binomial(link=logit),
                  data=leafshape17)
print(DAAG::sumry(leaf17.glm)$coef, digits=2)
Predictive accuracy
set.seed(29)
leaf17.cv <- DAAG::CVbinary(leaf17.glm)
tCV <- table(DAAG::leafshape17[["arch"]], round(leaf17.cv$cvhat))
rownames(tCV) <- colnames(tCV) <- c("0=Plagiotropic","1=Orthotropic")
cbind(tCV, "Proportion correct"=c(tCV[1,1], tCV[2,2])/(tCV[,1]+tCV[,2]))
round(unlist(leaf17.cv[c("acc.training","acc.cv")]),3)
Subsection 9.4.3: Linear discriminant analysis
suppressPackageStartupMessages(library(MASS))
## Discriminant analysis; data frame leafshape17 (DAAG)
leaf17.lda <- lda(arch ~ logwid+loglen, data=DAAG::leafshape17)
print(leaf17.lda)
Assessments of predictive accuracy
set.seed(29)
leaf17cv.lda <- lda(arch ~ logwid+loglen, data=leafshape17, CV=TRUE)
## The list element 'class' gives the predicted class
## The list element 'posterior' holds posterior probabilities
tab <- table(leafshape17$arch, leaf17cv.lda$class)
rownames(tab) <- colnames(tab) <- c("0=Plagiotropic","1=Orthotropic")
cbind(tab, "Proportion correct"=c(tab[1,1], tab[2,2])/(tab[,1]+tab[,2]))
cbind(tab, c(tab[1,1], class.acc=tab[2,2])/(tab[,1]+tab[,2]))
cat("Overall proportion correct =", sum(tab[row(tab)==col(tab)])/sum(tab), "\n")
Subsection 9.4.4: An example with more than two groups
## Linear discriminant calculations for possum data
possum <- DAAG::possum
possum.lda <- lda(site ~ hdlngth + skullw + totlngth + taill + footlgth +
                  earconch + eye + chest + belly, data=na.omit(possum))
# na.omit() omits any rows that have one or more missing values
plot(possum.lda, dimen=3, col=1:9)
# Scatterplot matrix - scores on 1st 3 canonical variates
# See `?plot.lda` for details of the generic lda plot function
## Linear discriminant calculations for possum data
print(possum.lda, digits=3)
Section 9.5: *High-dimensional data — RNA-Seq gene expression
Setup for installing and using Bioconductor packages
## For latest details, see: https://www.bioconductor.org/install/
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install()
BiocManager::install(c('limma','multtest'))
Subsection 9.5.1: Data and design matrix setup
counts <- DAAGbio::plantStressCounts
colSums(counts)
## Require at least 3 counts per million that are > 1
keep <- rowSums(counts) >= 3
counts <- counts[keep, ]
treatment <- factor(rep(c("CTL", "L", "D"), rep(3,3)))
design <- model.matrix(~0+treatment)
colnames(design) <- levels(treatment)
A two-dimensional representation
library(limma)
v <- voom(counts, design, plot=TRUE)
par(oma=c(0,0,1,0))
library(limma)
v <- voom(counts, design, plot=TRUE)
firstchar <- substring(colnames(counts), 1, 1)
plotMDS(counts, labels=paste0(firstchar, rep(1:3,3)), cex=0.8)
box(col="gray")
mtext(side=3, line=0.4, adj=0, "MDS summary plot")
mtext(side=3, line=-0.25, adj=0.105, "A", outer=TRUE)
mtext(side=3, line=-0.25, adj=0.605, "B", outer=TRUE)
Fitting the model
fit <- lmFit(v, design)
contrs <- c("D-CTL", "L-CTL", "L-D")
contr.matrix <- makeContrasts(contrasts=contrs,
                              levels=levels(treatment))
fit2 <- contrasts.fit(fit, contr.matrix)
efit2 <- eBayes(fit2)
Subsection 9.5.2: From \(p\)-values to false discovery rate (FDR)
## First contrast only; Drought-CTL
print(round(topTable(efit2, coef=1, number=4),15), digits=3)
round(sort(p.adjust(p=efit2$p.value[,1], method="BH"))[1:4], 15) # Not run
round(topTable(efit2, number=4), 15)
head(decideTests(efit2), 5)
summary(decideTests(efit2))
## Try also
## summary(decideTests(efit2, p.value=0.001))
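The following is not from the original source. It is a minimal worked illustration of the Benjamini-Hochberg adjustment that p.adjust(method="BH") applies: order the p-values, multiply the i-th smallest by m/i, enforce monotonicity from the largest downwards, and cap at 1.
## Illustrative sketch only, with a small made-up vector of p-values
p <- c(0.0002, 0.001, 0.012, 0.04, 0.3)
m <- length(p)
ord <- order(p)
bh <- pmin(1, rev(cummin(rev(p[ord] * m / seq_len(m)))))[order(ord)]
cbind(p, bh, p.adjust(p, method="BH"))   # the two adjusted columns agree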
Section 9.6: High dimensional data from expression arrays
Subsection 9.6.1: Molecular classification of cancer — an older technology
Breakdown of ALL B-type data, with one observation excluded
library(hddplot)
data(golubInfo)
with(golubInfo, table(cancer, tissue.mf))
## Identify allB samples that are BM:f or BM:m or PB:m
subsetB <- with(golubInfo,
                cancer=="allB" & tissue.mf %in% c("BM:f","BM:m","PB:m"))
## Separate off the relevant columns of the matrix Golub
## NB: variables (rows) by cases (columns)
GolubB <- with(golubInfo, Golub[, subsetB])
## Form vector that identifies these as BM:f or BM:m or PB:m
tissue.mfB <- with(golubInfo, tissue.mf[subsetB, drop=TRUE])
## Change the level names to leave out the colons
levels(tissue.mfB) <- list("b_f"="BM:f", "b_m"="BM:m", "PBm"="PB:m")
Subsection 9.6.2: Classifications and associated graphs
Preliminary data manipulation
## Display distributions for the first 20 observations
boxplot(data.frame(GolubB[, 1:20])) # First 20 columns (observations)
## Random selection of 20 rows (features)
boxplot(data.frame(GolubB[sample(1:7129, 20), ]))
Flawed graphs
<- c("red","blue","gray40", "magenta")
colr <- golubInfo[, "tissue.mf"]
tissue.mf <- golubInfo[, "cancer"]
cancer <- Golub[, tissue.mf=="PB:f" & cancer=="allB", drop=FALSE]
G.PBf set.seed(41)
<- matrix(rnorm(prod(dim(GolubB))), nrow=dim(GolubB)[1])
rGolubB rownames(rGolubB) <- rownames(Golub)
<- matrix(rnorm(prod(dim(G.PBf))), nrow=dim(G.PBf)[1])
rG.PBf <- function(x = GolubB, cl=tissue.mfB, x.omit=Golub.PBf, cl.omit="PBf",
plot2 ncol = length(cl), nfeatures=12, device = "", seed = 37,
pretext="", colr=1:3, levnames = NULL,
ylab="2nd discriminant function"){
<- factor(cl)
cl if(!is.null(levnames))levels(cl) <- levnames
<- orderFeatures(x, cl=cl)[1:nfeatures]
ord15 <- t(x[ord15, ])
dfB <- lda(dfB, grouping=cl)
dfB.lda <- predict(dfB.lda, dimen=2)$x
scores <- data.frame(t(x.omit[ord15, drop=FALSE]))
df.PBf colnames(df.PBf) <- colnames(dfB)
<- predict(dfB.lda, newdata=df.PBf)$x
scores.other scoreplot(list(scores=scores, cl=cl, other=scores.other, cl.other=cl.omit, nfeatures=nfeatures), prefix.title=pretext, adj.title=0,
fg="gray", params=list(other=list(pch=4, cex=1.5)),
xlab="1st discriminant function", ylab=ylab)
}plot2(x = GolubB, cl = tissue.mfB, x.omit=G.PBf, cl.omit="PBf",
nfeatures=15, device = "", seed = 37, ylab="2nd discriminant function",
colr=colr, pretext="A: ALL B-cell:")
plot2(x = rGolubB, cl = tissue.mfB, x.omit=rG.PBf, cl.omit="Gp 4",
device = "", seed = 37, colr=colr, levnames = c("Gp 1", "Gp 2", "Gp 3"),
pretext="B: Random data:", ylab="")
## Uses orderFeatures() (hddplot); see below
ord15 <- orderFeatures(GolubB, cl=tissue.mfB)[1:15]
## Panel A: Take 1st 15 features & transpose to observations by features
dfB15 <- data.frame(t(GolubB[ord15, ]))
dfB15.lda <- MASS::lda(dfB15, grouping=tissue.mfB)
scores <- predict(dfB15.lda, dimen=2)$x
## Scores for the single PB:f observation
chooseCols <- with(golubInfo, tissue.mf=="PB:f" & cancer=="allB")
df.PBf <- data.frame(t(Golub[ord15, chooseCols, drop=FALSE]))
scores.PBf <- predict(dfB15.lda, newdata=df.PBf, dimen=2)$x
## Warning! The plot that now follows may be misleading!
## Use hddplot::scoreplot()
scoreplot(list(scores=scores, cl=tissue.mfB, other=scores.PBf, cl.other="PB:f"),
fg="gray")
## Panel B: Repeat plot, now with random normal data
simscores <- simulateScores(nrow=7129, cl=rep(1:3, c(19,10,2)),
                            cl.other=4, nfeatures=15, seed=41)
# Returns list elements: scores, cl, scores.other & cl.other
scoreplot(simscores)
Subsection 9.6.3: The mean-variance relationship
par(oma=c(0,0,1,0))
designG <- model.matrix(~0+tissue.mfB)
colnames(designG) <- levels(tissue.mfB)
vG <- vooma(GolubB, designG, plot=TRUE)   # Panel A
plotMDS(vG, pch=unclass(tissue.mfB), cex=0.8)   # Panel B
<- c("BM:female","BM:male","PB:female")
leglabs legend(x="bottomright", bty="n", legend=leglabs, pch=1:3)
mtext(side=3, line=0.4, adj=0, "MDS summary plot")
mtext(side=3, line=-0.275, adj=0.085, "A", outer=TRUE)
mtext(side=3, line=-0.275, adj=0.585, "B", outer=TRUE)
Cross-validation for a range of choices of number of features
## Cross-validation to determine the optimum number of features
## 10-fold (x4). Warning messages are omitted.
## Accuracy measure will be: tissue.mfB.cv$acc.cv
tissue.mfB.cv <- cvdisc(GolubB, cl=tissue.mfB, nfeatures=1:23,
                        nfold=c(10,4), print.progress=FALSE)
## Defective measures will be in acc.resub (resubstitution)
## and acc.sel1 (select features prior to cross-validation)
tissue.mfB.badcv <- defectiveCVdisc(GolubB, cl=tissue.mfB,
                                    foldids=tissue.mfB.cv$folds, nfeatures=1:23)
##
## Calculations for random normal data:
set.seed(43)
rGolubB <- matrix(rnorm(prod(dim(GolubB))), nrow=nrow(GolubB))
rtissue.mfB.cv <- cvdisc(rGolubB, cl=tissue.mfB, nfeatures=1:23,
                         nfold=c(10,4), print.progress=FALSE)
rtissue.mfB.badcv <- defectiveCVdisc(rGolubB, cl=tissue.mfB,
                                     nfeatures=1:23,
                                     foldids=rtissue.mfB.cv$folds)
Which features?
genelist <- matrix(tissue.mfB.cv$genelist[1:3, , ], nrow=3)
tab <- table(genelist, row(genelist))
ord <- order(tab[,1], tab[,2], tab[,3], decreasing=TRUE)
tab[ord, ]
Subsection 9.6.4: Graphs derived from the cross-validation process
## Uses tissue.mfB.cv from above
tissue.mfB.scores <-
  cvscores(cvlist = tissue.mfB.cv, nfeatures = 3, cl.other = NULL)
scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL,
prefix="B-cell subset -", fg='gray')
Subsection 9.6.5: Estimating contrasts, and calculating False Discovery Rates
fitG <- lmFit(vG, designG)
contrs <- c("b_f-b_m", "b_f-PBm", "b_m-PBm")
contr.matrix <- makeContrasts(contrasts=contrs,
                              levels=levels(tissue.mfB))
fit2 <- contrasts.fit(fitG, contr.matrix)
fit2 <- eBayes(fit2)
From \(p\)-values to false discovery rate (FDR)
print(topTable(fit2, number=5), digits=2)
summary(decideTests(fit2))
## Try also
## summary(decideTests(fit2, p.value=0.001))
Section 9.7: Causal inference from observational data — balance and matching
Subsection 9.7.1: Tools for the task
library(DAAG)
## Columns 4:7 are factors; columns 9:10 (re75 & re78) are continuous
propmat <- matrix(0, ncol=6, nrow=8)
dimnames(propmat) <- list(c("psid1", "psid2", "psid3", "cps1", "cps2", "cps3",
"nsw-ctl", "nsw-trt"), names(nswdemo)[c(4:7, 9:10)])
for(k in 1:8){
  dframe <- switch(k, psid1, psid2, psid3, cps1, cps2, cps3,
                   subset(nswdemo, trt==0), subset(nswdemo, trt==1))
  propmat[k,] <- c(sapply(dframe[,4:7], function(x){
    z <- table(x); z[2]/sum(z)}),
    sapply(dframe[,9:10], function(x)sum(x>0)/sum(!is.na(x))))
}
PGtheme <- DAAG::DAAGtheme(color=TRUE)
library(DAAG)
if(!require(grid))return("Package 'grid' is not installed -- cannot proceed")
dsetnames <- c("nsw-ctl", "nsw-trt", "psid1", "psid2", "psid3",
               "cps1", "cps2", "cps3")
colrs <- c("gray","black", PGtheme$superpose.line$col[1:3])
lty <- c(1,2,1,1,1)
lwd <- c(1,0.75,0.75,0.75,0.75)
denplot <-
  function(sel=c(1:2,6:8), yvar="re75", offset=30, ylim=c(0,1.75),
           from=NULL, at=c(.5,1,1.5), labels=paste(at), bw="nrd0",
           ylab="Density", takelog=TRUE, col.axis="black"){
    nzre <- unlist(lapply(list(subset(nswdemo, trt==0),
                               subset(nswdemo, trt==1),
                               psid1, psid2, psid3, cps1, cps2, cps3)[sel],
                          function(x){z <- x[,yvar]; z[z>0]}))
    num <- unlist(lapply(list(subset(nswdemo, trt==0), subset(nswdemo, trt==1),
                              psid1, psid2, psid3, cps1, cps2, cps3),
                         function(x){z <- x[,yvar]; sum(z>0)}))
    xy <- data.frame(nzre=nzre, fac = factor(rep(dsetnames[sel], num[sel]),
                                             levels=dsetnames[sel]))
    if(takelog) {
      y <- log(xy$nzre+offset)
      xlab <- paste("log(", yvar, "+", offset, ")", sep="")} else
    {
      y <- xy$nzre
      xlab <- yvar
    }
    densityplot(~ y, groups=fac, data=xy, bw=bw, from=from,
                scales=list(y=list(at=at, labels=labels, col=col.axis), tck=0.25),
                plot.points=FALSE, col=colrs[1:5], lwd=lwd, lty=lty,
                key=list(x=0.01, y=0.99, text=list(dsetnames[sel[3:5]]), col=colrs[3:5],
                         cex=0.75, lines=list(lwd=rep(1.5,3)), between=1),
                par.settings=list(col=colrs, lty=lty, cex=0.75, lwd=lwd,
                                  fontsize=list(text=9, points=5)),
                fg="gray", ylim=ylim, ylab=ylab, xlab=xlab)
  }
## Plot base graph; overlay with lattice graphs on same page
par(fig=c(0,1,0,1), mar=c(0,0,0,0))
plot(0:1,0:1, axes=FALSE, type="n", bty="n", xlab="", ylab="")
legend(x="top",legend=dsetnames[1:2], lty=1:2, lwd=c(1,0.75),
col=colrs[1:2], bty="n", ncol=2, yjust=0.75)
print(denplot(), position=c(0, 0, 0.32, 0.505), newpage=FALSE)
print(denplot(1:5, ylab=" ", col.axis="white"),
position=c(0.21, 0, .53, 0.505), newpage=FALSE)
print(denplot(ylab=" ", yvar="re78", col.axis="white"),
position=c(0.47, 0, 0.79, 0.505), newpage=FALSE)
print(denplot(1:5, ylab=" ", yvar="re78", col.axis="white"),
position=c(0.68, 0, 1, 0.505), newpage=FALSE)
## Age
print(denplot(yvar="age", takelog=FALSE, ylim=c(0,0.1), from=16,
at=c(.02,.04,.06,.08), labels=c(".02",".04",".06",".08")),
position=c(0, 0.475, 0.32, .98), newpage=FALSE)
print(denplot(1:5, yvar="age", takelog=FALSE, ylim=c(0,0.1), from=16,
at=c(.02,.04,.06,.08), labels=c(".02",".04",".06",".08"),
ylab=" ", col.axis="white"),
position=c(0.21, 0.475, .53, .98), newpage=FALSE)
## educ
print(denplot(1:5, yvar="educ", takelog=FALSE, ylim=c(0,0.5), bw=0.5,
at=c(.1,.2,.3,.4), ylab=" "),
position=c(0.47, 0.475, .79, .98), newpage=FALSE)
print(denplot(yvar="educ", takelog=FALSE, ylim=c(0,0.75), bw=0.5,
at=c(.1,.2,.3,.4), ylab=" ", col.axis="white"),
position=c(0.68, 0.475, 1, .98), newpage=FALSE)
addControl <-
  function(control, offset=30){
    nam <- deparse(substitute(control))
    if(nam=="nswdemo")nsw0 <- nswdemo else
      nsw0 <- rbind(control, subset(DAAG::nswdemo, trt==1))
    nsw0$z75 <- factor(nsw0$re75==0, labels=c("0",">0"))
    nsw0$ethnicid <- factor(with(nsw0, ifelse(black==1, "black",
                            ifelse(hisp==1, "hisp", "other"))),
                            levels=c("other","black","hisp"))
    nsw0 <- nsw0[, -match(c("black","hisp"), names(nsw0))]
    nsw0
  }
## Create dataset that will be used for later analyses
nsw <- addControl(psid1)
nsw <- within(nsw, {re75log <- log(re75+30);
                    re78log <- log(re78+30);
                    trt <- factor(trt, labels=c("Control","Treat"))})
## A treated values only dataset will be required below
trtdat <- subset(nsw, trt=="Treat")
trtdat$pres74 <- factor(!is.na(trtdat$re74), labels=c("<NA>","pres"))
table(trtdat$pres74)
with(trtdat, table(pres74,z75))
Subsection 9.7.2: Regression comparisons
Regression calculations
library(mgcv)
nsw.gam <- gam(log(re78+30) ~ trt + ethnicid + z75 + nodeg + s(age) +
               s(educ) + log(re75+30), data=nsw)
Subsection 9.7.3: The use of scores to replace covariates
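The code below is not from the original source; it is a minimal sketch of the idea behind this subsection, using the nsw data frame created above: summarize the covariates in a single propensity score, then use that score in place of the individual covariates in the outcome regression. The names introduced here (ps.glm, pscore) are illustrative only.
## Illustrative sketch only
ps.glm <- glm(trt ~ ethnicid + marr + z75 + age + educ + re75log,
              family=binomial(link="logit"), data=nsw)
nsw$pscore <- predict(ps.glm, type="link")   # score on the linear predictor scale
coef(summary(lm(re78log ~ trt + pscore, data=nsw)))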
Subsection 9.7.4: Two-dimensional representation using randomForest proximities
suppressPackageStartupMessages(library(randomForest))
form <- trt ~ age + educ + ethnicid + marr + nodeg + z75 + re75log
nsw.rf <- randomForest(form, data=nsw, sampsize=c(297,297))
p.rf <- predict(nsw.rf, type="prob")[,2]
sc.rf <- log((p.rf+0.001)/(1-p.rf+0.001))
omitn <- match(c("PropScore","weights","subclass"), names(dat2RF), nomatch=0)
matchISO.rf <- matchit(trt ~ age + educ + ethnicid + marr + nodeg + z75 +
                       re75log, ratio=1, data=dat2RF[,-omitn], distance=isoScores[,1])
## summary(match.rf,un=F,improvement=F)
## summary(match.rf, un=F, interactions=T, improvement=F)$sum.matched[,1:4]
## In the first place, look only at the first 4 columns
dat1RF <- match.data(matchISO.rf, distance="PropScore")
dat1RF.lm <- lm(re78log ~ trt, data = dat1RF, weights = weights)
library(sandwich)   # Allows use of `vcovCL()` from the `sandwich` package
lmtest::coeftest(dat1RF.lm, vcov. = vcovCL, cluster = ~subclass)
## Check for increase in number with non-zero earnings
dat1RF.glm <- glm(I(re78>0) ~ trt, data = dat1RF, weights = weights,
                  family=binomial)
lmtest::coeftest(dat1RF.glm, vcov. = vcovCL, cluster = ~subclass)
Derivation and investigation of scores
library(mgcv)
formG <- trt ~ ethnicid + marr + z75 + s(age) + s(educ) + s(re75log)
nsw.gam <- gam(formG, family=binomial(link="logit"), data=nsw)
pred <- predict(nsw.gam, type='response')
table(nsw$trt, round(pred))
## Alternative
library(splines)   ## Fit natural cubic splines using splines::ns()
formNS <- trt ~ ethnicid + marr + z75 + ns(age,2) +
          ns(educ) + ns(re75log,3)
nsw.glm <- glm(formNS, family=binomial(link="logit"), data=nsw)
pred <- predict(nsw.glm, type='response')
table(nsw$trt, round(pred))
cbind(AIC(nsw.glm,nsw.gam), BIC(nsw.glm, nsw.gam))
## Include factor by factor and variable interactions with ethnicid
## and marr (Result not shown)
formGx <- trt ~ (ethnicid+marr+z75)^2 + s(age, by=ethnicid) +
          s(educ, by=ethnicid) + s(re75log, by=ethnicid) +
          s(age, by=marr) + s(educ, by=marr) + s(re75log, by=marr)
nswx.gam <- gam(formula = formGx, data = nsw, family=binomial(link = "logit"))
predx <- predict(nswx.gam, type='response')
table(nsw$trt, round(predx))
AIC(nsw.glm,nsw.gam,nswx.gam)
library(MatchIt)
## Use data frame that omits re74. Otherwise matchit() will generate NAs
## where they occur in re74, even though re74 is not in the model formula.
<- nsw[, c("trt","age","educ","ethnicid", "marr","nodeg","z75",
nswG "re75log","re78log","re78")]
<- trt ~ ethnicid + marr+ z75 + s(age) + s(educ) + s(re75log)
formG <- matchit(formula = formG, data = nswG, method = "nearest",
match.gam distance = "gam", link = "logit", reestimate=TRUE)
<- match.data(match.gam, distance="PropScore")
datG ## Summary information
match.gam## summary(match.gam,un=F,improvement=F)
## summary(match.gam, un=F, interactions=T, improvement=F)$sum.matched[,1:4]
## In the first place, look only at the first 4 columns
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(cobalt))
gg1 <- cobalt::love.plot(match.gam, position="bottom", grid=TRUE,
                         star.char="", stars='raw') +
ggtitle("A: Differences from balance") +
theme(plot.title = element_text(hjust=0, vjust=0.5, size=11),
plot.margin=unit(c(9,15,0,9), 'pt'))
sub <- match(with(subset(datG, trt=="Control"), subclass),
             with(subset(datG, trt=="Treat"), subclass))
datGpaired <- cbind(subset(datG, trt=="Treat"),
                    with(subset(datG, trt=="Control")[sub,],
cbind("Cre78log"=re78log,"CPropScore"=PropScore)))
gg2 <- ggplot(datGpaired) +
  geom_point(aes(PropScore, I(re78log-Cre78log)), size=1) +
geom_smooth(aes(PropScore,I(re78log-Cre78log)), method = "gam",
formula = y ~ s(x, bs = "cs")) +
xlab("Propensity score for treated")+
ylab("Treatment vs control differences") +
ggtitle("B: Treatment vs control differences") +
theme(plot.title = element_text(hjust=0, vjust=0.5, size=11),
plot.margin=unit(c(9,9,0,15), 'pt'))
grid.arrange(gg1, gg2, ncol=2)
library(sandwich)
datG.lm <- lm(re78log ~ trt, data = datG, weights = weights)
## With 1:1 matching, the weights argument is not really needed
## Print first two coefficients only.
lmtest::coeftest(datG.lm, vcov. = vcovCL, cluster = ~subclass)[1:2,]
## Check number whose income was greater than 0
datG.glm <- glm(I(re78>0) ~ trt, data = datG, weights = weights, family=binomial)
lmtest::coeftest(datG.glm, vcov. = vcovCL, cluster = ~subclass)[1:2,]
Alternative matching approaches
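No code accompanies this heading in the source. The sketch below, which is not from the original, shows one alternative that matchit() supports: full matching on a logistic-regression propensity score, analyzed in the same way as the matched samples above. It assumes the nswG data frame created earlier; method="full" requires the optmatch package to be installed.
## Illustrative sketch only
form <- trt ~ age + educ + ethnicid + marr + nodeg + z75 + re75log
match.full <- matchit(formula=form, data=nswG, method="full", distance="glm")
summary(match.full, un=FALSE)
datFull <- match.data(match.full)
lmtest::coeftest(lm(re78log ~ trt, data=datFull, weights=weights),
                 vcov. = sandwich::vcovCL, cluster = ~subclass)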
Subsection 9.7.5: Coarsened exact matching
form <- trt ~ age + educ + ethnicid + marr + nodeg + z75 + re75log
match5.cem <- matchit(formula=form, data=nswG, method="cem", cutpoints=5)
datcem5 <- match.data(match5.cem)
match6.cem <- matchit(formula=form, data=nswG, method="cem", cutpoints=6)
datcem6 <- match.data(match6.cem)
## Show the effect of adding another cutpoint
match5.cem
match6.cem
library(sandwich)
datcem5.lm <- lm(re78log ~ trt, data = datcem5, weights = weights)
## The function vcovHC() provides heteroskedasticity-robust standard errors
lmtest::coeftest(datcem5.lm, vcov. = vcovHC)
## Estimate treatment effect on number with some earnings:
datcem6.glm <- glm(I(re78>0) ~ trt, data = datcem6, weights = weights,
                   family=binomial)
lmtest::coeftest(datcem6.glm, vcov. = vcovHC)
Section 9.8: Multiple imputation
suppressPackageStartupMessages(library(mice))
Boys <- with(subset(mice::boys, age>=9),
             data.frame(age=age, loghgt=log(hgt), logbmi=log(bmi), loghc=log(hc)))
(Pattern <- md.pattern(Boys, plot=F))
set.seed(31)   # Set to reproduce result shown
PatternB <- rbind(Pattern[-c(1,nrow(Pattern)), -ncol(Pattern)],
                  c(0,1,1,1), c(0,1,0,0), c(0,0,1,0))
boys <- rbind(ic(Boys),
              ampute(cc(Boys), pattern=PatternB, freq=c(.3,.15,.15,.2,.1,.1),
prop=0.75)$amp)
md.pattern(boys, plot=FALSE)
set.seed(17) # Set to reproduce result shown
out <- capture.output(   # Evaluate; send screen output to text string
  boys.mids <- mice(boys, method='pmm', m=8) )
impDFs <- complete(boys.mids, action='all')   # Returns a list of m=8 dataframes
## Average over imputed dataframes (use for exploratory purposes only)
impArray <- sapply(impDFs, function(x)as.matrix(x), simplify='array')
boysAv <- as.data.frame(apply(impArray, 1:2, mean))
## 1) Regression using the multiply imputed data, with results pooled
fits <- with(boys.mids, lm(logbmi~age+loghgt))
pool.coef <- summary(pool(fits))   # Include in table below
## 2) Regression that leaves out rows with NAs
omitNArows.coef <- coef(summary(lm(logbmi~age+loghgt, data=boys)))
## 3) Regression fit to average over data frames after imputation
boysAv.coef <- coef(summary(lm(logbmi~age+loghgt, data=boysAv)))
## 4) Fit to original data, in which 36 rows had missing values
Orig.coef <- coef(summary(lm(logbmi ~ age+loghgt, data=Boys)))
ctab <- cbind(summary(pool(fits))[,2:3], omitNArows.coef[,1:2], boysAv.coef[,1:2],
              Orig.coef[,1:2])
tab <- setNames(cbind(ctab[,c(1,3,5,7)], ctab[,c(2,4,6,8)]),
                paste0(rep(c('Est','SE'), c(4,4)), rep(1:4, 2)))
round(tab,3)
Time series cross-sectional data – an example
airquality <- datasets::airquality
airq <- cbind(airquality[, 1:4], day=1:nrow(airquality))
# 'day' (starting May 1) replaces columns 'Month' & 'Day'
## Replace `Ozone` with `rt4ozone`:
airq <- cbind(rt4ozone=airq$Ozone^0.25, airq[,-1])
## Generate the scatterplot matrix, now with `rt4ozone` replacing `Ozone`
smoothPars <- list(col.smooth='red', lty.smooth=2, spread=0)
car::spm(airq, cex.labels=1.2, regLine=FALSE, col='blue',
         oma=c(1.95,3,4,3), gap=.25, smooth=smoothPars)
airq.imp <- mice(airq, m=20, print=FALSE)
## Using 20 imputations shows up issues of concern very clearly
## Code for figure
out <- micemd::overimpute(airq.imp)
Section 9.9: Further reading
Section 9.10: Exercises
9.3
library(DAAG)
oz::oz(sections=c(3:5, 11:16))
names(possumsites)[1:2] <- c("long", "lat")
with(possumsites, {
points(long, lat);
text(long, lat, row.names(possumsites), pos=c(2,4,2,2,4,2,2))
})
9.7
data(wine, package='gclus')
mat <- with(wine,
            round(1-cor(cbind(Alcohol, Malic, Magnesium, Phenols, Flavanoids)),2))
colnames(mat) <- rownames(mat) <- 1:5
print(mat)
9.9a
`confusion` <-
function(actual, predicted, digits=4){
  tab <- table(actual, predicted)
  confuse <- apply(tab, 1, function(x)x/sum(x))
  print(round(confuse, digits))
  acc <- sum(tab[row(tab)==col(tab)])/sum(tab)
  invisible(print(c("Overall accuracy" = round(acc,digits))))
}
data(Vehicle, package="mlbench")
lhat <- MASS::lda(Class ~ ., data=Vehicle, CV=TRUE)$class
qhat <- MASS::qda(Class ~ ., data=Vehicle, CV=TRUE)$class
DAAG::confusion(Vehicle$Class, lhat)
DAAG::confusion(Vehicle$Class, qhat)
randomForest::randomForest(Class ~ ., data=Vehicle, CV=TRUE)
9.9c
Vehicle.lda <- MASS::lda(Class ~ ., data=Vehicle)
twoD <- predict(Vehicle.lda)$x
ggplot2::quickplot(twoD[,1], twoD[,2], color=Vehicle$Class,
                   geom=c("point","density2d"))
9.10
library(ape); library(MASS)
library(DAAGbio)
primates.dna <- as.DNAbin(primateDNA)
primates.dist <- dist.dna(primates.dna, model="K80")
primates.cmd <- cmdscale(primates.dist)
eqscplot(primates.cmd)
rtleft <- c(4,2,4,2)[unclass(cut(primates.cmd[,1], breaks=4))]
text(primates.cmd, labels=row.names(primates.cmd), pos=rtleft)
d <- dist(primates.cmd)
sum((d-primates.dist)^2)/sum(primates.dist^2)
9.11
library(DAAG)
pacific.dist <- dist(x = as.matrix(rockArt[-c(47,54,60,63,92), 28:641]),
                     method = "binary")
sum(pacific.dist==1)/length(pacific.dist)
plot(density(pacific.dist, to = 1))
## Check that all columns have at least one distance < 1
symmat <- as.matrix(pacific.dist)
table(apply(symmat, 2, function(x) sum(x<1)))
pacific.cmd <- cmdscale(pacific.dist)
pacific.sam <- sammon(pacific.dist)
9.15
Wine <- setNames(cbind(stack(wine, select=2:14), rep(wine[,1], 13)),
                 c("value", "measure", "Class"))
bwplot(measure ~ value, data=Wine)
wine.pr <- prcomp(wine[,-1], scale=TRUE)
round(wine.pr$sdev,2)
t(round(wine.pr$rotation[,1:2],2))
scores <- as.data.frame(cbind(predict(wine.pr), Class=wine[,1]))
xyplot(PC2 ~ PC1, groups=Class, data=scores, aspect='iso',
par.settings=simpleTheme(pch=16), auto.key=list(columns=3))
library(MASS)
wine.lda <- lda(Class ~ ., data=wine)
wineCV.lda <- lda(Class ~ ., data=wine, CV=T)
t(round(wine.lda$scaling,2))
tab <- table(wine$Class, wineCV.lda$class,
             dnn=c('Actual', 'Predicted'))
tab
setNames(round(1-sum(diag(tab))/sum(tab),4), "CV error rate")
scores <- as.data.frame(cbind(predict(wine.lda)$x, Class=wine[,1]))
xyplot(LD2 ~ LD1, groups=Class, data=scores, aspect='iso',
par.settings=simpleTheme(pch=16), auto.key=list(columns=3))
wine$Class <- factor(wine$Class)
wine.rf <- randomForest(x=wine[,-1], y=wine$Class)
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
  code <- knitr::knit_code$get()
  txt <- paste0("\n## ", names(code), "\n", sapply(code, paste, collapse='\n'))
  writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch9.R")
}