1 Learning from data
library(knitr)
opts_chunk[['set']](fig.width=6, fig.height=6, comment=" ",
                    out.width="80%", fig.align="center", fig.show='hold',
                    size="small", ps=10, strip.white = TRUE,
                    tidy.opts = list(replace.assign=FALSE))
## xtras=TRUE ## Set to TRUE to execute code 'extras'
xtras <- FALSE
## opts_chunk[['set']](results="asis")
## opts_chunk[['set']](eval=FALSE) ## Set to TRUE to execute main part of code
opts_chunk[['set']](eval=FALSE)
Packages required (plus any dependencies)
latticeExtra (lattice is a dependency); DAAG; car; MASS; AICcmodavg; BayesFactor; boot; MPV; ggplot2; tidyr
Additionally, knitr and Hmisc are required in order to process the Rmd source file. The prettydoc package is by default used to format the html output.
Section 1.1: Questions, and data that may point to answers
Subsection 1.1.1: A sample is a window into the wider population
## For the sequence below, precede with set.seed(3696)
set.seed(3696)
sample(1:9384, 12, replace=FALSE)  # NB: `replace=FALSE` is the default
chosen1200 <- sample(1:19384, 1200, replace=FALSE)
## For the sequence below, precede with set.seed(366)
set.seed(366)
split(sample(1:10), rep(c("Control","Treatment"), 5))
# sample(1:10) gives a random re-arrangement (permutation) of 1, 2, ..., 10
*A note on with-replacement samples
sample(1:10, replace=TRUE)
## sample(1:10, replace=FALSE) returns a random permutation of 1,2,...10
Subsection 1.1.2: Formulating the scientific question
Example: a question about cuckoo eggs
library(latticeExtra)  # The lattice package will be loaded and attached also
cuckoos <- DAAG::cuckoos
## Panel A: Dotplot without species means added
dotplot(species ~ length, data=cuckoos)  ## `species ~ length` is a 'formula'
## Panel B: Box and whisker plot
bwplot(species ~ length, data=cuckoos)
## The following shows Panel A, including species means & other tweaks
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
dotplot(species ~ length, data=cuckoos, alpha=0.4, xlab="Length of egg (mm)") +
  as.layer(dotplot(species ~ x, pch=3, cex=1.4, col="black", data=av))
# Use `+` to indicate that more (another 'layer') is to be added
# With `alpha=0.4`, points have 40% opacity; 60% of the background shows through
# `pch=3`: Plot character 3 is '+'; `cex=1.4`: default character size x 1.4
## Code
suppressPackageStartupMessages(library(latticeExtra, quietly=TRUE))
cuckoos <- DAAG::cuckoos
## For tidier labels, replace ".", in several of the species names, by a space
specnam <- with(cuckoos, sub(pattern=".", replacement=" ",
                             levels(species), fixed=TRUE))
# fixed=TRUE: interpret "." literally, not as 'any single character'
cuckoos <- within(cuckoos, levels(species) <- specnam)
## Panel A: Dotplot: data frame cuckoos (DAAG)
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
gphA <- dotplot(species ~ length, data=cuckoos, alpha=0.4) +
  as.layer(dotplot(species ~ x, pch=3, cex=1.4, col="black", data=av))
# alpha sets opacity. With alpha=0.4, 60% of the background shows through
# Enter `print(gphA)` or `plot(gphA)` or simply `gphA` to display the graph
## Panel B: Box plot
gphB <- bwplot(species ~ length, data=cuckoos)
update(c("A: Dotplot"=gphA, "B: Boxplot"=gphB), between=list(x=0.4),
       xlab="Length of egg (mm)")
## latticeExtra::c() joins compatible plots together.
## See `?latticeExtra::c`
Subsection 1.1.3: Planning for a statistical analysis
Subsection 1.1.4: Results that withstand thorough and informed challenge
Subsection 1.1.5: Using graphs to make sense of data
Subsection 1.1.6: Formal model-based comparison
options(width=70)
cuckoos <- DAAG::cuckoos
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
setNames(round(av[["x"]],2), abbreviate(av[["species"]],10))
with(cuckoos, scale(length[species=="wren"], scale=FALSE))[,1]
Section 1.2: Graphical tools for data exploration
Subsection 1.2.1: Displays of a single variable
library(latticeExtra, quietly=TRUE)
fossum <- subset(DAAG::possum, sex=="f")
femlen <- DAAG::bounce(fossum[["totlngth"]], d=0.1)
## Panel A
yaxpos <- c(0,5,10,15,20)/(5*nrow(fossum))
z <- boxplot(list(val = femlen), plot = FALSE)
gph1 <- bwplot(~femlen, ylim=c(0.55,2.75), xlim=c(70,100),
               scales=list(y=list(draw=FALSE))) +
  latticeExtra::layer(panel.rug(x, pch="|"))
legstat <- data.frame(x=c(z$out, z$stats), y=c(1.08, rep(1.3,5)),
                      tx=c("Outlier?", "Smallest value", "lower quartile", "median",
                           "upper quartile", "Largest value"),
                      tx2=c("", "(outliers excepted)", rep("",3), "(no outliers)"))
gphA <- gph1 + latticeExtra::layer(data=legstat,
  panel.text(x=x, y=y, labels=tx, adj=c(0,0.4), srt=90, cex=0.85),
  panel.text(x=x[c(2,6)]+0.75, y=c(1.125,1.38), labels=tx2[c(2,6)],
             adj=c(0,0.4), srt=90, cex=0.85))
## Panel B
gph2 <- densityplot(~femlen, ylim=c(0,0.108), xlim=c(70,100),
                    plot.points=TRUE, pch="|", cex=1.75, ylab=c(""," Density"))
gph3 <- histogram(~femlen, ylim=c(0,0.108), type="density",
                  scales=list(y=list(at=yaxpos, labels=c(0,5,10,15,20), col="gray40")),
                  alpha=0.5, ylab="", breaks=c(75,80,85,90,95,100),
                  col='transparent', border='gray40')
gph4 <- doubleYScale(gph2, gph3, use.style=FALSE, add.ylab2=FALSE)
gphB <- update(gph4, par.settings=list(fontsize=list(text=10, points=5)),
               scales=list(tck=c(0.5,0.5)))
update(c("B: Density curve, with histogram overlaid"=gphB,
         "A: Boxplot, with annotation added"=gphA, layout=c(1,2), y.same=FALSE),
       as.table=TRUE, between=list(y=1.4),
       xlab="Total length of female possums (cm)")
fossum <- subset(DAAG::possum, sex=="f")
densityplot(~totlngth, plot.points=TRUE, pch="|", data=fossum) +
  layer_(panel.histogram(x, type="density", breaks=c(75,80,85,90,95,100)))
Comparing univariate displays across factor levels
## Create boxplot graph object --- Simplified code
gph <- bwplot(Pop ~ totlngth | sex, data=DAAG::possum)
## Plot graph, with dotplot distribution of points below boxplots
gph + latticeExtra::layer(panel.dotplot(x, unclass(y)-0.4))
Subsection 1.2.2: Patterns in univariate time series
layout(matrix(c(1,2)), heights=c(2.6,1.75))
measles <- DAAG::measles
## Panel A:
par(mgp=c(2.0,0.5,0))
plot(log10(measles), xlab="", ylim=log10(c(1, 5000*540)),
     ylab=" Deaths", yaxt="n", fg="gray", adj=0.16)
londonpop <- ts(c(1088, 1258, 1504, 1778, 2073, 2491, 2921, 3336, 3881,
                  4266, 4563, 4541, 4498, 4408), start=1801, end=1931, deltat=10)
points(log10(londonpop*500), pch=16, cex=.5)
ytiks1 <- c(1, 10, 100, 1000)
axis(2, at=log10(ytiks1), labels=paste(ytiks1), lwd=0, lwd.ticks=1)
abline(h=log10(ytiks1), col = "lightgray", lwd=2)
par(mgp=c(-2,-0.5,0))
ytiks2 <- c(1000000, 5000000)  ## Tick positions for London population
abline(h=log10(ytiks2*0.5), col = "lightgray", lwd=1.5)
abline(v=seq(from=1650, to=1950, by=50), col = "lightgray", lwd = 1.5)
mtext(side=2, line=0.5, "Population", adj=1, cex=1.15, las=3)
axis(2, at=log10(ytiks2*0.6), labels=paste(ytiks2), tcl=0.3,
     hadj=0, lwd=0, lwd.ticks=1)
mtext(side=3, line=0.3, "A (1629-1939)", adj=0, cex=1.15)
##
## Panel B: window from 1840 to 1882
par(mgp=c(2.0,0.5,0))
plot(window(measles, start=1840, end=1882), xlab="",
     ylab="Deaths; Pop (1000s)", ylim=c(0, 4200), fg="gray")
points(window(londonpop, start=1840, end=1882), pch=16, cex=0.5)
mtext(side=3, line=0.5, "B (1841-1881)", adj=0, cex=1.15)
Subsection 1.2.3: Visualizing relationships between pairs of variables
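A scatterplot is the workhorse display here. The following is a minimal sketch (not from the book's code), using two body measurements from the possum data that reappear later in this chapter:
## Scatterplot of chest girth against belly girth: DAAG::possum
plot(chest ~ belly, data=DAAG::possum,
     xlab="Belly girth (cm)", ylab="Chest girth (cm)")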
Subsection 1.2.4: Response lines (and/or curves)
par(pty="s")
plot(distance.traveled ~ starting.point, data=DAAG::modelcars, fg="gray",
xlim=c(0,12.5), xaxs="i", xlab = "Distance up ramp (cm)",
ylab="Distance traveled (cm)")
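A fitted line can then be overlaid for reference. A minimal sketch, adding a least-squares line to the plot just drawn (the book may instead fit a different line or curve):
## Overlay a least-squares line on the model cars plot
abline(lm(distance.traveled ~ starting.point, data=DAAG::modelcars))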
Subsection 1.2.5: Multiple variables and times
## Apply function range to columns of data frame jobs (DAAG)
sapply(DAAG::jobs, range) ## NB: `BC` = British Columbia
## Panel A: Basic plot; all series in a single panel; use log y-scale
formRegions <- Ontario+Quebec+BC+Alberta+Prairies+Atlantic ~ Date
basicGphA <-
  xyplot(formRegions, outer=FALSE, data=DAAG::jobs, type="l", xlab="",
         ylab="Number of workers", scales=list(y=list(log="e")),
         auto.key=list(space="right", lines=TRUE, points=FALSE))
## `outer=FALSE`: plot all columns in one panel
## Panel B: Separate panels (`outer=TRUE`); sliced log scale
basicGphB <-
  xyplot(formRegions, data=DAAG::jobs, outer=TRUE, type="l", layout=c(3,2),
         xlab="", ylab="Number of workers",
         scales=list(y=list(relation="sliced", log=TRUE)))
# Provinces are in order of number of workers in Dec96
## Create improved x- and y-axis tick labels; the graphs will be updated to use them
datelabpos <- seq(from=95, by=0.5, length=5)
datelabs <- format(seq(from=as.Date("1Jan1995", format="%d%b%Y"),
                       by="6 month", length=5), "%b%y")
## Now create $y$-labels that have numbers, with log values underneath
ylabposA <- exp(pretty(log(unlist(DAAG::jobs[,-7])), 5))
ylabelsA <- paste(round(ylabposA),"\n(", log(ylabposA), ")", sep="")
## Repeat, now with 100 ticks, to cover all 6 slices of the scale
ylabposB <- exp(pretty(log(unlist(DAAG::jobs[,-7])), 100))
ylabelsB <- paste(round(ylabposB),"\n(", log(ylabposB), ")", sep="")
gphA <- update(basicGphA, scales=list(x=list(at=datelabpos, labels=datelabs),
                                      y=list(at=ylabposA, labels=ylabelsA)))
gphB <- update(basicGphB, xlab="", between=list(x=0.25, y=0.25),
               scales=list(x=list(at=datelabpos, labels=datelabs),
                           y=list(at=ylabposB, labels=ylabelsB)))
layout.list <- list(layout.heights=list(top.padding=0,
                                        bottom.padding=0, sub=0, xlab=0),
                    fontsize=list(text=8, points=5))
jobstheme <- modifyList(ggplot2like(pch=1, lty=c(4:6,1:3),
                                    col.line='black', cex=0.75), layout.list)
print(update(gphA, par.settings=jobstheme, axis=axis.grid,
             main=list("A: Same vertical log scale", y=0)),
      position=c(0.1,0.615,0.9,1), newpage=TRUE)
print(update(gphB, par.settings=jobstheme, axis=axis.grid,
             main=list("B: Sliced vertical log scale", y=0)),
      position=c(0,0,1,0.625), newpage=FALSE)
plot(c(1230,1860), c(0, 10.5), axes=FALSE, bty="n",
     xlab="", ylab="", type="n", log="x")
xpoints <- c(1366, 1436, 1752, 1840)
axis(1, at=xpoints, labels=FALSE, tck=0.01, lty=1, lwd=0, lwd.ticks=1)
for(i in 1:4){
  axis(1, at=xpoints[i],
       labels=substitute(italic(a), list(a=paste(xpoints[i]))),
       line=-2.25, lty=0, cex=0.8, lwd=0, lwd.ticks=1)
  lines(rep(xpoints[i],2), c(0, 0.15*par()[["cxy"]][2]), lty=1)
}
axpos <- 1250*cumprod(c(1, rep(1.2,2)))
axis(1, at=c(axpos,1840), labels=FALSE, lwd.ticks=0)
lab <- round(axpos)
axis(1, at=axpos, labels=lab)
lab2 <- lapply(round(log2(xpoints),3), function(x)substitute(2^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(lab2), line=-3.5, lwd=0)
labe <- lapply(format(round(log(xpoints),3)), function(x)substitute(e^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(labe), line=-5, lwd=0)
lab10 <- lapply(round(log10(xpoints),3), function(x)substitute(10^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(lab10), line=-6.5, lwd=0)
par(family="mono", xpd=TRUE)
axis(1, at=1220, labels="log=2", line=-3.5, hadj=0, lwd=0)
axis(1, at=1220, labels='log="e"', line=-5, hadj=0, lwd=0)
axis(1, at=1220, labels="log=10", line=-6.5, hadj=0, lwd=0)
wid2 <- strwidth("log=2")
par(family="sans")
Subsection 1.2.6: *Labeling technicalities
Subsection 1.2.7: Graphical displays for categorical data
stones <- array(c(81,6,234,36,192,71,55,25), dim=c(2,2,2),
                dimnames=list(Success=c("yes","no"),
                              Method=c("open","ultrasound"), Size=c("<2cm", ">=2cm")))
margin12 <- margin.table(stones, margin=1:2)
byMethod <- 100*prop.table(margin12, margin=2)
pcGood <- 100*prop.table(stones, margin=2:3)["yes", , ]
dimnam <- dimnames(stones)
numOps <- margin.table(stones, margin=2:3)
opStats <- data.frame(Good=c(pcGood[1,], pcGood[2,]),
                      numOps=c(numOps[1,], numOps[2,]),
                      opType=factor(rep(dimnam[["Method"]], c(2,2))),
                      Size=rep(dimnam[["Size"]], 2))
xlim <- range(opStats$Good)*c(0.65, 1.015)
ylim <- c(0, max(opStats$numOps)*1.15)
plot(numOps ~ Good, data=opStats, type="h", lwd=4, xlim=xlim, ylim=ylim,
     fg="gray", col=rep(c("blue","red"), rep(2,2)),
     xlab="Success rate (%)", ylab="Number of operations")
# with(opStats, text(numOps~Good, labels=Size,
#                    col=rep(c('blue','red'), rep(2,2)),
#                    offset=0.25, pos=3, cex=0.75))
labpos <- lapply(split(opStats, opStats$Size),
                 function(x)apply(x[,1:2], 2, function(z)c(z[1], mean(z), z[2])))
sizeNam <- names(labpos)
lapply(labpos, function(x)lines(x[,'Good'], x[,'numOps']+c(0,35,0),
                                type="c", col="gray"))
txtmid <- sapply(labpos, function(x)c(x[2,'Good'], x[2,'numOps']+35))
text(txtmid[1,]+c(-1.4,0.85), txtmid[2,], labels=sizeNam, col="gray40",
     pos=c(4,2), offset=0)
par(xpd=TRUE)
text(byMethod[1,1:2], rep(par()$usr[4],2)+0.5*strheight("^"), labels=c("^","^"),
     col=c("blue","red"), cex=1.2, srt=180)
text(byMethod[1,], par()$usr[4]+1.4*strheight("A"),
     labels=paste(round(byMethod[1,],1)), cex=0.85)
text(byMethod[1,1:2]+c(3.5,-3.5), rep(par()$usr[4],2)+2.65*strheight("A"),
     labels=c("All open","All ultrasound"), pos=c(2,4))
par(xpd=FALSE)
abline(h=100*(0:2), col="lightgray", lwd=0.5)
abline(v=10*(5:9), col="lightgray", lwd=0.5)
legend("topleft", col=c('blue','red'), lty=c(1,1), lwd=1, cex=0.9,
       y.intersp=0.75, legend=c("Open","Ultrasound"), bty="n",
       inset=c(-0.01,-0.01))
Subsection 1.2.8: What to look for in plots
Section 1.3: Data Summary
Subsection 1.3.1: Counts
## Table of counts example: data frame nswpsid1 (DAAG)
## Specify `useNA="ifany"` to ensure that any NAs are tabulated
tab <- with(DAAG::nswpsid1, table(trt, nodeg, useNA="ifany"))
dimnames(tab) <- list(trt=c("none", "training"), educ=c("completed", "dropout"))
tab
Tabulation that accounts for frequencies or weights – the xtabs() function
nassCDS <- DAAG::nassCDS
gph <- lattice::bwplot(log(nassCDS$weight+1), xlab="Inverse sampling weights",
         scales=list(x=list(at=c(0, log(c(10^(0:5)+1))), labels=c(0, 10^(0:5)))))
update(gph, par.settings=DAAG::DAAGtheme(color=FALSE, col.points='gray50'))
sampNum <- table(nassCDS$dead)
popNum <- as.vector(xtabs(weight ~ dead, data=nassCDS))
rbind(Sample=sampNum, "Total number"=round(popNum,1))
Atab <- xtabs(weight ~ airbag + dead, data=nassCDS)/1000
## Define a function that calculates deaths per 1000
DeadPer1000 <- function(x)1000*x[2]/sum(x)
Atabm <- ftable(addmargins(Atab, margin=2, FUN=DeadPer1000))
print(Atabm, digits=2, method="compact", big.mark=",")
SAtab <- xtabs(weight ~ seatbelt + airbag + dead, data=nassCDS)
## SAtab <- addmargins(SAtab, margin=3, FUN=list(Total=sum)) ## Get totals
SAtabf <- ftable(addmargins(SAtab, margin=3, FUN=DeadPer1000), col.vars=3)
print(SAtabf, digits=2, method="compact", big.mark=",")
FSAtab <- xtabs(weight ~ dvcat + seatbelt + airbag + dead, data=nassCDS)
FSAtabf <- ftable(addmargins(FSAtab, margin=4, FUN=DeadPer1000), col.vars=3:4)
print(FSAtabf, digits=1)
Subsection 1.3.2: Summaries of information from data frames
## Individual vine yields, with means by block and treatment overlaid
kiwishade <- DAAG::kiwishade
kiwishade$block <- factor(kiwishade$block, levels=c("west","north","east"))
keyset <- list(space="top", columns=2,
               text=list(c("Individual vine yields", "Plot means (4 vines)")),
               points=list(pch=c(1,3), cex=c(1,1.35), col=c("gray40","black")))
panelfun <- function(x, y, ...){panel.dotplot(x, y, pch=1, ...)
  av <- sapply(split(x,y), mean); ypos <- unique(y)
  lpoints(ypos ~ av, pch=3, col="black")}
dotplot(shade ~ yield | block, data=kiwishade, col="gray40", aspect=0.65,
        panel=panelfun, key=keyset, layout=c(3,1))
## Note that parameter settings were given both in the calls
## to the panel functions and in the list supplied to key.
## Mean yield by block by shade: data frame kiwishade (DAAG)
kiwimeans <- with(DAAG::kiwishade,
                  aggregate(yield, by=list(block, shade), mean))
names(kiwimeans) <- c("block","shade","meanyield")
head(kiwimeans, 4)  # First 4 rows
Subsection 1.3.3: Measures of variation
Cuckoo eggs example
options(width=72)
## SD of length, by species: data frame cuckoos (DAAG)
z <- with(cuckoos, sapply(split(length, species), function(x)c(sd(x), length(x))))
print(setNames(paste0(round(z[1,],2), " (", z[2,], ")"),
               abbreviate(colnames(z),11)), quote=FALSE)
Subsection 1.3.4: Inter-quartile range (IQR) and median absolute deviation (MAD)
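A minimal sketch for this subsection, using the wren lengths from DAAG::cuckoos: IQR() returns the inter-quartile range, while mad() returns the median absolute deviation, by default scaled by 1.4826 so that it estimates the SD for normally distributed data.
## IQR and MAD for the wren cuckoo-egg lengths
wrenlen <- subset(DAAG::cuckoos, species=="wren")[, "length"]
c(IQR=IQR(wrenlen), MAD=mad(wrenlen))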
Subsection 1.3.5: A pooled standard deviation estimate
Elastic bands example
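A minimal sketch of a pooled standard deviation estimate, assuming the `ambient` and `heated` components of DAAG::two65 that are used later in this chapter:
two65 <- DAAG::two65
n1 <- length(two65$ambient); n2 <- length(two65$heated)
s1 <- sd(two65$ambient); s2 <- sd(two65$heated)
## Pooled SD: weight the two variances by their degrees of freedom
(sp <- sqrt(((n1-1)*s1^2 + (n2-1)*s2^2)/(n1+n2-2)))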
Subsection 1.3.6: Effect size
setNames(diff(c(ambient=244.1, heated=253.5))/c(sd=10.91), "Effect size")
Data are available in the data frame DAAG::two65.
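Equivalently, a sketch that computes the effect size directly from the data, using a pooled SD as in the previous subsection:
with(DAAG::two65, {
  n1 <- length(ambient); n2 <- length(heated)
  sp <- sqrt(((n1-1)*var(ambient) + (n2-1)*var(heated))/(n1+n2-2))
  (mean(heated) - mean(ambient))/sp  # Difference in means, in SD units
})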
vignette('effectsize', package='effectsize')
Subsection 1.3.7: Correlation
set.seed(17)
x1 <- x2 <- x3 <- (11:30)/5
y1 <- x1 + rnorm(20, sd=0.5)
y2 <- 2 - 0.05 * x1 + 0.1 * ((x1 - 1.75))^4 + rnorm(20, sd=1.5)
y3 <- (x1 - 3.85)^2 + 0.015 + rnorm(20)
theta <- ((2 * pi) * (1:20))/20
x4 <- 10 + 4 * cos(theta)
y4 <- 10 + 4 * sin(theta) + rnorm(20, sd=0.6)
xy <- data.frame(x = c(rep(x1, 3), x4), y = c(y1, y2, y3, y4),
                 gp = factor(rep(1:4, rep(20, 4))))
xysplit <- split(xy, xy$gp)
rho <- sapply(xysplit, function(z)with(z, cor(x, y, method="pearson")))
rhoS <- sapply(xysplit, function(z)with(z, cor(x, y, method="spearman")))
rnam <- as.list(setNames(round(c(rho,rhoS),2), paste0("r",1:8)))
striplabs <- bquote(expression(paste(r==.(r1), "  ", r[s]==.(r5)),
                               paste(r==.(r2), "  ", r[s]==.(r6)),
                               paste(r==.(r3), "  ", r[s]==.(r7)),
                               paste(r==.(r4), "  ", r[s]==.(r8))), rnam)
xyplot(y ~ x | gp, data=xy, layout=c(4,1), xlab="", ylab="",
       strip=strip.custom(factor.levels=striplabs), aspect=1,
       scales=list(relation='free', draw=FALSE), between=list(x=0.5, y=0))
Section 1.4: Distributions: quantifying uncertainty
Subsection 1.4.1: Discrete distributions
## dbinom(0:10, size=10, prob=0.15)
setNames(round(dbinom(0:10, size=10, prob=0.15), 3), 0:10)
pbinom(q=4, size=10, prob=0.15)
qbinom(p = 0.70, size = 10, prob = 0.15)
## Check that this lies between the two cumulative probabilities:
## pbinom(q = 1:2, size=10, prob=0.15)
rbinom(15, size=4, p=0.5)
## dpois(x = 0:8, lambda = 3)
setNames(round(dpois(x = 0:8, lambda = 3),4), 0:8)
## Probability of > 8 raisins
## 1-ppois(q = 8, lambda = 3) ## Or, ppois(q=8, lambda=3, lower.tail=FALSE)
1 - ppois(q = 8, lambda = 3)
ppois(q=8, lambda=3, lower.tail=FALSE) ## Alternative
1-sum(dpois(x = 0:8, lambda = 3)) ## Another alternative
raisins <- rpois(20, 3)
raisins
Initializing the random number generator
set.seed(23286) # Use to reproduce the sample below
rbinom(15, size=1, p=0.5)
Subsection 1.4.2: Continuous distributions
z <- seq(-3, 3, length=101)
plot(z, dnorm(z), type="l", ylab="Normal density",
yaxs="i", bty="L", tcl=-0.3, fg="gray",
xlab="Distance, in SDs, from mean", cex.lab=0.9)
polygon(c(z[z <= 1.0],1.0),c(dnorm(z[z <= 1.0]), dnorm(-3)), col="grey")
chh <- par()$cxy[2]
arrows(-1.8, 0.32, -0.25, 0.2, length=0.07, xpd=TRUE)
cump <- round(pnorm(1), 3)
text(-1.8, 0.32+0.75*chh, paste("pnorm(1)\n", "=", cump), xpd=TRUE, cex=0.8)
pnormExs <- c('pnorm(0)', 'pnorm(1)', 'pnorm(-1.96)', 'pnorm(1.96)',
              'pnorm(1.96, mean=2)', 'pnorm(1.96, sd=2)')
Prob <- sapply(pnormExs, function(x)eval(parse(text=x)))
df <- as.data.frame(Prob)
df$Prob <- round(df$Prob, 3)
print(df)
## Plot the normal density, in the range -3 to 3
z <- pretty(c(-3,3), 30)  # Find ~30 equally spaced points
ht <- dnorm(z)            # Equivalent to dnorm(z, mean=0, sd=1)
plot(z, ht, type="l", xlab="Normal variate", ylab="Density", yaxs="i")
# yaxs="i" locates the axes at the limits of the data
qnorm(.9) # 90th percentile; mean=0 and SD=1
## Additional examples:
setNames(qnorm(c(.5,.841,.975)), nm=c(.5,.841,.975))
qnorm(c(.1,.2,.3)) # -1.282 -0.842 -0.524 (10th, 20th and 30th percentiles)
qnorm(.1, mean=100, sd=10) # 87.2 (10th percentile, mean=100, SD=10)
Generating simulated samples from the normal and other continuous distributions
options(digits=2) # Suggest number of digits to display
rnorm(10) # 10 random values from the normal distribution
mu <- 10; sigma <- 1; n <- 1; m <- 50; four <- 4
nrep <- 5; seed <- 21; totrows <- 1
if(is.null(totrows))
  totrows <- floor(sqrt(nrep))
totcols <- ceiling(nrep/totrows)
z <- range(pretty(mu + (c(-3.4, 3.4) * sigma), 50))
xy <- data.frame(x=rep(0,nrep), y=rep(0,nrep), n=rep(n,nrep),
                 mm=rep(m,nrep), four=rep(four,nrep))
lev <- paste("Simulation", 1:nrep)
fac <- factor(lev, levels=lev)
xlim <- z
## ylim <- c(0, dnorm(0)*sqrt(n))
ylim <- c(0,1)
xy <- split(xy, fac)
xy <- lapply(1:length(xy), function(i){c(as.list(xy[[i]]),
                                         list(xlim=xlim, ylim=ylim))})
panel.mean <- function(data, mu = 10, sigma = 1, n2 = 1,
                       mm = 100, nrows, ncols, ...)
{
  vline <- function(x, y, lty = 1, col = 1)
    lines(c(x, x), c(0, y), lty = lty, col = col)
  n2 <- data$n[1]
  mm <- data$mm[1]
  four <- data$four[1]  ## Four characters in each unit interval of x
  nmid <- round(four * 4)
  nn <- array(0, 2 * nmid + 1)
  z <- mu + seq(from=-3.4*sigma, to=3.4*sigma, length=mm)
  atx <- pretty(z)
  qz <- pnorm((z - mu)/sigma)
  dz <- dnorm((z - mu)/sigma)
  chw <- sigma/four
  chh <- strheight("O")*0.75
  htfac <- (mm * chh)/four
  if(nrows==1 && ncols==1)
    lines(z, dz * htfac)
  if(nrows==1) axis(1, at=atx, lwd=0, lwd.ticks=1)
  y <- rnorm(mm, mu, sigma/sqrt(n2))
  pos <- round((y - mu)/sigma * four)
  for(i in 1:mm) {
    nn[nmid + pos[i]] <- nn[nmid + pos[i]] + 1
    xpos <- chw * pos[i]
    text(mu + xpos, nn[nmid + pos[i]] * chh - chh/4, "x")
  }
}
DAAG::panelplot(xy, panel=panel.mean, totrows=totrows, totcols=totcols,
                oma=c(1.5, 0, rep(0.5,2)), fg='gray')
## The following gives a conventional histogram representation:
set.seed(21)  # Use to reproduce the data in the figure
df <- data.frame(x=rnorm(250), gp=rep(1:5, rep(50,5)))
lattice::histogram(~x|gp, data=df, layout=c(5,1))
runif(n = 20, min=0, max=1) # 20 numbers, uniform distn on (0, 1)
rexp(n=10, rate=3) # 10 numbers, exponential, mean 1/3.
Subsection 1.4.3: Graphical checks for normality
tab <- t(as.matrix(DAAG::pair65))
rbind(tab, "heated-ambient"=tab[1,]-tab[2,])
## Normal quantile-quantile plot for heated-ambient differences,
## compared with plots for random normal samples of the same size
plt <- with(DAAG::pair65, DAAG::qreference(heated-ambient, nrep=10, nrows=2))
update(plt, scales=list(tck=0.4), xlab="")
Subsection 1.4.4: Population parameters and sample statistics
The sampling distribution of the mean
library(lattice)
## Generate n sample values; skew population
sampfun <- function(n) exp(rnorm(n, mean = 0.5, sd = 0.3))
gph <- DAAG::sampdist(sampsize = c(3, 9, 30), seed = 23, nsamp = 1000,
                      FUN = mean, sampvals = sampfun, plot.type = "density")
samptheme <- DAAG::DAAGtheme(color=FALSE)
print(update(gph, scales=list(tck=0.4), layout = c(3,1),
             par.settings=samptheme, main=list("A: Density curves", cex=1.25)),
      position=c(0,0.5,1,1), more=TRUE)
sampfun <- function(n) exp(rnorm(n, mean = 0.5, sd = 0.3))
gph <- DAAG::sampdist(sampsize = c(3, 9, 30), seed = 23, nsamp = 1000,
                      FUN = mean, sampvals = sampfun, plot.type = "qq")
print(update(gph, scales=list(tck=0.4), layout = c(3,1),
             par.settings=samptheme,
             main=list("B: Normal quantile-quantile plots", cex=1.25)),
      position=c(0,0,1,0.5))
Subsection 1.4.5: The \(t\)-distribution
x <- seq(from=-4.2, to=4.2, length.out=50)
ylim <- c(0, dnorm(0))
ylim[2] <- ylim[2] + 0.1*diff(ylim)
h1 <- dnorm(x)
h3 <- dt(x, 3)
h8 <- dt(x, 8)
h8 plot(x, h1, type="l", xlab = "", xaxs="i", ylab = "", yaxs="i",
bty="L", ylim=ylim, fg="gray")
mtext(side=3,line=0.5, "A: Normal (t8 overlaid)", adj=-0.2)
lines(x, h8, col="grey60")
mtext(side=1, line=1.75, "No. of SEMs from mean")
mtext(side=2, line=2.0, "Probability density")
chh <- par()$cxy[2]
topleft <- par()$usr[c(1,4)] + c(0, 0.6*chh)
legend(topleft[1], topleft[2], col=c("black","grey60"),
lty=c(1,1), legend=c("Normal","t (8 d.f.)"), bty="n", cex=0.8)
plot(x, h1, type="l", xlab = "", xaxs="i",
ylab = "", yaxs="i", bty="L", ylim=ylim, fg="gray")
mtext(side=3,line=0.5, "B: Normal (t3 overlaid)", adj=-0.2)
lines(x, h3, col="grey60")
mtext(side=1, line=1.75, "No. of SEMs from mean")
## mtext(side=2, line=2.0, "Probability density")
legend(topleft[1], topleft[2], col=c("black","grey60"),
lty=c(1,1), legend=c("Normal","t (3 d.f.)"), bty="n", cex=0.8)
## Panels C and D
cump <- 0.975
x <- seq(from=-3.9, to=3.9, length.out=50)
ylim <- c(0, dnorm(0))
plotfun <- function(cump, dfun = dnorm, qfun = qnorm,
                    ytxt = "Probability density",
                    txt1="qnorm", txt2="", ...)
{
  h <- dfun(x)
  plot(x, h, type="l", xlab = "", xaxs="i", xaxt="n",
       ylab = ytxt, yaxs="i", bty="L", ylim=ylim, fg="gray", ...)
  axis(1, at=c(-2, 0), cex=0.8, lwd=0, lwd.ticks=1)
  axis(1, at=c((-3):3), labels=FALSE, lwd=0, lwd.ticks=1)
  tailp <- 1-cump
  z <- qfun(cump)
  ztail <- pretty(c(z,4), 20)
  htail <- dfun(ztail)
  polygon(c(z,z,ztail,max(ztail)), c(0,dfun(z),htail,0), col="gray")
  text(0, 0.5*dfun(z)+0.08*dfun(0),
       paste(round(tailp, 3), " + ", round(1-2*tailp,3),
             "\n= ", round(cump, 3), sep=""), cex=0.8)
  lines(rep(z, 2), c(0, dfun(z)))
  lines(rep(-z, 2), c(0, dfun(z)), col="gray60")
  chh <- par()$cxy[2]
  arrows(z, -1.5*chh, z, -0.1*chh, length=0.1, xpd=TRUE)
  text(z, -2.5*chh, paste(txt1, "(", cump, txt2, ")", "\n= ",
                          round(z,2), sep=""), xpd=TRUE)
  x1 <- z + .3
  y1 <- dfun(x1)*0.35
  y0 <- dfun(0)*0.2
  arrows(-2.75, y0, -x1, y1, length=0.1, col="gray60")
  arrows(2.75, y0, x1, y1, length=0.1)
  text(-2.75, y0+0.5*chh, tailp, col="gray60")
  text(2.75, y0+0.5*chh, tailp)
}
## ytxt <- "t probability density (8 d.f.)"
plotfun(cump=cump, cex.lab=1.05)
mtext(side=3, line=1.25, "C: Normal distribution", adj=-0.2)
ytxt <- "t probability density (8 d.f.)"
plotfun(cump=cump, dfun=function(x)dt(x, 8),
        qfun=function(x)qt(x, 8),
        ytxt="", txt1="qt", txt2=", 8", cex.lab=1.05)
mtext(side=3, line=1.25, "D: t distribution (8 df)", adj=-0.2)
qnorm(c(0.975,0.995), mean=0) # normal distribution
qt(c(0.975, 0.995), df=8) # t-distribution with 8 d.f.
Subsection 1.4.6: The likelihood, and maximum likelihood estimation
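A minimal sketch of maximum likelihood estimation (not the book's code): minimize the negative log-likelihood of a normal model for the heated-ambient differences in DAAG::pair65.
d <- with(DAAG::pair65, heated - ambient)
negloglik <- function(par) -sum(dnorm(d, mean=par[1], sd=par[2], log=TRUE))
optim(c(mean(d), sd(d)), negloglik)$par
# The ML estimate of the mean is the sample mean; the ML estimate of
# the SD uses the divisor n rather than n-1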
Section 1.5: Simple forms of regression model
Subsection 1.5.1: Line or curve?
roller <- DAAG::roller
t(cbind(roller, "depression/weight ratio"=round(roller[,2]/roller[,1],2)))
Using models to predict
y <- DAAG::roller$depression
x <- DAAG::roller$weight
pretext <- c(reg = "A", lo = "B")
for(curve in c("reg", "lo")) {
  plot(x, y, xlab = "Roller weight (t)", xlim=c(0,12.75), fg="gray",
       ylab = "Depression in lawn (mm)", type="n")
  points(x, y, cex=0.8, pch = 4)
  mtext(side = 3, line = 0.25, pretext[curve], adj = 0)
  topleft <- par()$usr[c(1, 4)]
  chw <- strwidth("O"); chh <- strheight("O")
  points(topleft[1]+rep(0.75,2)*chw, topleft[2]-c(0.75,1.8)*chh,
         pch=c(4,1), col=c("black","gray40"), cex=0.8)
  text(topleft[1]+rep(1.2,2)*chw, topleft[2]-c(0.75,1.8)*chh,
       c("Data values", "Fitted values"), adj=0, cex=0.8)
  if(curve=="lo")
    text(topleft[1]+1.2*chw, topleft[2]-2.85*chh, "(smooth)", adj=0, cex=0.8)
  if(curve[1] == "reg") {
    u <- lm(y ~ -1 + x)
    abline(0, u$coef[1])
    yhat <- predict(u)
  } else {
    lawn.lm <- lm(y ~ x + I(x^2))
    yhat <- predict(lawn.lm)
    xnew <- pretty(x, 20)
    b <- lawn.lm$coef
    ynew <- b[1] + b[2]*xnew + b[3]*xnew^2
    lines(xnew, ynew)
  }
  here <- y < yhat
  yyhat <- as.vector(rbind(y[here], yhat[here], rep(NA, sum(here))))
  xx <- as.vector(rbind(x[here], x[here], rep(NA, sum(here))))
  lines(xx, yyhat, lty = 2, col="gray")
  here <- y > yhat
  yyhat <- as.vector(rbind(y[here], yhat[here], rep(NA, sum(here))))
  xx <- as.vector(rbind(x[here], x[here], rep(NA, sum(here))))
  lines(xx, yyhat, lty = 1, col="gray")
  n <- length(y)
  ns <- min((1:n)[y - yhat >= 0.75*max(y - yhat)])
  ypos <- 0.5 * (y[ns] + yhat[ns])
  chw <- par()$cxy[1]
  text(x[ns] - 0.25*chw, ypos, "+ve residual", adj = 1, cex=0.75, col="gray30")
  points(x, yhat, pch = 1, col="gray40")
  ns <- (1:n)[y - yhat == min(y - yhat)][1]
  ypos <- 0.5 * (y[ns] + yhat[ns])
  text(x[ns] + 0.4*chw, ypos, "-ve residual", adj = 0, cex=0.75, col="gray30")
}
Subsection 1.5.2: Fitting models – the model formula
## Fit line - by default, this fits intercept & slope
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
## Compare with the code used to plot the data
plot(depression ~ weight, data=DAAG::roller)
## Add the fitted line to the plot
abline(roller.lm)
## For a model that omits the intercept term, specify
lm(depression ~ 0 + weight, data=DAAG::roller)  # Or, if preferred, replace `0` by `-1`
Model objects
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
names(roller.lm)          # Get names of list elements
coef(roller.lm)           # Extract coefficients
summary(roller.lm)        # Extract model summary information
coef(summary(roller.lm))  # Extract coefficients and SEs
fitted(roller.lm)         # Extract fitted values
predict(roller.lm)        # Predictions for existing or new data, with SE
                          # or confidence interval information if required
resid(roller.lm)          # Extract residuals
roller.lm$coef            # An alternative is roller.lm[["coefficients"]]
print(summary(roller.lm), digits=3)
Residual plots
## Normal quantile-quantile plot, plus 7 reference plots
DAAG::qreference(residuals(roller.lm), nrep=8, nrows=2, xlab="")
Simulation of regression data
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
roller.sim <- simulate(roller.lm, nsim=20)  # 20 simulations
with(DAAG::roller, matplot(weight, roller.sim, pch=1, ylim=range(depression)))
points(DAAG::roller, pch=16)
Subsection 1.5.3: The model matrix in regression
model.matrix(roller.lm)
## Specify coef(roller.lm) to obtain the column multipliers.
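As a check, the fitted values can be reconstructed as the model matrix multiplied by the coefficient vector:
## Differences should be zero, apart from rounding error
range(model.matrix(roller.lm) %*% coef(roller.lm) - fitted(roller.lm))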
From straight line regression to multiple regression
mouse.lm <- lm(brainwt ~ lsize+bodywt, data=DAAG::litters)
coef(summary(mouse.lm))
Section 1.6: Data-based judgments – frequentist, in a Bayesian world
Subsection 1.6.1: Inference with known prior probabilities
## `prevalence` is the prior ('before the test') probability
after <- function(prevalence, sens, spec){
  prPos <- sens*prevalence + (1-spec)*(1-prevalence)
  sens*prevalence/prPos}
sens## Compare posterior for a prior of 0.002 with those for 0.02 and 0.2
setNames(round(after(prevalence=c(0.002, 0.02, 0.2), sens=.8, spec=.95), 3),
c("Prevalence=0.002", "Prevalence=0.02", "Prevalence=0.2"))
Relating ‘incriminating’ evidence to the probability of guilt
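The same Bayesian logic applies: posterior odds of guilt equal prior odds multiplied by the likelihood ratio for the evidence. A minimal sketch, with purely illustrative numbers:
priorProb <- 0.01    # Illustrative prior probability of guilt
LR <- 100            # Evidence taken as 100 times more likely if guilty
postOdds <- (priorProb/(1-priorProb)) * LR
postOdds/(1+postOdds)  # Posterior probability of guilt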
Subsection 1.6.2: Treatment differences that are on a continuous scale
## Use pipe syntax, introduced in R 4.1.0
sleep <- with(datasets::sleep, extra[group==2] - extra[group==1])
sleep |> (function(x)c(mean = mean(x), SD = sd(x), n=length(x)))() |>
  (function(x)c(x, SEM=x['SD']/sqrt(x['n'])))() |>
  setNames(c("mean","SD","n","SEM")) -> stats
print(stats, digits=3)
## Sum of tail probabilities
2*pt(1.580/0.389, 9, lower.tail=FALSE)
## 95% CI for the mean of the sleep-score differences
t.test(sleep, conf.level=0.95)
An hypothesis test
pt(4.06, 9, lower.tail=F)
Subsection 1.6.3: Use of simulation with \(p\)-values
eff2stat <- function(eff=c(.2,.4,.8,1.2), n=c(10,40), numreps=100,
                     FUN=function(x,N)pt(sqrt(N)*mean(x)/sd(x), df=N-1,
                                         lower.tail=FALSE)){
  simStat <- function(eff=c(.2,.4,.8,1.2), N=10, nrep=100, FUN){
    num <- N*nrep*length(eff)
    array(rnorm(num, mean=eff), dim=c(length(eff),nrep,N)) |>
      apply(2:1, FUN, N=N)
  }
  mat <- matrix(nrow=numreps*length(eff), ncol=length(n))
  for(j in 1:length(n)) mat[,j] <-
    as.vector(simStat(eff, N=n[j], numreps, FUN=FUN))  ## length(eff)*numreps values
  data.frame(effsize=rep(rep(eff, each=numreps), length(n)),
             N=rep(n, each=numreps*length(eff)), stat=as.vector(mat))
}
set.seed(31)
df200 <- eff2stat(eff=c(.2,.4,.8,1.2), n=c(10, 40), numreps=200)
labx <- c(0.001,0.01,0.05,0.2,0.4,0.8)
gph <- bwplot(factor(effsize) ~ I(stat^0.25) | factor(N), data=df200,
              layout=c(2,1), xlab="P-value", ylab="Effect size",
              scales=list(x=list(at=labx^0.25, labels=labx)))
update(gph + latticeExtra::layer(panel.abline(v=labx[1:3]^0.25, col='lightgray')),
       strip=strip.custom(factor.levels=paste0("n=",c(10,40))),
       par.settings=DAAG::DAAGtheme(color=FALSE, col.points="gray50"))
eff10 <- with(subset(df200, N==10 & effsize==0.2),
              c(gt5pc=sum(stat>0.05), lohi=fivenum(stat)[c(2,4)]))
eff40 <- with(subset(df200, N==40 & effsize==0.2),
              c(gt5pc=sum(stat>0.05), lohi=fivenum(stat)[c(2,4)]))
Subsection 1.6.4: Power — minimizing the chance of false positives
tf1 <- rbind('R=0.2'=c(0.8*50, 0.05*250),
             'R=1'=c(0.8*150, 0.05*150),
             'R=5'=c(0.8*250, 0.05*50))
tf2 <- rbind(c('0.8 x50', '0.05x250'),
             c('0.8x150', '0.05x150'),
             c('0.8x250', '0.05 x50'))
tf <- cbind("True positives"=paste(tf2[,1], tf1[,1], sep="="),
            "False positives"=paste(tf2[,2], tf1[,2], sep="="))
rownames(tf) <- rownames(tf1)
print(tf, quote=FALSE)
Power calculations – examples
power.t.test(d=0.5, sig.level=0.05, type="one.sample", power=0.8)
pwr1 <- power.t.test(d=0.5, sig.level=0.005, type="one.sample", power=0.8)
pwr2 <- power.t.test(d=0.5, sig.level=0.005, type="two.sample", power=0.8)
## d=0.5, sig.level=0.005, one- and two-sample numbers
c("One sample"=pwr1$n, "Two sample"=pwr2$n)
effsize <- c(.05,.2,.4,.8,1.2); npairs <- c(10,20,40)
pwr0.05 <- matrix(nrow=length(effsize), ncol=length(npairs),
                  dimnames=list(paste0('ES=',effsize), paste0('n=',npairs)))
pwr0.005 <- matrix(nrow=length(effsize), ncol=length(npairs),
                   dimnames=list(paste0(effsize), paste0('n=',npairs)))
for(i in 1:length(effsize)) for(j in 1:length(npairs)){
  pwr0.05[i,j] <- power.t.test(n=npairs[j], d=effsize[i], sig.level=0.05,
                               type='one.sample')$power
  pwr0.005[i,j] <- power.t.test(n=npairs[j], d=effsize[i], sig.level=0.005,
                                type='one.sample')$power}
tab <- cbind(round(pwr0.05,4), round(pwr0.005,4))
tab[1:3,] <- round(tab[1:3,],3)
tab[5,3] <- '~1.0000'
tab[5,6] <- '~1.0000'
print(tab[,1:3], quote=FALSE)
print(tab[,4:6], quote=FALSE)
Positive Predictive Values
R <- pretty(0:3, 40)
postOdds <- outer(R/0.05, c(.8,.3,.08))
PPV <- as.data.frame(cbind(R, postOdds/(1+postOdds)))
names(PPV) <- c("R","p80","p30","p8")
key <- list(text=list(text=c("80% power","30% power","8% power"), cex=1.0),
            x=.6, y=.25, color=FALSE)
gph <- lattice::xyplot(p80+p30+p8 ~ R, data=PPV, lwd=2, type=c("l","g"),
                       xlab="Pre-study odds R", ylab="Post-study probability (PPV)")
update(gph, scales=list(tck=0.5), key=key)
Subsection 1.6.5: The future for \(p\)-values
Subsection 1.6.6: Reporting results
Section 1.7: Information statistics and Bayesian methods with Bayes Factors
Subsection 1.7.1: Information statistics – using likelihoods for model choice
## Calculations using mouse brain weight data
mouse.lm <- lm(brainwt ~ lsize+bodywt, data=DAAG::litters)
n <- nrow(DAAG::litters)
RSSlogLik <- with(mouse.lm, n*(log(sum(residuals^2)/n)+1+log(2*pi)))
p <- length(coef(mouse.lm))+1  # NB: p=4 (3 coefficients + 1 scale parameter)
k <- 2*n/(n-p-1)
c("AICc" = AICcmodavg::AICc(mouse.lm), fromlogL=k*p-2*logLik(mouse.lm)[1],
  fromFit=k*p + RSSlogLik) |> print(digits=4)
The sampling properties of the difference in AIC statistics
sim0vs1 <- function(mu=0, n=15, ntimes=200){
  a0 <- a1 <- numeric(ntimes)
  for(i in 1:ntimes){
    y <- rnorm(n, mean=mu, sd=1)
    m0 <- lm(y ~ 0); m1 <- lm(y ~ 1)
    a0[i] <- AIC(m0); a1[i] <- AIC(m1)
  }
  data.frame(a0=a0, a1=a1, diff01=a0-a1, mu=paste0("mu=",mu))
}
library(latticeExtra)
sim0 <- sim0vs1(mu=0)
sim0.5 <- sim0vs1(mu=0.5)
simboth <- rbind(sim0, sim0.5)
cdiff <- with(list(n=15, p=2), 2*(p+1)*p/(n-(p+1)-1))
xyplot(diff01 ~ a0 | mu, data=simboth, xlab="AIC(m0)", ylab="AIC(m0) - AIC(m1)") +
  latticeExtra::layer({panel.abline(h=0, col='red')
                       panel.abline(h=cdiff, lwd=1.5, lty=3, col='red', alpha=0.5)
                       panel.abline(h=-2, lty=2, col='red')})
tab <- rbind(c(with(sim0, sum(diff01>0))/200, with(sim0.5, sum(diff01>0))/200),
             c(with(sim0, sum(diff01>-cdiff))/200, with(sim0.5, sum(diff01>-cdiff))/200))
dimnames(tab) <- list(c("AIC: Proportion choosing m1",
                        "AICc: Proportion choosing m1"),
                      c("True model is m0", "True model is m1"))
tab
Subsection 1.7.2: Bayesian methods with Bayes Factors
## Setting `scale=1/sqrt(2)` gives a mildly narrower distribution
print(c("pcauchy(1, scale=1)"=pcauchy(1, scale=1),
        "pcauchy(1, scale=1/sqrt(2))"=pcauchy(1, scale=1/sqrt(2))),
      quote=FALSE)
The Cauchy prior with different choices of scale parameter
x <- seq(from=-4.5, to=4.5, by=0.1)
densMed <- dcauchy(x, scale=sqrt(2)/2)
densUltra <- dcauchy(x, scale=sqrt(2))
denn <- dnorm(x, sd=1)
plot(x, densMed, type='l', mgp=c(2,0.5,0), xlab="",
     ylab="Prior density", col="red", fg='gray')
mtext(side=1, line=2, expression("Effect size "==phantom(0)*delta), cex=1.1)
lines(x, denn, col="blue", lty=2)
lines(x, densUltra,col=2, lty=2)
legend("topleft", title="Normal prior",
y.intersp=0.8, lty=2, col="blue", bty='n', cex=0.8,
legend=expression(bold('sd=1')))
legend("topright", title="Cauchy priors", y.intersp=0.8,
col=c('red', 'red'),lty=c(1,2), cex=0.8,
legend=c(expression(bold('medium')),
expression(bold('ultrawide'))),bty="n")
mtext(side=3, line=0.25, adj=0, cex=1.15,
expression("A: Alternative priors for "*delta==frac(mu,sigma)))
## Panel B
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
simpost <- BayesFactor::posterior(ttBF0, iterations=10000)
plot(density(simpost[,'mu']), main="", xlab="", col="red",
mgp=c(2,0.5,0), ylab="Posterior density", fg='gray')
mtext(side=1, line=2, expression(mu), cex=1.1)
abline(v=mean(pairedDiffs), col="gray")
mtext(side=3, line=0.5, expression("B: Posterior density for "*mu), adj=0, cex=1.15)
## Calculate and plot density for default prior - Selected lines of code
x <- seq(from=-4.5, to=4.5, by=0.1)
densMed <- dcauchy(x, scale=sqrt(2)/2)
plot(x, densMed, type='l')
## Panel B
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
## Sample from posterior, and show density plot for mu
simpost <- BayesFactor::posterior(ttBF0, iterations=10000)
plot(density(simpost[,'mu']))
A thought experiment
tval <- setNames(qt(1-c(.05,.01,.005)/2, df=19), paste(c(.05,.01,.005)))
bf01 <- setNames(numeric(3), paste(c(.05,.01,.005)))
for(i in 1:3) bf01[i] <- BayesFactor::ttest.tstat(tval[i], n1=20, simple=TRUE)
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
ttBFwide <- BayesFactor::ttestBF(pairedDiffs, rscale=1)
ttBFultra <- BayesFactor::ttestBF(pairedDiffs, rscale=sqrt(2))
rscales <- c("medium"=sqrt(2)/2, "wide"=1, "ultrawide"=sqrt(2))
BF3 <- c(as.data.frame(ttBF0)[['bf']], as.data.frame(ttBFwide)[['bf']],
         as.data.frame(ttBFultra)[['bf']])
setNames(round(BF3,2), c("medium", "wide", "ultrawide"))
pval <- t.test(pairedDiffs)[['p.value']]
1/(-exp(1)*pval*log(pval))
A null interval may make better sense
min45 <- round(0.75/sd(pairedDiffs), 2)  ## Use standardized units
ttBFint <- BayesFactor::ttestBF(pairedDiffs, nullInterval=c(-min45, min45))
round(as.data.frame(ttBFint)['bf'], 3)
bf01 <- as.data.frame(ttBFint)[['bf']]
The effect of changing sample size
t2bfInterval <- function(t, n=10, rscale="medium", mu=c(-.1,.1)){
  null0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu,
                                    rscale=rscale, simple=TRUE)
  alt0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu, rscale=rscale,
                                   complement=TRUE, simple=TRUE)
  alt0/null0
}
##
## Calculate Bayes factors
pval <- c(0.05,0.01,0.001); nval <- c(4,6,10,20,40,80,160)
bfDF <- expand.grid(p=pval, n=nval)
pcol <- 1; ncol <- 2; tcol <- 3
bfDF[,'t'] <- apply(bfDF, 1, function(x){qt(x[pcol]/2, df=x[ncol]-1, lower.tail=FALSE)})
other <- apply(bfDF, 1, function(x)
  c(BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="medium",
                             simple=TRUE),
    ## Now specify a null interval
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="medium")
  ))
bfDF <- setNames(cbind(bfDF, t(other)),
                 c('p','n','t','bf','bfInterval'))
plabpos <- with(subset(bfDF, n==max(bfDF$n)), log((bf+bfInterval)/2))
gphA1 <- lattice::xyplot(log(bf)~log(n), groups=factor(p), data=bfDF,
                         panel=function(x,y,...){
                           lattice::panel.xyplot(x,y,type='b',...)})
ylabA <- 10^((-3):6/2)
scalesA <- list(x=list(at=log(nval), labels=nval),
                y=list(at=log(ylabA), labels=signif(ylabA,2)))
keyA <- list(corner=c(0.99,0.98), lines=list(col=c(1,1), lty=1:2),
             text=list(c('Point null at 0', "null=(-0.1,0.1)")))
ylim2 <- log(c(min(bfDF[['bfInterval']])-0.05, 150))
gphA2 <- lattice::xyplot(log(bfInterval)~log(n), groups=factor(p), lty=2,
                         xlim=c(log(3.5), log(max(nval)*3.25)), ylim=ylim2, data=bfDF,
                         panel=function(x,y,...){
                           panel.xyplot(x,y,type='b',...)
                           panel.grid(h=-1,v=-1)
                           panel.text(rep(log(max(nval*0.975)),3), plabpos,
                                      labels=c('p=0.05','0.01','0.001'), pos=4)
                         }, par.settings=DAAG::DAAGtheme(color=TRUE),
                         main="A: Bayes factor vs sample size",
                         xlab="Sample size", ylab="Bayes factor", scales=scalesA, key=keyA)
## Panel B
bfDF[['eff']] <- bfDF[["t"]]/sqrt(bfDF[['n']])
ylabB <- 10^((-3):2/3)
scalesB <- list(x=list(at=log(nval), labels=nval),
                y=list(at=log(ylabB), labels=signif(ylabB,2)))
keyB <- list(corner=c(0.98,0.975), lines=list(lty=1:3),
             points=list(pch=1:3), text=list(c('p=0.001','p=0.01','p=0.05')))
gphB <- xyplot(log(eff)~log(n), groups=log(p), data=bfDF, pch=1:3, lty=1:3,
               type='b', xlab="Sample size", ylab="Effect size",
               par.settings=DAAG::DAAGtheme(color=TRUE),
               main="B: Effect size vs sample size", key=keyB, scales=scalesB) +
  latticeExtra::layer(panel.grid(h=-1,v=-1))
plot(gphA2+latticeExtra::as.layer(gphA1), position=c(0, 0, 0.525, 1), more=TRUE)
plot(gphB, position=c(0.52, 0, 1, 1), par.settings=DAAG::DAAGtheme(color=TRUE))
Different statistics give different perspectives
bf1 <- BayesFactor::ttest.tstat(qt(0.00001, df=40), n1=40, simple=TRUE)
bf2 <- BayesFactor::ttest.tstat(qt(0.000001, df=40), n1=40, simple=TRUE)
rbind("Bayes Factors"=setNames(c(bf1,bf2), c("p=0.00001","p=0.000001")),
      "t-statistics"=c(qt(0.00001, df=40), qt(0.000001, df=40)))
knitr::kable(matrix(c("A bare mention","Positive","Strong","Very strong"), nrow=1),
             col.names=c("1 -- 3", "3 -- 20", "20 -- 150", ">150"), align='c',
             midrule='', vline="")
Section 1.8: Resampling methods for SEs, tests and confidence intervals
Subsection 1.8.1: The one-sample permutation test
tab <- t(as.matrix(DAAG::pair65))
rbind(tab, "heated-ambient"=tab[1,]-tab[2,])
Subsection 1.8.2: The two-sample permutation test
## First of 3 curves; permutation distribution of difference in means
two65 <- DAAG::two65
set.seed(47)  # Repeat curves shown here
nsim <- 2000; dsims <- numeric(nsim)
x <- with(two65, c(ambient, heated))
n <- length(x); n2 <- length(two65$heated)
dbar <- with(two65, mean(heated)-mean(ambient))
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE); dsims[i] <- mean(x[mn]) - mean(x[-mn]) }
plot(density(dsims), xlab="", main="", lwd=0.5, yaxs="i", ylim=c(0,0.08), bty="n")
abline(v=c(dbar, -dbar), lty=3)
pval1 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
mtext(side=3, line=0.25,
      text=expression(bar(italic(x))[2]-bar(italic(x))[1]), at=dbar)
mtext(side=3, line=0.25,
      text=expression(-(bar(italic(x))[2] - bar(italic(x))[1])), at=-dbar)
## Second permutation density
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE)
  dsims[i] <- mean(x[mn]) - mean(x[-mn])
}
pval2 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
lines(density(dsims), lty=2, lwd=1)
## Third permutation density
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE)
  dsims[i] <- mean(x[mn]) - mean(x[-mn])
}
pval3 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
lines(density(dsims), lty=3, lwd=1.25)
box(col="gray")
leg3 <- paste(c(pval1, pval2, pval3))
legend(x=20, y=0.078, title="P-values are", cex=1, xpd=TRUE,
       bty="n", lty=c(1,2,3), lwd=c(0.5,1,1.25), legend=leg3, y.intersp=0.8)
Subsection 1.8.3: Estimating the standard error of the median: bootstrapping
## Bootstrap estimate of median of wren length: data frame cuckoos
wren <- subset(DAAG::cuckoos, species=="wren")[, "length"]
library(boot)
## First define median.fun(), with two required arguments:
##   data specifies the data vector,
##   indices selects vector elements for each resample
median.fun <- function(data, indices){median(data[indices])}
## Call boot(), with statistic=median.fun, R = # of resamples
set.seed(23)
(wren.boot <- boot(data=wren, statistic=median.fun, R=4999))
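The bootstrap standard error that appears in the printed output is the standard deviation of the 4999 resampled medians:
sd(wren.boot$t[,1])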
Subsection 1.8.4: Bootstrap estimates of confidence intervals
Bootstrap 95% confidence intervals for the median
## Call the function boot.ci(), with boot.out=wren.boot
boot.ci(boot.out=wren.boot, type=c("perc","bca"))
The correlation coefficient
## Bootstrap estimate of 95% CI for `cor(chest, belly)`: `DAAG::possum`
corr.fun <- function(data, indices)
  with(data, cor(belly[indices], chest[indices]))
set.seed(29)
corr.boot <- boot(DAAG::possum, corr.fun, R=9999)
boot.ci(boot.out=corr.boot, type=c("perc", "bca"))
Section 1.9: Organizing and managing work, and tools that can assist
Subsection 1.9.1: Reproducible reporting — the knitr package
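To extract just the R code from an Rmd source file, knitr::purl() can be used; a minimal sketch, assuming the chapter source is in the working directory as "ch1.Rmd":
knitr::purl("ch1.Rmd", output="ch1-code.R", documentation=1)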
Section 1.10: The changing environment for data analysis
Subsection 1.10.1: Models and machine learning
Subsection 1.10.2: Replicability is the definitive check
Section 1.11: Further, or supplementary, reading
Exercises (1_12)
1.4
Animals <- MASS::Animals
manyMals <- rbind(Animals, sqrt(Animals), Animals^0.1, log(Animals))
manyMals$transgp <- rep(c("Untransformed", "Square root transform",
                          "Power transform, lambda=0.1", "log transform"),
                        rep(nrow(Animals), 4))
manyMals$transgp <- with(manyMals, factor(transgp, levels=unique(transgp)))
lattice::xyplot(brain ~ body | transgp, data=manyMals,
                scales=list(relation='free'), layout=c(2,2))
1.5
with(Animals, c(cor(brain,body), cor(brain,body, method="spearman")))
with(Animals, c(cor(log(brain),log(body)),
cor(log(brain),log(body), method="spearman")))
1.9
usableDF <- DAAG::cuckoohosts[c(1:6,8),]
nr <- nrow(usableDF)
with(usableDF, {
  plot(c(clength, hlength), c(cbreadth, hbreadth), col=rep(1:2, c(nr,nr)))
  for(i in 1:nr) lines(c(clength[i], hlength[i]), c(cbreadth[i], hbreadth[i]))
  text(hlength, hbreadth, abbreviate(rownames(usableDF),8), pos=c(2,4,2,1,2,4,2))
})
1.10
## Take a random sample of 100 values from the normal distribution
x <- rnorm(100, mean=3, sd=5)
(xbar <- mean(x))
## Plot, against `xbar`, the sum of squared deviations from `xbar`
lsfun <- function(xbar) apply(outer(x, xbar, "-")^2, 2, sum)
curve(lsfun, from=xbar-0.01, to=xbar+0.01)
## NB: `avs` and `meds` (sets of sample means and medians) must first be
## calculated, as part of the exercise, before running the following
boxplot(avs, meds, horizontal=TRUE)
1.15
x <- rpois(7, 78.3)
mean(x); var(x)
1.16
nvals100 <- rnorm(100)
heavytail <- rt(100, df=4)
veryheavytail <- rt(100, df=2)
boxplot(nvals100, heavytail, veryheavytail, horizontal=TRUE)
1.19
boxdists <- function(n=1000, times=10){
  df <- data.frame(normal=rnorm(n*times), t=rt(n*times, 7),
                   sampnum=rep(1:times, rep(n, times)))
  lattice::bwplot(sampnum ~ normal+t, data=df, outer=TRUE, xlab="",
                  horizontal=TRUE)
}
1.20
a <- 1
form <- ~ rchisq(1000,1)^a + rchisq(1000,25)^a + rchisq(1000,500)^a
lattice::qqmath(form, scales=list(relation="free"), outer=TRUE)
1.21
y <- rnorm(51)
ydep <- y[-1] + y[-51]
acf(y)  # acf plots the 'autocorrelation function' (see Chapter 6)
acf(ydep)
1.24
ptFun <- function(x, N)pt(sqrt(N)*mean(x)/sd(x), df=N-1, lower.tail=FALSE)
simStat <- function(eff=.4, N=10, nrep=200, FUN)
  array(rnorm(n=N*nrep*length(eff), mean=eff), dim=c(length(eff),nrep,N)) |>
    apply(2:1, FUN, N=N)
pval <- simStat(eff=.4, N=10, nrep=200, FUN=ptFun)
# Suggest a power transform that makes the distribution more symmetric
car::powerTransform(pval)  # See Subsection 2.5.6
labx <- c(0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.25)
bwplot(~I(pval^0.2), scales=list(x=list(at=labx^0.2, labels=paste(labx))),
       xlab=expression("P-value (scale is "*p^{0.2}*")"))
1.24a
pvalDF <- subset(df200, effsize==0.4 & N==10)$stat
plot(sort(pval^0.2), sort(pvalDF^0.2))
abline(0,1)
1.24c
## Estimated effect sizes: Set `FUN=effFun` in the call to `eff2stat()`
effFun <- function(x, N)mean(x)/sd(x)
# Try: `labx <- ((-1):6)/2`; `at = log(labx)`; `v = log(labx)`
## NB also, Bayes Factors: Set `FUN=BFfun` in the call to `eff2stat()`
BFfun <- function(x, N)BayesFactor::ttest.tstat(sqrt(N)*mean(x)/sd(x),
                                                n1=N, simple=TRUE)
# A few very large Bayes Factors are likely to dominate the plots
1.27
(degC <- setNames(c(21,30,38,46), paste('rep',1:4)))
1.27a
radonC <- tidyr::pivot_longer(MPV::radon, names_to='key',
                              cols=names(degC), values_to='percent')
radonC$temp <- degC[radonC$key]
lattice::xyplot(percent ~ temp | factor(diameter), data=radonC)
matplot(scale(t(MPV::radon[,-1])), type="l", ylab="scaled residuals")
1.27d
radon.res <- aggregate(percent ~ diameter, data=radonC, FUN=scale,
                       scale=FALSE)
1.30
diamonds <- ggplot2::diamonds
with(diamonds, plot(carat, price, pch=16, cex=0.25))
with(diamonds, smoothScatter(carat, price))
t2bfInterval <- function(t, n=10, rscale="medium", mu=c(-.1,.1)){
  null0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu,
                                    rscale=rscale, simple=TRUE)
  alt0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu, rscale=rscale,
                                   complement=TRUE, simple=TRUE)
  alt0/null0
}
pval <- c(0.05,0.01,0.001); nval <- c(10,40,160)
bfDF <- expand.grid(p=pval, n=nval)
pcol <- 1; ncol <- 2; tcol <- 3
bfDF[,'t'] <- apply(bfDF, 1, function(x){qt(x[pcol]/2, df=x[ncol]-1, lower.tail=FALSE)})
other <- apply(bfDF, 1, function(x)
  c(BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="medium",
                             simple=TRUE),
    BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="wide",
                             simple=TRUE),
    ## Now specify a null interval
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="medium"),
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="wide")
  ))
bfDF <- setNames(cbind(bfDF, t(other)),
                 c('p','n','t','bf','bfWide','bfInterval','bfIntervalWide'))
df <- data.frame(d = with(datasets::sleep, extra[group==2] - extra[group==1]))
library(statsr)
BayesFactor::ttestBF(df$d, rscale=1/sqrt(2))  # Or, `rscale="medium"`
# `rscale="medium"` is the default
bayes_inference(d, type='ht', data=df, statistic='mean', method='t', rscale=1/sqrt(2),
                alternative='twosided', null=0, prior_family="JZS")
# Set `rscale=1/sqrt(2)` (default is 1.0),
# as for BayesFactor; gives same BF
# Compare with `prior_family = "JUI"` (`"JZS"` is the default),
# with (if not supplied) default settings
bayes_inference(d, type='ht', data=df, statistic='mean', method='t',
                alternative='twosided', null=0, prior_family="JUI")
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
  code <- knitr::knit_code$get()
  txt <- paste0("\n## ", names(code), "\n", sapply(code, paste, collapse='\n'))
  writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch1.R")
}