Binary data

data(wcgs, package = "faraway")
dim(wcgs)
## [1] 3154   13
head(wcgs) #not run: str(wcgs); plot(wcgs); help(wcgs)
##      age height weight sdp dbp chol behave cigs dibep chd  typechd timechd
## 2001  49     73    150 110  76  225     A2   25     B  no     none    1664
## 2002  42     70    160 154  84  177     A2   20     B  no     none    3071
## 2003  42     69    160 110  78  181     B3    0     A  no     none    3071
## 2004  41     68    152 124  78  132     B4   20     A  no     none    3064
## 2005  59     70    150 144  86  255     B3   20     A yes infdeath    1885
## 2006  44     72    204 150  90  182     B4    0     A  no     none    3102
##        arcus
## 2001  absent
## 2002 present
## 2003  absent
## 2004  absent
## 2005 present
## 2006  absent
sum(is.na(wcgs)) # there are a few missing values; much further sleuthing finds arcus and chol have some missing values
## [1] 14

Fitting a binary model: The first fit is silly, luckily R gives a warning.

glm(chd ~ ., family = binomial, data = wcgs)
## Warning: glm.fit: algorithm did not converge
## 
## Call:  glm(formula = chd ~ ., family = binomial, data = wcgs)
## 
## Coefficients:
##     (Intercept)              age           height           weight  
##        2.66e+01        -1.58e-14        -1.07e-13         1.02e-14  
##             sdp              dbp             chol         behaveA2  
##        1.97e-14        -2.10e-14         8.99e-16        -4.35e-13  
##        behaveB3         behaveB4             cigs           dibepB  
##       -1.82e-13        -1.21e-13        -6.69e-15               NA  
## typechdinfdeath      typechdnone    typechdsilent          timechd  
##        1.36e-08        -5.31e+01        -1.80e-07         9.53e-16  
##    arcuspresent  
##        3.48e-13  
## 
## Degrees of Freedom: 3139 Total (i.e. Null);  3124 Residual
##   (14 observations deleted due to missingness)
## Null Deviance:       1770 
## Residual Deviance: 0.0000000182  AIC: 32

It turns out that typechd and timechd are confounded with chd; and that behave and dibepB are the same variable recorded differently. So we’ll try:

heartmod <- glm(chd ~ age + height + weight + sdp + dbp + chol + behave + cigs, family = binomial, data = wcgs)
sumary(heartmod)
##               Estimate Std. Error z value         Pr(>|z|)
## (Intercept) -12.989034   2.336470   -5.56 0.00000002709290
## age           0.065068   0.012125    5.37 0.00000008025581
## height        0.015824   0.033141    0.48           0.6330
## weight        0.007874   0.003881    2.03           0.0425
## sdp           0.017536   0.006395    2.74           0.0061
## dbp           0.000121   0.010834    0.01           0.9911
## chol          0.011076   0.001522    7.28 0.00000000000034
## behaveA2      0.088266   0.222589    0.40           0.6917
## behaveB3     -0.602705   0.244216   -2.47           0.0136
## behaveB4     -0.499214   0.320978   -1.56           0.1199
## cigs          0.020998   0.004280    4.91 0.00000093160928
## 
## n = 3142 p = 11
## Deviance = 1580.45 Null Deviance = 1779.20 (Difference = 198.75)
heartmod2 <- update(heartmod, . ~ . - behave)
sumary(heartmod2)
##              Estimate Std. Error z value         Pr(>|z|)
## (Intercept) -13.84290    2.31823   -5.97 0.00000000235323
## age           0.06896    0.01204    5.73 0.00000001030382
## height        0.01856    0.03310    0.56           0.5749
## weight        0.00827    0.00385    2.15           0.0316
## sdp           0.01806    0.00633    2.85           0.0043
## dbp           0.00126    0.01077    0.12           0.9067
## chol          0.01118    0.00151    7.42 0.00000000000012
## cigs          0.02299    0.00426    5.40 0.00000006637269
## 
## n = 3142 p = 8
## Deviance = 1601.96 Null Deviance = 1779.20 (Difference = 177.24)
anova(heartmod2, heartmod, test = "LRT") #default for test is NULL
## Analysis of Deviance Table
## 
## Model 1: chd ~ age + height + weight + sdp + dbp + chol + cigs
## Model 2: chd ~ age + height + weight + sdp + dbp + chol + behave + cigs
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1      3134       1602                         
## 2      3131       1580  3     21.5 0.000082 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There is a significant effect of behaviour; a check using levels(wcgs$behave) shows that it has levels A1, A2, B1, B2. The variable dibep collapses these to A and B.

heartmod3 <- update(heartmod, .~ . - behave + dibep )
sumary(heartmod3)
##              Estimate Std. Error z value         Pr(>|z|)
## (Intercept) -13.55457    2.32014   -5.84 0.00000000515355
## age           0.06477    0.01210    5.35 0.00000008610612
## height        0.01600    0.03312    0.48           0.6289
## weight        0.00782    0.00388    2.02           0.0437
## sdp           0.01772    0.00637    2.78           0.0054
## dbp          -0.00015    0.01082   -0.01           0.9890
## chol          0.01106    0.00152    7.27 0.00000000000036
## cigs          0.02092    0.00427    4.90 0.00000098078442
## dibepB        0.65338    0.14522    4.50 0.00000681630545
## 
## n = 3142 p = 9
## Deviance = 1580.74 Null Deviance = 1779.20 (Difference = 198.46)

In Sections 2.1-2.4 of ELM-2, just two covariates (i.e. potential explanatory variables) are used: height and cigs.

lmod <- glm(chd ~ height + cigs, family = binomial, data = wcgs)
beta <- coef(lmod)
plot(jitter(as.numeric(chd)-1, amount = 0.1) ~ jitter(cigs), wcgs, xlab="cigs", ylab = "Heart Disease", pch=".")
curve(ilogit(beta[1]+beta[2]*60 + beta[3]*x), add=TRUE)
curve(ilogit(beta[1]+beta[2]*78 + beta[3]*x), add=TRUE, lty=2, col="blue")

Inference for \(\beta\) follows as in the challenger shuttle example; the only difference is that our response is binary instead of binomial. This makes the raw residuals \(y - \hat y\) look weird, because we’re always subtracting from either \(1\) or \(0\). In Section 2.4, Faraway groups the residuals and plots the average within each group

plot.glm.diag(heartmod3)

par(mfrow=c(2,2))
plot(heartmod3)

For model selection, \(AIC = 2 \text{maximum log-likelihood} + 2p\) is very popular. The maximized value of the log-likelihood function plays the role of the minimized residual sum of squares.

heartmod4 <- MASS::stepAIC(heartmod3)
## Start:  AIC=1599
## chd ~ age + height + weight + sdp + dbp + chol + cigs + dibep
## 
##          Df Deviance  AIC
## - dbp     1     1581 1597
## - height  1     1581 1597
## <none>          1581 1599
## - weight  1     1585 1601
## - sdp     1     1588 1604
## - dibep   1     1602 1618
## - cigs    1     1604 1620
## - age     1     1609 1625
## - chol    1     1635 1651
## 
## Step:  AIC=1597
## chd ~ age + height + weight + sdp + chol + cigs + dibep
## 
##          Df Deviance  AIC
## - height  1     1581 1595
## <none>          1581 1597
## - weight  1     1585 1599
## - sdp     1     1598 1612
## - dibep   1     1602 1616
## - cigs    1     1604 1618
## - age     1     1609 1623
## - chol    1     1635 1649
## 
## Step:  AIC=1595
## chd ~ age + weight + sdp + chol + cigs + dibep
## 
##          Df Deviance  AIC
## <none>          1581 1595
## - weight  1     1589 1601
## - sdp     1     1598 1610
## - dibep   1     1602 1614
## - cigs    1     1605 1617
## - age     1     1609 1621
## - chol    1     1635 1647
sumary(heartmod4)
##              Estimate Std. Error z value         Pr(>|z|)
## (Intercept) -12.54569    0.96018  -13.07          < 2e-16
## age           0.06442    0.01208    5.33 0.00000009556529
## weight        0.00881    0.00317    2.78           0.0054
## sdp           0.01739    0.00409    4.26 0.00002075666391
## chol          0.01098    0.00151    7.26 0.00000000000038
## cigs          0.02114    0.00422    5.01 0.00000055114653
## dibepB        0.65421    0.14514    4.51 0.00000655778148
## 
## n = 3142 p = 7
## Deviance = 1580.98 Null Deviance = 1779.20 (Difference = 198.22)

Quick check – how do we interpret the coefficent estimate for dibepB?

Forensics

The coefficient above for dibepB is positive, and significantly so, suggesting an increase in risk of chd for “type B” relative to “type A”. This is not consistent with the coefficients for behave above, where both B1 and B2 showed a decrease risk relative to A1. What is going on?

xtabs(~ chd + behave, data = wcgs)
##      behave
## chd     A1   A2   B3   B4
##   no   234 1177 1155  331
##   yes   30  148   61   18
xtabs(~ chd + dibep, data = wcgs)
##      dibep
## chd      A    B
##   no  1486 1411
##   yes   79  178
1155 + 331; 1177+234
## [1] 1486
## [1] 1411

It looks like dibepB is actually dibepA, if dibepA is meant to be the aggregation of A1 and A2, which is what it says in the help file. The original paper is available on my web page. In their Table 3 (“Intake all ages”) we see 178 CHD in the Type A group and 79 CHD in the Type B group, as above once we switch the labels.

Example 10.18 from SM

Now I’ll have a look at the nodal involvment data from SM. The data as presented in the book looks like image But the dataset is recorded as binary:

data(nodal)
head(nodal); dim(nodal) # sum(nodal$m) = 53 not shown
##   m r aged stage grade xray acid
## 1 1 1    0     1     1    1    1
## 2 1 1    0     1     1    1    1
## 3 1 1    0     1     1    1    1
## 4 1 1    0     1     1    1    1
## 5 1 1    0     1     1    1    1
## 6 1 0    0     1     1    1    1
## [1] 53  7

We can group them using the tidyverse: (not shown: View(nodal2))

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
nodal2 <- nodal %>% 
  group_by(aged,stage,grade,xray,acid) %>% 
  summarise(total=sum(m), rtot = sum(r)) #Table 10.8
## `summarise()` has grouped output by 'aged', 'stage', 'grade', 'xray'. You can override using the `.groups` argument.
#View(nodal2)

First I’ll fit with the binary data

par(mfrow=c(2,2))
Ex1018bin.glm <- glm(r ~ aged+stage+grade+xray+acid, data = nodal, family = binomial)
plot(Ex1018bin.glm$linear.predictors, residuals(Ex1018bin.glm, type="deviance"), main = "Binary Fit")
plot(Ex1018bin.glm$fitted.values, residuals(Ex1018bin.glm, type="deviance"), main = "Binary Fit")
summary(Ex1018bin.glm)
## 
## Call:
## glm(formula = r ~ aged + stage + grade + xray + acid, family = binomial, 
##     data = nodal)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.332  -0.665  -0.300   0.639   2.150  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   -3.079      0.987   -3.12   0.0018 **
## aged1         -0.292      0.754   -0.39   0.6988   
## stage1         1.373      0.784    1.75   0.0799 . 
## grade1         0.872      0.816    1.07   0.2850   
## xray1          1.801      0.810    2.22   0.0263 * 
## acid1          1.684      0.791    2.13   0.0334 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.252  on 52  degrees of freedom
## Residual deviance: 47.611  on 47  degrees of freedom
## AIC: 59.61
## 
## Number of Fisher Scoring iterations: 5

And now with the binomial version (grouped in covariate classes as above)

Ex1018binom.glm <- glm(cbind(rtot,total-rtot) ~ 
                    aged + stage + grade + xray + acid, data = nodal2, 
                  family = binomial)

summary(Ex1018binom.glm)
## 
## Call:
## glm(formula = cbind(rtot, total - rtot) ~ aged + stage + grade + 
##     xray + acid, family = binomial, data = nodal2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.499  -0.773  -0.127   0.800   1.435  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   -3.079      0.987   -3.12   0.0018 **
## aged1         -0.292      0.754   -0.39   0.6988   
## stage1         1.373      0.784    1.75   0.0799 . 
## grade1         0.872      0.816    1.07   0.2850   
## xray1          1.801      0.810    2.22   0.0263 * 
## acid1          1.684      0.791    2.13   0.0334 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 40.710  on 22  degrees of freedom
## Residual deviance: 18.069  on 17  degrees of freedom
## AIC: 41.69
## 
## Number of Fisher Scoring iterations: 5
attributes(Ex1018binom.glm)
## $names
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"          
## 
## $class
## [1] "glm" "lm"
par(mfrow=c(2,2))
plot(Ex1018binom.glm$linear.predictors,residuals(Ex1018binom.glm, type="deviance"), ylab = "deviance residuals", xlab = "linear predictors", main = "Binomial Fit")

plot(Ex1018binom.glm$fitted.values,residuals(Ex1018binom.glm, type="deviance"), ylab = "deviance residuals", xlab = "fitted probabilities", main = "Binomial Fit")

qqnorm(residuals(Ex1018binom.glm, type="deviance"), main = "deviance residuals")
qqnorm(residuals(Ex1018binom.glm, type="pearson"), main = "Pearson residuals")

plot(Ex1018binom.glm, main = "Binomial Fit, R default")

plot.glm.diag(Ex1018binom.glm, main = "Binomial Fit, Davison's SM")

?plot.glm.diag 

From the help file: “Makes plot of jackknife deviance residuals against linear predictor, normal scores plots of standardized deviance residuals, plot of approximate Cook statistics against leverage/(1-leverage), and case plot of Cook statistic.”

Model selection using AIC via the step function, leads to a simpler model with 3 covariates, acid, stage, xray.

step(Ex1018binom.glm)
## Start:  AIC=41.69
## cbind(rtot, total - rtot) ~ aged + stage + grade + xray + acid
## 
##         Df Deviance  AIC
## - aged   1     18.2 39.8
## - grade  1     19.2 40.8
## <none>         18.1 41.7
## - stage  1     21.3 42.9
## - acid   1     23.1 44.7
## - xray   1     23.4 45.0
## 
## Step:  AIC=39.84
## cbind(rtot, total - rtot) ~ stage + grade + xray + acid
## 
##         Df Deviance  AIC
## - grade  1     19.6 39.3
## <none>         18.2 39.8
## - stage  1     21.3 40.9
## - xray   1     23.6 43.2
## - acid   1     24.0 43.6
## 
## Step:  AIC=39.26
## cbind(rtot, total - rtot) ~ stage + xray + acid
## 
##         Df Deviance  AIC
## <none>         19.6 39.3
## - acid   1     24.9 42.5
## - stage  1     25.2 42.9
## - xray   1     26.4 44.0
## 
## Call:  glm(formula = cbind(rtot, total - rtot) ~ stage + xray + acid, 
##     family = binomial, data = nodal2)
## 
## Coefficients:
## (Intercept)       stage1        xray1        acid1  
##       -3.05         1.65         1.91         1.64  
## 
## Degrees of Freedom: 22 Total (i.e. Null);  19 Residual
## Null Deviance:       40.7 
## Residual Deviance: 19.6  AIC: 39.3

We can also compare the fits of the full model and the simpler one using the deviance.

Ex1018binom2.glm <- update(Ex1018binom.glm, .~. - aged - grade)
pchisq(deviance(Ex1018binom2.glm) - deviance(Ex1018binom.glm), df = 2, lower = F)
## [1] 0.4562