Binary data

data(wcgs, package = "faraway")
head(wcgs) #not run: str(wcgs); plot(wcgs); help(wcgs)
##      age height weight sdp dbp chol behave cigs dibep chd  typechd timechd
## 2001  49     73    150 110  76  225     A2   25     B  no     none    1664
## 2002  42     70    160 154  84  177     A2   20     B  no     none    3071
## 2003  42     69    160 110  78  181     B3    0     A  no     none    3071
## 2004  41     68    152 124  78  132     B4   20     A  no     none    3064
## 2005  59     70    150 144  86  255     B3   20     A yes infdeath    1885
## 2006  44     72    204 150  90  182     B4    0     A  no     none    3102
##        arcus
## 2001  absent
## 2002 present
## 2003  absent
## 2004  absent
## 2005 present
## 2006  absent
sum(is.na(wcgs)) # there are a few missing values; much further sleuthing finds arcus and chol have some missing values
## [1] 14

Fitting a binary model:

glm(chd ~ ., family = binomial, data = wcgs)
## Warning: glm.fit: algorithm did not converge
## 
## Call:  glm(formula = chd ~ ., family = binomial, data = wcgs)
## 
## Coefficients:
##     (Intercept)              age           height           weight  
##        2.66e+01        -1.58e-14        -1.07e-13         1.02e-14  
##             sdp              dbp             chol         behaveA2  
##        1.97e-14        -2.10e-14         8.99e-16        -4.35e-13  
##        behaveB3         behaveB4             cigs           dibepB  
##       -1.82e-13        -1.21e-13        -6.69e-15               NA  
## typechdinfdeath      typechdnone    typechdsilent          timechd  
##        1.36e-08        -5.31e+01        -1.80e-07         9.53e-16  
##    arcuspresent  
##        3.48e-13  
## 
## Degrees of Freedom: 3139 Total (i.e. Null);  3124 Residual
##   (14 observations deleted due to missingness)
## Null Deviance:       1770 
## Residual Deviance: 0.0000000182  AIC: 32
## typechd and timechd confounded with chd; behave and dibepB are the same variable recorded differently
heartmod <- glm(chd ~ age + height + weight + sdp + dbp + chol + behave + cigs, family = binomial, data = wcgs)
summary(heartmod)
## 
## Call:
## glm(formula = chd ~ age + height + weight + sdp + dbp + chol + 
##     behave + cigs, family = binomial, data = wcgs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.419  -0.435  -0.316  -0.222   2.850  
## 
## Coefficients:
##               Estimate Std. Error z value         Pr(>|z|)    
## (Intercept) -12.989034   2.336470   -5.56 0.00000002709290 ***
## age           0.065068   0.012125    5.37 0.00000008025581 ***
## height        0.015824   0.033141    0.48           0.6330    
## weight        0.007874   0.003881    2.03           0.0425 *  
## sdp           0.017536   0.006395    2.74           0.0061 ** 
## dbp           0.000121   0.010834    0.01           0.9911    
## chol          0.011076   0.001522    7.28 0.00000000000034 ***
## behaveA2      0.088266   0.222589    0.40           0.6917    
## behaveB3     -0.602705   0.244216   -2.47           0.0136 *  
## behaveB4     -0.499214   0.320978   -1.56           0.1199    
## cigs          0.020998   0.004280    4.91 0.00000093160928 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1779.2  on 3141  degrees of freedom
## Residual deviance: 1580.4  on 3131  degrees of freedom
##   (12 observations deleted due to missingness)
## AIC: 1602
## 
## Number of Fisher Scoring iterations: 6
heartmod2 <- update(heartmod, . ~ . - behave)
summary(heartmod2)
## 
## Call:
## glm(formula = chd ~ age + height + weight + sdp + dbp + chol + 
##     cigs, family = binomial, data = wcgs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.346  -0.440  -0.324  -0.233   2.826  
## 
## Coefficients:
##              Estimate Std. Error z value         Pr(>|z|)    
## (Intercept) -13.84290    2.31823   -5.97 0.00000000235323 ***
## age           0.06896    0.01204    5.73 0.00000001030382 ***
## height        0.01856    0.03310    0.56           0.5749    
## weight        0.00827    0.00385    2.15           0.0316 *  
## sdp           0.01806    0.00633    2.85           0.0043 ** 
## dbp           0.00126    0.01077    0.12           0.9067    
## chol          0.01118    0.00151    7.42 0.00000000000012 ***
## cigs          0.02299    0.00426    5.40 0.00000006637269 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1779.2  on 3141  degrees of freedom
## Residual deviance: 1602.0  on 3134  degrees of freedom
##   (12 observations deleted due to missingness)
## AIC: 1618
## 
## Number of Fisher Scoring iterations: 6
anova(heartmod2, heartmod, test = "LRT") #default for test is NULL
## Analysis of Deviance Table
## 
## Model 1: chd ~ age + height + weight + sdp + dbp + chol + cigs
## Model 2: chd ~ age + height + weight + sdp + dbp + chol + behave + cigs
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1      3134       1602                         
## 2      3131       1580  3     21.5 0.000082 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There is a significant effect of behaviour; a check using levels(wcgs$behave) shows that it has levels A1, A2, B1, B2. The variable dibep collapses these to A and B.

heartmod3 <- update(heartmod, .~ . - behave + dibep )
summary(heartmod3)
## 
## Call:
## glm(formula = chd ~ age + height + weight + sdp + dbp + chol + 
##     cigs + dibep, family = binomial, data = wcgs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.410  -0.435  -0.315  -0.223   2.839  
## 
## Coefficients:
##              Estimate Std. Error z value         Pr(>|z|)    
## (Intercept) -13.55457    2.32014   -5.84 0.00000000515355 ***
## age           0.06477    0.01210    5.35 0.00000008610612 ***
## height        0.01600    0.03312    0.48           0.6289    
## weight        0.00782    0.00388    2.02           0.0437 *  
## sdp           0.01772    0.00637    2.78           0.0054 ** 
## dbp          -0.00015    0.01082   -0.01           0.9890    
## chol          0.01106    0.00152    7.27 0.00000000000036 ***
## cigs          0.02092    0.00427    4.90 0.00000098078442 ***
## dibepB        0.65338    0.14522    4.50 0.00000681630545 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1779.2  on 3141  degrees of freedom
## Residual deviance: 1580.7  on 3133  degrees of freedom
##   (12 observations deleted due to missingness)
## AIC: 1599
## 
## Number of Fisher Scoring iterations: 6

Quick check – how do we interpret the coefficent estimate for dibepB?

Forensics

The coefficient above for dibepB is positive, and significantly so, suggesting an increase in risk of chd for “type B” relative to “type A”. This is not consistent with the coefficients for behave above, where both B1 and B2 showed a decrease risk relative to A1. What is going on?

xtabs(~ chd + behave, data = wcgs)
##      behave
## chd     A1   A2   B3   B4
##   no   234 1177 1155  331
##   yes   30  148   61   18
xtabs(~ chd + dibep, data = wcgs)
##      dibep
## chd      A    B
##   no  1486 1411
##   yes   79  178
1155 + 331; 1177+234
## [1] 1486
## [1] 1411

It looks like dibepB is actually dibepA, if dibepA is meant to be the aggregation of A1 and A2, which is what is says in the help file. The original paper is available on my web page. In their Table 3 (“Intake all ages”) we see 178 CHD in the Type A group and 79 CHD in the Type B group, as above once we switch the labels.

Example 10.18 from SM

Now I’ll have a look at the nodal involvment data from SM. The data as presented in the book looks like image But the dataset is recorded as binary:

data(nodal)
head(nodal); dim(nodal) # sum(nodal$m) = 53 not shown
##   m r aged stage grade xray acid
## 1 1 1    0     1     1    1    1
## 2 1 1    0     1     1    1    1
## 3 1 1    0     1     1    1    1
## 4 1 1    0     1     1    1    1
## 5 1 1    0     1     1    1    1
## 6 1 0    0     1     1    1    1
## [1] 53  7

We can group them using the tidyverse:

nodal2 <- nodal %>% 
  group_by(aged,stage,grade,xray,acid) %>% 
  summarise(total=sum(m), rtot = sum(r)) #Table 10.8
## `summarise()` has grouped output by 'aged', 'stage', 'grade', 'xray'. You can override using the `.groups` argument.
head(nodal2)
## # A tibble: 6 × 7
## # Groups:   aged, stage, grade, xray [3]
##   aged  stage grade xray  acid  total  rtot
##   <fct> <fct> <fct> <fct> <fct> <dbl> <dbl>
## 1 0     0     0     0     0         4     0
## 2 0     0     0     0     1         6     1
## 3 0     0     0     1     0         1     0
## 4 0     0     0     1     1         1     0
## 5 0     0     1     0     0         2     1
## 6 0     0     1     0     1         1     1

First I’ll fit with the binary data

Ex1018bin.glm <- glm(r ~ aged+stage+grade+xray+acid, data = nodal, family = binomial)
plot(Ex1018bin.glm$linear.predictors, residuals(Ex1018bin.glm, type="deviance"), main = "Binary Fit")

plot(Ex1018bin.glm$fitted.values, residuals(Ex1018bin.glm, type="deviance"), main = "Binary Fit")

summary(Ex1018bin.glm)
## 
## Call:
## glm(formula = r ~ aged + stage + grade + xray + acid, family = binomial, 
##     data = nodal)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.332  -0.665  -0.300   0.639   2.150  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   -3.079      0.987   -3.12   0.0018 **
## aged1         -0.292      0.754   -0.39   0.6988   
## stage1         1.373      0.784    1.75   0.0799 . 
## grade1         0.872      0.816    1.07   0.2850   
## xray1          1.801      0.810    2.22   0.0263 * 
## acid1          1.684      0.791    2.13   0.0334 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.252  on 52  degrees of freedom
## Residual deviance: 47.611  on 47  degrees of freedom
## AIC: 59.61
## 
## Number of Fisher Scoring iterations: 5

And now with the binomial version (grouped in covariate classes as above)

Ex1018binom.glm <- glm(cbind(rtot,total-rtot) ~ 
                    aged + stage + grade + xray + acid, data = nodal2, 
                  family = binomial)

summary(Ex1018binom.glm)
## 
## Call:
## glm(formula = cbind(rtot, total - rtot) ~ aged + stage + grade + 
##     xray + acid, family = binomial, data = nodal2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.499  -0.773  -0.127   0.800   1.435  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   -3.079      0.987   -3.12   0.0018 **
## aged1         -0.292      0.754   -0.39   0.6988   
## stage1         1.373      0.784    1.75   0.0799 . 
## grade1         0.872      0.816    1.07   0.2850   
## xray1          1.801      0.810    2.22   0.0263 * 
## acid1          1.684      0.791    2.13   0.0334 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 40.710  on 22  degrees of freedom
## Residual deviance: 18.069  on 17  degrees of freedom
## AIC: 41.69
## 
## Number of Fisher Scoring iterations: 5
attributes(Ex1018binom.glm)
## $names
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"          
## 
## $class
## [1] "glm" "lm"
plot(Ex1018binom.glm$linear.predictors,residuals(Ex1018binom.glm, type="deviance"), main = "Binomial Fit")

plot(Ex1018binom.glm$fitted.values,residuals(Ex1018binom.glm, type="deviance"), main = "Binomial Fit")

qqnorm(residuals(Ex1018binom.glm, type="deviance"))

qqnorm(residuals(Ex1018binom.glm, type="pearson"))

plot(Ex1018binom.glm, main = "Binomial Fit, R default")

plot.glm.diag(Ex1018binom.glm, main = "Binomial Fit, Davison's SM")

?plot.glm.diag 

From the help file: “Makes plot of jackknife deviance residuals against linear predictor, normal scores plots of standardized deviance residuals, plot of approximate Cook statistics against leverage/(1-leverage), and case plot of Cook statistic.”

Poisson regression

This example in Chapter 3 of FELM is used to introduce Poisson regression. Faraway first considers ordinary linear regression, and notes that the residuals don’t seem to have constant variance. Back in the day we’d take logs or square root of the response and then fit linear regression again, but a more modern approach is to fit a Poisson glm.

The residual deviance in a Poisson model is a test of fit of the model, as it is with a Binomial. The second fit below allows an inflation factor on the variance, while still using the simplicity of the Poisson for fitting.

data(gala)
head(gala)
##              Species Endemics  Area Elevation Nearest Scruz Adjacent
## Baltra            58       23 25.09       346     0.6   0.6     1.84
## Bartolome         31       21  1.24       109     0.6  26.3   572.33
## Caldwell           3        3  0.21       114     2.8  58.7     0.78
## Champion          25        9  0.10        46     1.9  47.4     0.18
## Coamano            2        1  0.05        77     1.9   1.9   903.82
## Daphne.Major      18       11  0.34       119     8.0   8.0     1.84
help(gala)

gal.glm <- glm(Species ~ . - Endemics,
             family = poisson,
             data = gala)

summary(gal.glm)
## 
## Call:
## glm(formula = Species ~ . - Endemics, family = poisson, data = gala)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -8.275  -4.497  -0.944   1.917  10.185  
## 
## Coefficients:
##               Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)  3.1548079  0.0517495   60.96   < 2e-16 ***
## Area        -0.0005799  0.0000263  -22.07   < 2e-16 ***
## Elevation    0.0035406  0.0000874   40.51   < 2e-16 ***
## Nearest      0.0088256  0.0018213    4.85 0.0000013 ***
## Scruz       -0.0057094  0.0006256   -9.13   < 2e-16 ***
## Adjacent    -0.0006630  0.0000293  -22.61   < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: 889.7
## 
## Number of Fisher Scoring iterations: 5
gal.quasi <- glm(Species ~ . - Endemics,
                 family = quasipoisson,
                 data = gala)
summary(gal.quasi)
## 
## Call:
## glm(formula = Species ~ . - Endemics, family = quasipoisson, 
##     data = gala)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -8.275  -4.497  -0.944   1.917  10.185  
## 
## Coefficients:
##              Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)  3.154808   0.291590   10.82 0.0000000001 ***
## Area        -0.000580   0.000148   -3.92      0.00065 ***
## Elevation    0.003541   0.000493    7.19 0.0000001983 ***
## Nearest      0.008826   0.010262    0.86      0.39829    
## Scruz       -0.005709   0.003525   -1.62      0.11838    
## Adjacent    -0.000663   0.000165   -4.01      0.00051 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 31.75)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
plot(gal.quasi$linear.predictors, gal.quasi$residuals)

The canonical link for the Poisson is the log link. Identity links are sometimes used as well; see the cloth data analysis in SM Example 10.28.