data(wcgs, package = "faraway")
head(wcgs) #not run: str(wcgs); plot(wcgs); help(wcgs)
## age height weight sdp dbp chol behave cigs dibep chd typechd timechd
## 2001 49 73 150 110 76 225 A2 25 B no none 1664
## 2002 42 70 160 154 84 177 A2 20 B no none 3071
## 2003 42 69 160 110 78 181 B3 0 A no none 3071
## 2004 41 68 152 124 78 132 B4 20 A no none 3064
## 2005 59 70 150 144 86 255 B3 20 A yes infdeath 1885
## 2006 44 72 204 150 90 182 B4 0 A no none 3102
## arcus
## 2001 absent
## 2002 present
## 2003 absent
## 2004 absent
## 2005 present
## 2006 absent
sum(is.na(wcgs)) # there are a few missing values; much further sleuthing finds arcus and chol have some missing values
## [1] 14
Fitting a binary model:
glm(chd ~ ., family = binomial, data = wcgs)
## Warning: glm.fit: algorithm did not converge
##
## Call: glm(formula = chd ~ ., family = binomial, data = wcgs)
##
## Coefficients:
## (Intercept) age height weight
## 2.66e+01 -1.58e-14 -1.07e-13 1.02e-14
## sdp dbp chol behaveA2
## 1.97e-14 -2.10e-14 8.99e-16 -4.35e-13
## behaveB3 behaveB4 cigs dibepB
## -1.82e-13 -1.21e-13 -6.69e-15 NA
## typechdinfdeath typechdnone typechdsilent timechd
## 1.36e-08 -5.31e+01 -1.80e-07 9.53e-16
## arcuspresent
## 3.48e-13
##
## Degrees of Freedom: 3139 Total (i.e. Null); 3124 Residual
## (14 observations deleted due to missingness)
## Null Deviance: 1770
## Residual Deviance: 0.0000000182 AIC: 32
## typechd and timechd confounded with chd; behave and dibepB are the same variable recorded differently
heartmod <- glm(chd ~ age + height + weight + sdp + dbp + chol + behave + cigs, family = binomial, data = wcgs)
summary(heartmod)
##
## Call:
## glm(formula = chd ~ age + height + weight + sdp + dbp + chol +
## behave + cigs, family = binomial, data = wcgs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.419 -0.435 -0.316 -0.222 2.850
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.989034 2.336470 -5.56 0.00000002709290 ***
## age 0.065068 0.012125 5.37 0.00000008025581 ***
## height 0.015824 0.033141 0.48 0.6330
## weight 0.007874 0.003881 2.03 0.0425 *
## sdp 0.017536 0.006395 2.74 0.0061 **
## dbp 0.000121 0.010834 0.01 0.9911
## chol 0.011076 0.001522 7.28 0.00000000000034 ***
## behaveA2 0.088266 0.222589 0.40 0.6917
## behaveB3 -0.602705 0.244216 -2.47 0.0136 *
## behaveB4 -0.499214 0.320978 -1.56 0.1199
## cigs 0.020998 0.004280 4.91 0.00000093160928 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1779.2 on 3141 degrees of freedom
## Residual deviance: 1580.4 on 3131 degrees of freedom
## (12 observations deleted due to missingness)
## AIC: 1602
##
## Number of Fisher Scoring iterations: 6
heartmod2 <- update(heartmod, . ~ . - behave)
summary(heartmod2)
##
## Call:
## glm(formula = chd ~ age + height + weight + sdp + dbp + chol +
## cigs, family = binomial, data = wcgs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.346 -0.440 -0.324 -0.233 2.826
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.84290 2.31823 -5.97 0.00000000235323 ***
## age 0.06896 0.01204 5.73 0.00000001030382 ***
## height 0.01856 0.03310 0.56 0.5749
## weight 0.00827 0.00385 2.15 0.0316 *
## sdp 0.01806 0.00633 2.85 0.0043 **
## dbp 0.00126 0.01077 0.12 0.9067
## chol 0.01118 0.00151 7.42 0.00000000000012 ***
## cigs 0.02299 0.00426 5.40 0.00000006637269 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1779.2 on 3141 degrees of freedom
## Residual deviance: 1602.0 on 3134 degrees of freedom
## (12 observations deleted due to missingness)
## AIC: 1618
##
## Number of Fisher Scoring iterations: 6
anova(heartmod2, heartmod, test = "LRT") #default for test is NULL
## Analysis of Deviance Table
##
## Model 1: chd ~ age + height + weight + sdp + dbp + chol + cigs
## Model 2: chd ~ age + height + weight + sdp + dbp + chol + behave + cigs
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 3134 1602
## 2 3131 1580 3 21.5 0.000082 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a significant effect of behaviour; a check using levels(wcgs$behave)
shows that it has levels A1, A2, B1, B2. The variable dibep
collapses these to A and B.
heartmod3 <- update(heartmod, .~ . - behave + dibep )
summary(heartmod3)
##
## Call:
## glm(formula = chd ~ age + height + weight + sdp + dbp + chol +
## cigs + dibep, family = binomial, data = wcgs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.410 -0.435 -0.315 -0.223 2.839
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.55457 2.32014 -5.84 0.00000000515355 ***
## age 0.06477 0.01210 5.35 0.00000008610612 ***
## height 0.01600 0.03312 0.48 0.6289
## weight 0.00782 0.00388 2.02 0.0437 *
## sdp 0.01772 0.00637 2.78 0.0054 **
## dbp -0.00015 0.01082 -0.01 0.9890
## chol 0.01106 0.00152 7.27 0.00000000000036 ***
## cigs 0.02092 0.00427 4.90 0.00000098078442 ***
## dibepB 0.65338 0.14522 4.50 0.00000681630545 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1779.2 on 3141 degrees of freedom
## Residual deviance: 1580.7 on 3133 degrees of freedom
## (12 observations deleted due to missingness)
## AIC: 1599
##
## Number of Fisher Scoring iterations: 6
Quick check – how do we interpret the coefficent estimate for dibepB
?
The coefficient above for dibepB
is positive, and significantly so, suggesting an increase in risk of chd
for “type B” relative to “type A”. This is not consistent with the coefficients for behave
above, where both B1 and B2 showed a decrease risk relative to A1. What is going on?
xtabs(~ chd + behave, data = wcgs)
## behave
## chd A1 A2 B3 B4
## no 234 1177 1155 331
## yes 30 148 61 18
xtabs(~ chd + dibep, data = wcgs)
## dibep
## chd A B
## no 1486 1411
## yes 79 178
1155 + 331; 1177+234
## [1] 1486
## [1] 1411
It looks like dibepB
is actually dibepA
, if dibepA
is meant to be the aggregation of A1 and A2, which is what is says in the help file. The original paper is available on my web page. In their Table 3 (“Intake all ages”) we see 178 CHD in the Type A group and 79 CHD in the Type B group, as above once we switch the labels.
Now I’ll have a look at the nodal involvment data from SM. The data as presented in the book looks like But the dataset is recorded as binary:
data(nodal)
head(nodal); dim(nodal) # sum(nodal$m) = 53 not shown
## m r aged stage grade xray acid
## 1 1 1 0 1 1 1 1
## 2 1 1 0 1 1 1 1
## 3 1 1 0 1 1 1 1
## 4 1 1 0 1 1 1 1
## 5 1 1 0 1 1 1 1
## 6 1 0 0 1 1 1 1
## [1] 53 7
We can group them using the tidyverse
:
nodal2 <- nodal %>%
group_by(aged,stage,grade,xray,acid) %>%
summarise(total=sum(m), rtot = sum(r)) #Table 10.8
## `summarise()` has grouped output by 'aged', 'stage', 'grade', 'xray'. You can override using the `.groups` argument.
head(nodal2)
## # A tibble: 6 × 7
## # Groups: aged, stage, grade, xray [3]
## aged stage grade xray acid total rtot
## <fct> <fct> <fct> <fct> <fct> <dbl> <dbl>
## 1 0 0 0 0 0 4 0
## 2 0 0 0 0 1 6 1
## 3 0 0 0 1 0 1 0
## 4 0 0 0 1 1 1 0
## 5 0 0 1 0 0 2 1
## 6 0 0 1 0 1 1 1
First I’ll fit with the binary data
Ex1018bin.glm <- glm(r ~ aged+stage+grade+xray+acid, data = nodal, family = binomial)
plot(Ex1018bin.glm$linear.predictors, residuals(Ex1018bin.glm, type="deviance"), main = "Binary Fit")
plot(Ex1018bin.glm$fitted.values, residuals(Ex1018bin.glm, type="deviance"), main = "Binary Fit")
summary(Ex1018bin.glm)
##
## Call:
## glm(formula = r ~ aged + stage + grade + xray + acid, family = binomial,
## data = nodal)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.332 -0.665 -0.300 0.639 2.150
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.079 0.987 -3.12 0.0018 **
## aged1 -0.292 0.754 -0.39 0.6988
## stage1 1.373 0.784 1.75 0.0799 .
## grade1 0.872 0.816 1.07 0.2850
## xray1 1.801 0.810 2.22 0.0263 *
## acid1 1.684 0.791 2.13 0.0334 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 70.252 on 52 degrees of freedom
## Residual deviance: 47.611 on 47 degrees of freedom
## AIC: 59.61
##
## Number of Fisher Scoring iterations: 5
And now with the binomial version (grouped in covariate classes as above)
Ex1018binom.glm <- glm(cbind(rtot,total-rtot) ~
aged + stage + grade + xray + acid, data = nodal2,
family = binomial)
summary(Ex1018binom.glm)
##
## Call:
## glm(formula = cbind(rtot, total - rtot) ~ aged + stage + grade +
## xray + acid, family = binomial, data = nodal2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.499 -0.773 -0.127 0.800 1.435
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.079 0.987 -3.12 0.0018 **
## aged1 -0.292 0.754 -0.39 0.6988
## stage1 1.373 0.784 1.75 0.0799 .
## grade1 0.872 0.816 1.07 0.2850
## xray1 1.801 0.810 2.22 0.0263 *
## acid1 1.684 0.791 2.13 0.0334 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 40.710 on 22 degrees of freedom
## Residual deviance: 18.069 on 17 degrees of freedom
## AIC: 41.69
##
## Number of Fisher Scoring iterations: 5
attributes(Ex1018binom.glm)
## $names
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "call" "formula" "terms"
## [25] "data" "offset" "control"
## [28] "method" "contrasts" "xlevels"
##
## $class
## [1] "glm" "lm"
plot(Ex1018binom.glm$linear.predictors,residuals(Ex1018binom.glm, type="deviance"), main = "Binomial Fit")
plot(Ex1018binom.glm$fitted.values,residuals(Ex1018binom.glm, type="deviance"), main = "Binomial Fit")
qqnorm(residuals(Ex1018binom.glm, type="deviance"))
qqnorm(residuals(Ex1018binom.glm, type="pearson"))
plot(Ex1018binom.glm, main = "Binomial Fit, R default")
plot.glm.diag(Ex1018binom.glm, main = "Binomial Fit, Davison's SM")
?plot.glm.diag
From the help file: “Makes plot of jackknife deviance residuals against linear predictor, normal scores plots of standardized deviance residuals, plot of approximate Cook statistics against leverage/(1-leverage), and case plot of Cook statistic.”
This example in Chapter 3 of FELM is used to introduce Poisson regression. Faraway first considers ordinary linear regression, and notes that the residuals don’t seem to have constant variance. Back in the day we’d take logs or square root of the response and then fit linear regression again, but a more modern approach is to fit a Poisson glm.
The residual deviance in a Poisson model is a test of fit of the model, as it is with a Binomial. The second fit below allows an inflation factor on the variance, while still using the simplicity of the Poisson for fitting.
data(gala)
head(gala)
## Species Endemics Area Elevation Nearest Scruz Adjacent
## Baltra 58 23 25.09 346 0.6 0.6 1.84
## Bartolome 31 21 1.24 109 0.6 26.3 572.33
## Caldwell 3 3 0.21 114 2.8 58.7 0.78
## Champion 25 9 0.10 46 1.9 47.4 0.18
## Coamano 2 1 0.05 77 1.9 1.9 903.82
## Daphne.Major 18 11 0.34 119 8.0 8.0 1.84
help(gala)
gal.glm <- glm(Species ~ . - Endemics,
family = poisson,
data = gala)
summary(gal.glm)
##
## Call:
## glm(formula = Species ~ . - Endemics, family = poisson, data = gala)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.275 -4.497 -0.944 1.917 10.185
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.1548079 0.0517495 60.96 < 2e-16 ***
## Area -0.0005799 0.0000263 -22.07 < 2e-16 ***
## Elevation 0.0035406 0.0000874 40.51 < 2e-16 ***
## Nearest 0.0088256 0.0018213 4.85 0.0000013 ***
## Scruz -0.0057094 0.0006256 -9.13 < 2e-16 ***
## Adjacent -0.0006630 0.0000293 -22.61 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 3510.73 on 29 degrees of freedom
## Residual deviance: 716.85 on 24 degrees of freedom
## AIC: 889.7
##
## Number of Fisher Scoring iterations: 5
gal.quasi <- glm(Species ~ . - Endemics,
family = quasipoisson,
data = gala)
summary(gal.quasi)
##
## Call:
## glm(formula = Species ~ . - Endemics, family = quasipoisson,
## data = gala)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.275 -4.497 -0.944 1.917 10.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.154808 0.291590 10.82 0.0000000001 ***
## Area -0.000580 0.000148 -3.92 0.00065 ***
## Elevation 0.003541 0.000493 7.19 0.0000001983 ***
## Nearest 0.008826 0.010262 0.86 0.39829
## Scruz -0.005709 0.003525 -1.62 0.11838
## Adjacent -0.000663 0.000165 -4.01 0.00051 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 31.75)
##
## Null deviance: 3510.73 on 29 degrees of freedom
## Residual deviance: 716.85 on 24 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
plot(gal.quasi$linear.predictors, gal.quasi$residuals)
The canonical link for the Poisson is the log link. Identity links are sometimes used as well; see the cloth
data analysis in SM Example 10.28.