% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{AnnArbor} % CambridgeUS
% \usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
\usepackage{comment}
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Basic Regression Diagnostics\footnote{
This slide show is an open-source document. See last slide for copyright information.}}
\subtitle{STA 441 Spring 2024}
\date{} % To suppress date

\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{This description of regression diagnostics is very basic}

For a more complete and technical discussion, see
\vspace{5mm}
\begin{center}
\href{http://www.utstat.toronto.edu/brunner/oldclass/302f20/lectures/302f20Diagnostics.pdf}
{\tiny\texttt{http://www.utstat.toronto.edu/brunner/oldclass/302f20/lectures/302f20Diagnostics.pdf}}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Leverage (Influential $x$)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Outliers in $x$ can be influential}
%\framesubtitle{}
\begin{center}
\includegraphics[width=3.1in]{Leverage}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Or sometimes not}
%\framesubtitle{}
\begin{center}
\includegraphics[width=3.1in]{NotSoMuch}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Hat Values: Also called ``Leverage'' values}
%\framesubtitle{}
\begin{itemize}
\item For the record, the ``hat matrix'' is
$\mathbf{H} = [h_{ij}] = \mathbf{X}(\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime$.
\item Hat values, denoted $h_{ii}$, are the diagonal elements of this $n \times n$ matrix.
\item Big hat values indicate (multivariate) outliers in the $x$ variables.
\item So they could be influential.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Small hat values are good}
\framesubtitle{Denote a residual by $e_i = y_i - \widehat{y}_i$}
\begin{itemize}
\item $0 \leq h_{ii} \leq 1$. \pause
\item $\sum_{i=1}^n h_{ii} = p$ (the number of betas), so most hat values go to zero as $n \rightarrow \infty$. \pause
\item $Var(e_i - \epsilon_i) = \sigma^2 h_{ii}$. \pause For large samples, this variance is very small for most of the cases, and $e_i$ is probably close to $\epsilon_i$. \pause
\item This is a justification for residual analysis, in which we check whether the $e_i$ have the assumed properties of the $\epsilon_i$.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Robustness to normality}
\framesubtitle{Robust means strong}
\begin{itemize}
\item If $\displaystyle \lim_{n \rightarrow\infty} \max_i h_{ii} = 0$, then the distribution of $\widehat{\boldsymbol{\beta}}$ approaches a multivariate normal $N_p(\boldsymbol{\beta},\sigma^2(\mathbf{X}^\prime \mathbf{X})^{-1})$, even if the distribution of the $\epsilon_i$ is not normal.
\item[]
\item In this case, tests and confidence intervals based on the normal distribution are roughly okay for large samples.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{What is a ``small'' hat value?}
%\framesubtitle{}
\begin{itemize}
\item Rule of thumb: Worry about $h_{ii} > \frac{3p}{n}$ (or maybe $\frac{2p}{n}$).
\item Another rule of thumb (for multivariate normality of $\widehat{\boldsymbol{\beta}}$) is to worry about $h_{ii} > 0.2$.
\item Or just look at a histogram of hat values; see the R sketch on the next slide.
\end{itemize}
\end{frame}
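%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Added illustration: one way the rules of thumb above might be checked in R.
% The model and data frame names are hypothetical.
\begin{frame}[fragile]
\frametitle{Looking at hat values in R}
\framesubtitle{A minimal sketch; the model and data names are hypothetical}
{\footnotesize
\begin{verbatim}
mod = lm(y ~ x1 + x2, data = mydata)  # Hypothetical model
h = hatvalues(mod)                    # The h_ii
n = length(h); p = length(coef(mod))  # n and the number of betas
sum(h)                                # Should equal p
which(h > 3*p/n)                      # Flagged by the rule of thumb
max(h)                                # Worry about normality if > 0.2
hist(h)                               # Or just look at a histogram
\end{verbatim}
} % End size
\end{frame}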
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{What to do about large hat values?}
%\framesubtitle{}
\begin{itemize}
\item Investigate. Maybe it's a data error.
\item If the largest hat value is greater than 0.2, don't be so casual about normality.
\item Do the cases with large hat values also have large residuals?
\item Try temporarily setting the cases aside. Do the conclusions change?
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Residuals}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Residuals: $e_i = y_i - \widehat{y}_i$}
%\framesubtitle{}
\begin{eqnarray*}
e_i & = & y_i - \widehat{y}_i \\ \pause
y_i & = & \widehat{y}_i + e_i \\ \pause
    & = & b_0 + b_1 x_{i1} + b_2x_{i2} + \cdots + b_{p-1}x_{i,p-1} + e_i
\end{eqnarray*} \pause
So we have
{\Large
\begin{eqnarray*}
y_i & = & b_0 + b_1 x_{i1} + b_2x_{i2} + \cdots + b_{p-1}x_{i,p-1} + e_i \\
y_i & = & \beta_0 + \beta_1 x_{i1} + \beta_2x_{i2} + \cdots + \beta_{p-1}x_{i,p-1} + \epsilon_i
\end{eqnarray*}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Residuals: $e_i = y_i - \widehat{y}_i$} \pause
%\framesubtitle{}
\begin{itemize}
\item Investigate points that are unusually far from the regression plane.
\item Residual plots can reveal a lot; see the R sketch on the next slide.
\begin{itemize}
\item Against predicted $y$. %(Can show curvilinear trends, non-constant variance)
\item Against explanatory variables not in the equation. %(Can suggest including variables)
\item Against explanatory variables in the equation. %(Can show curvilinear trends)
\item Against time.
\item Look for serious departures from normality, outliers.
\end{itemize}
\end{itemize}
\end{frame}
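%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Added illustration: the residual plots on the preceding slide, as they might
% be produced in R. The variable names (y, x1, x2, w, time) are hypothetical.
\begin{frame}[fragile]
\frametitle{Residual plots in R}
\framesubtitle{A minimal sketch; the model and variable names are hypothetical}
{\footnotesize
\begin{verbatim}
mod = lm(y ~ x1 + x2, data = mydata)  # Hypothetical model
e = residuals(mod); yhat = fitted(mod)
plot(yhat, e)         # Against predicted y
plot(mydata$w, e)     # Against a variable not in the equation
plot(mydata$x1, e)    # Against a variable in the equation
plot(mydata$time, e)  # Against time, if the data are time ordered
plot(yhat, abs(e))    # Absolute residuals show non-constant variance
hist(e)               # Departures from normality, outliers
\end{verbatim}
} % End size
\end{frame}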
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plot Residuals Against Explanatory Variables Not in the Equation} \pause
%\framesubtitle{}
\begin{center}
\includegraphics[width=2.8in]{resplot1}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plot Residuals Against $\widehat{y}$}
\framesubtitle{$e_i$ and $\widehat{y}$ should be independent} \pause
\begin{center}
\includegraphics[width=3in]{resplot2}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plot Residuals Against Explanatory Variables in the Equation}
\framesubtitle{Plot versus $X_1$ showed nothing} \pause
\begin{center}
\includegraphics[width=2.75in]{resplot3}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plot Residuals Against Predicted $y$}
\framesubtitle{Can show non-constant variance} \pause
\begin{center}
\includegraphics[width=3in]{resplot3b}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plot \textbf{Absolute} Residuals Against Predicted $y$}
\framesubtitle{Looking for non-constant variance}
\begin{center}
\includegraphics[width=3in]{resplot3c}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plot Residuals Against Explanatory Variables in the Equation}
\framesubtitle{Can show non-constant variance} \pause
\begin{center}
\includegraphics[width=2.75in]{resplot4}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plot Residuals Against Time, if the data are time ordered}
%\framesubtitle{}
\begin{itemize}
\item You really need to watch out for time ordered data.
\item Standard regression methods may not be appropriate. \pause
\item The problem is that $\boldsymbol{\epsilon}$ represents all the other variables that are left out of the regression equation.
\item Some of them could be time dependent.
\item This would make the $\epsilon_i$ non-independent, possibly yielding misleading results.
\item Check the Durbin-Watson statistic.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Outlier detection}
%\framesubtitle{}
\begin{itemize}
\item Big residuals may be outliers. What's ``big''? \pause
\item Standardized residuals
\item Deleted residuals
\item Various combinations
\item Studentized deleted residuals
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Studentized deleted residuals}
\framesubtitle{The idea}
\begin{itemize}
\item Calculate $\widehat{y}$ for each observation based on all the other observations, but not that one. Leave one out.
\item Predict each observed $y$ based on all the others, and assess the error of prediction (divided by its standard error). \pause
\item Big values suggest that the expected value of $y_i$ is not what it should be.
\item Studentized deleted residuals have $t$ distributions if the model is correct.
\item Treat them as test statistics.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Studentized deleted residual}
%\framesubtitle{}
\begin{displaymath}
t_i = \frac{y_i-\widehat{y}_{(i)}}{se_{(i)}} \pause
    = \frac{y_i-\mathbf{x}_i^\prime \widehat{\boldsymbol{\beta}}_{(i)}}{se_{(i)}} \pause
    \sim t(n-1-p)
\end{displaymath} \pause
\begin{itemize}
\item If the model is correct, the numerator has expected value zero.
\item Use $t_i$ as a test statistic. If $H_0$ is rejected, investigate. \pause
\item We are doing $n$ tests.
\item If all null hypotheses are true (no outliers), there is still a good chance of rejecting at least one $H_0$.
\item Type I errors are very time consuming and disturbing.
\item How about a Bonferroni correction?
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Bonferroni Correction for Multiple Tests}
%\framesubtitle{}
\begin{itemize}
\item Do the tests as usual, obtaining $n$ test statistics.
\item Could multiply the $p$-values by $n$.
\item It is easier to use the critical value $t_{\frac{\alpha}{2n}}(n-1-p)$; see the R sketch on the next slide. \pause
\item Even for large $n$ it is not overly conservative.
\item If you locate an outlier, \textbf{investigate}!
\end{itemize}
\end{frame}
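%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Added illustration: the Bonferroni-corrected outlier test as it might be
% carried out in R. The model is hypothetical.
\begin{frame}[fragile]
\frametitle{Studentized deleted residuals in R}
\framesubtitle{A minimal sketch; the model is hypothetical}
{\footnotesize
\begin{verbatim}
mod = lm(y ~ x1 + x2, data = mydata)    # Hypothetical model
ti = rstudent(mod)                      # Studentized deleted residuals
n = length(ti); p = length(coef(mod))
alpha = 0.05
crit = qt(1 - alpha/(2*n), df = n-1-p)  # Bonferroni critical value
which(abs(ti) > crit)                   # Cases to investigate
\end{verbatim}
} % End size
\end{frame}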
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Normality}
%\framesubtitle{}
\begin{itemize}
\item Don't worry too much if the maximum hat value is less than 0.2. \pause
\item Instead of checking the residuals for normality, I like to check the Studentized deleted residuals.
\item Their variances are all equal.
\item And for a healthy sample size, $t$ is almost $z$. \pause
\item Look at a histogram. There are also formal tests in \texttt{proc univariate normal}; an R version is sketched on the next slide.
\end{itemize}
\end{frame}
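%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Added illustration: checking Studentized deleted residuals for normality in
% R (rather than SAS proc univariate). The model is hypothetical.
\begin{frame}[fragile]
\frametitle{Checking normality in R}
\framesubtitle{A minimal sketch; the model is hypothetical}
{\footnotesize
\begin{verbatim}
mod = lm(y ~ x1 + x2, data = mydata)  # Hypothetical model
sdr = rstudent(mod)       # Studentized deleted residuals
hist(sdr)                 # Look at a histogram
qqnorm(sdr); qqline(sdr)  # Normal quantile-quantile plot
shapiro.test(sdr)         # A formal test of normality
\end{verbatim}
} % End size
\end{frame}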
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely.

The \LaTeX~source code is available from the course website:
\vspace{5mm}
\begin{center}
\href{http://www.utstat.toronto.edu/brunner/oldclass/441s24}
{\small\texttt{http://www.utstat.toronto.edu/brunner/oldclass/441s24}}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}

\begin{comment}

##########
# Example of influential x outlier
rm(list=ls())
set.seed(9999)
n = 50; beta0 = 10; beta1 = 1; sigma = 1
x = runif(n,1,10); epsilon = rnorm(n,0,sigma)
y = beta0 + beta1*x + epsilon
pdf(file = 'Leverage.pdf')
plot(x, y, xlim = c(0,20), ylim = c(7,30))
pt = c(19,8); points(pt[1],pt[2],pch=16)
x = c(x,pt[1]); y = c(y,pt[2])
mod = lm(y~x)
b = coefficients(mod); b
lines(c(0,20), c(b[1], b[1]+20*b[2]))
dev.off()

##########
# Example of non-influential x outlier
x = x[-(n+1)]; y = y[-(n+1)]
pdf(file = 'NotSoMuch.pdf')
plot(x, y, xlim = c(0,20), ylim = c(7,30))
pt = c(19, beta0+beta1*19); points(pt[1],pt[2],pch=16)
x = c(x,pt[1]); y = c(y,pt[2])
mod = lm(y~x)
b = coefficients(mod); b
lines(c(0,20), c(b[1], b[1]+20*b[2]))
dev.off()

##########
# Plot Residuals Against Independent Variables Not in the Equation
# E[Y|X1,X2] = beta0 + beta1 X1 + beta2 X2
# Fit model with just X1; say beta2 < 0

rm(list=ls())
rmvn <- function(nn,mu,sigma)
# Returns an nn by kk matrix, rows are independent MVN(mu,sigma)
{
    kk <- length(mu)
    dsig <- dim(sigma)
    if(dsig[1] != dsig[2]) stop("Sigma must be square.")
    if(dsig[1] != kk) stop("Sizes of sigma and mu are inconsistent.")
    ev <- eigen(sigma,symmetric=T)
    sqrl <- diag(sqrt(ev$values))
    PP <- ev$vectors
    ZZ <- rnorm(nn*kk); dim(ZZ) <- c(kk,nn)
    rmvn <- t(PP%*%sqrl%*%ZZ+mu)
    rmvn
} # End of function rmvn

set.seed(9999)
n = 50
b0 = 10; b1 = 1; b2 = -1; sig = 10
Phi = rbind(c(100,50),
            c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + epsilon
cor(cbind(X1,X2,Y))
mod1 = lm(Y~X1); residual = mod1$residuals #$
plot(X2,residual,xlab=expression(X[2]),
     ylab=expression(paste('Residual from model with just ',X[1])))
title(expression(paste('True model has both ', X[1], ' and ',X[2])))

##########
# Plot residuals against predicted y: suspect a curvilinear relationship

set.seed(999)
n = 50
b0 = 10; b1 = 1; b2 = 1/10; sig = 10
Phi = rbind(c(100,50),
            c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
# X2 = pnorm(X2,50,10) * 80 # Uniform on (0,80) but correlated with X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + 10*sqrt(900-(X2-50)^2) + epsilon
cor(cbind(X1,X2,Y))
mod3 = lm(Y~X1+X2); residual = mod3$residuals; Yhat = mod3$fitted.values
plot(Yhat,residual, ylab="Residual")
title(expression(paste('Suspect Curvilinear Relationship with one or more X variables')))

##########
# Plot Residuals Against Independent Variables in the Equation
# Detect Curvilinear Relationship

set.seed(999)
n = 50
b0 = 10; b1 = 1; b2 = 1/10; sig = 10
Phi = rbind(c(100,50),
            c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
# X2 = pnorm(X2,50,10) * 80 # Uniform on (0,80) but correlated with X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + 10*sqrt(900-(X2-50)^2) + epsilon
cor(cbind(X1,X2,Y))
mod3 = lm(Y~X1+X2); residual = mod3$residuals #$
plot(X2,residual,xlab=expression(X[2]),
     ylab=expression(paste('Residual from model with E(Y|',bold(X),') = ',
                           beta[0]+beta[1]*X[1]+beta[2]*X[2])))
title(expression(paste('Detect Curvilinear Relationship with ',X[2])))
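##########
# Added sketch (not in the original deck): checking the Durbin-Watson
# statistic for time ordered data, assuming the lmtest package is installed.
# The data are simulated here; in practice, test your own fitted model.
library(lmtest)
set.seed(9999)
tt = 1:50                                        # Time index
y = 10 + 0.5*tt + arima.sim(list(ar=0.7), n=50)  # Autocorrelated errors
mod = lm(y ~ tt)
dwtest(mod)  # Durbin-Watson test; d near 2 suggests no autocorrelation
plot(tt, residuals(mod))  # Residuals against time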
##########
# Plot Residuals Against Independent Variables in the Equation:
# Detect Variance Increasing with X1

set.seed(9999)
n = 50
b0 = 10; b1 = 1; b2 = 1
Phi = rbind(c(100,50),
            c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
sig = seq(from=1,to=20,length.out=n)
sig = sig[rank(X1)] # Sig an increasing function of X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + epsilon
mod2 = lm(Y~X1+X2); Residual = mod2$residuals #$

# Try against y-hat
yhat = fitted(mod2)
plot(yhat,Residual,xlab='Predicted y', main = 'Detect non-constant variance')

# Absolute value of residual
plot(yhat,abs(Residual),xlab='Predicted y',ylab = 'Absolute residual')
title('Non-constant variance is easier to see')
summary(lm(abs(Residual) ~ yhat))

# Try against X1
plot(X1,Residual,xlab=expression(X[1]))
title(expression(paste('Detect Variance Increasing with ',X[1])))

\end{comment}