% Residuals for STA302
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout

\title{Analysis of Residuals\footnote{See last slide for copyright information.}}
\subtitle{STA302 Fall 2016}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Residual means left over: $e_i = y_i - \widehat{y}_i$}
% \framesubtitle{}
\begin{itemize}
\item Vertical distance of $y_i$ from the regression hyper-plane. 
\item An error of ``prediction.'' \pause
\item Big residuals merit further investigation. \pause
\item Big compared to what? \pause
\item They are normally distributed. \pause
\item Consider standardizing. \pause
\item Maybe detect outliers. \pause
\item Plots can also be informative.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Residuals are like estimated error terms}

\begin{displaymath}
e_i = y_i - \widehat{y}_i ~~~ \Leftrightarrow ~~~ y_i = \widehat{y}_i + e_i
\end{displaymath} \pause
\vspace{5mm}
\begin{eqnarray*}
y_i & = & \widehat{y}_i + e_i \\ \pause
    & = & b_0 + b_1 x_{i,1} + \cdots + b_{k} x_{i,k} + e_i \\ \pause
    & = & \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{k} x_{i,k} + \epsilon_i
\end{eqnarray*} \pause
\vspace{5mm}

Normal distribution of $\epsilon_i$ implies normal distribution of $e_i$, \pause but the $e_i$ are not independent, and they do not have equal variance.
% They get independent as n -> infinity
\end{frame}

\begin{frame}
\frametitle{Data = Fit + Residual}

{\LARGE
\begin{displaymath}
y_i = \widehat{y}_i + e_i
\end{displaymath}
}

\end{frame}

\begin{frame}
\frametitle{Plotting residuals can be helpful} \pause
% \framesubtitle{}
\begin{itemize}
\item Against predicted $y$. \pause
\item Against explanatory variables not in the equation. \pause
\item Against explanatory variables in the equation. \pause
\item Look for serious departures from normality.
\end{itemize}
\end{frame}
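% A minimal R sketch (added as a comment, so it does not affect the slides)
% of the plots listed on the preceding frame. The object mod and the
% variable X3 are hypothetical names, not from the slides:
%   mod <- lm(Y ~ X1 + X2)
%   e <- residuals(mod)
%   plot(fitted(mod), e)   # against predicted y
%   plot(X3, e)            # against an explanatory variable not in the equation
%   plot(X1, e)            # against an explanatory variable in the equation
%   qqnorm(e); qqline(e)   # look for serious departures from normality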
\begin{frame}
\frametitle{Plot Residuals Against Explanatory Variables Not in the Equation} \pause
%\framesubtitle{}
\begin{center}
\includegraphics[width=3in]{resplot1}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Plot Residuals Against $\widehat{y}$} \pause
%\framesubtitle{}
\begin{center}
\includegraphics[width=3in]{resplot2}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Plot Residuals Against Explanatory Variables in the Equation}
\framesubtitle{Plot versus $X_1$ showed nothing} \pause
\begin{center}
\includegraphics[width=2.75in]{resplot3}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Plot Residuals Against Explanatory Variables in the Equation}
\framesubtitle{Can show non-constant variance} \pause
\begin{center}
\includegraphics[width=2.75in]{resplot4}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Outlier detection}
%\framesubtitle{}
\begin{itemize}
\item Big residuals may be outliers. What's ``big?'' \pause
\item Consider standardizing. \pause
\item Could divide by the square root of the sample variance of $e_1, \ldots, e_n$. \pause
\item Semi-Studentized: Estimate $Var(e_i)$ and divide by the square root of that: $\frac{e_i}{\sqrt{s^2\,(1-h_{i,i})}}$ \pause
\item In R, this is produced with \texttt{rstandard}. % rstandard
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Studentized deleted residuals}
\framesubtitle{The idea}
\begin{itemize}
\item An outlier will make $s^2$ big. \pause
\item In that case, the standardized (Semi-Studentized) residual will be too small -- less noticeable. \pause
\item So calculate $\widehat{y}$ for each observation based on all the other observations, but not that one. \pause
\item Predict each observed $y$ based on all the others, and assess the error of prediction (divided by its standard error). \pause
\item Big values suggest that the expected value of $y_i$ is not what it should be.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Apply prediction interval technology} \pause
%\framesubtitle{}
\begin{displaymath}
t = \frac{y_{0}-\mathbf{x}_{0}^\prime \mathbf{b}}
         {\sqrt{s^2(1+\mathbf{x}_{0}^\prime (X^\prime X)^{-1}\mathbf{x}_{0})}}
  \sim t(n-k-1)
\end{displaymath} \pause
\begin{itemize}
\item Note that $y_i$ is now being called $y_{0}$. \pause
\item If the ``prediction'' is too far off, there is trouble. \pause
\item Use $t$ as a test statistic. \pause
\item Need to change the notation.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Studentized deleted residual}
%\framesubtitle{}
\begin{displaymath}
e^*_i = \frac{y_i-\mathbf{x}_i^\prime \mathbf{b}_{(i)}}
             {\sqrt{s^2_{(i)}(1+\mathbf{x}_i^\prime (X_{(i)}^\prime X_{(i)})^{-1}\mathbf{x}_i)}}
  \sim t(n-k-2)
\end{displaymath} \pause
\begin{itemize}
\item In R, this is produced with \texttt{rstudent}. \pause
\item There is a more efficient formula. \pause
\item Use $e^*_i$ as a test statistic of $H_0: E(y_i) = \mathbf{x}_i^\prime \boldsymbol{\beta}$. \pause
\item If $H_0$ is rejected, investigate. \pause
\item We are doing $n$ tests. \pause
\item Type I errors are very time consuming and disturbing. \pause
\item If the tests were independent, the probability of no false positives would be \pause $(1-\alpha)^n \rightarrow 0$. \pause
\item But they are not independent. \pause
\item How about a Bonferroni correction?
\end{itemize}
\end{frame}
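% A small R sketch (again just a comment) connecting the last two frames to
% code. For a fitted lm object called mod -- a hypothetical name --
% rstandard() gives the standardized residuals e_i/sqrt(s^2(1-h_ii)) and
% rstudent() gives the Studentized deleted residuals:
%   r <- rstandard(mod)    # standardized residuals
%   tt <- rstudent(mod)    # Studentized deleted residuals, ~ t(n-k-2) under H0
%   which(abs(tt) > 2)     # rough screen; a Bonferroni cutoff is coming next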
\begin{frame}
\frametitle{Bonferroni Correction for Multiple Tests}
%\framesubtitle{}
\begin{itemize}
\item The curse of a thousand $t$-tests. \pause
\item If the null hypotheses of a collection of tests are all true, \pause hold the probability of rejecting one or more to less than $\alpha=0.05$. \pause
\item Based on Bonferroni's inequality: \pause
\begin{displaymath}
Pr\left\{ \cup_{j=1}^r A_j \right\} \leq \sum_{j=1}^r Pr\{A_j\}
\end{displaymath} \pause
\item Applies to any collection of $r$ tests. \pause
\item Assume all $r$ null hypotheses are true. \pause
\item Event $A_j$ is that null hypothesis $j$ is rejected. \pause
\item Do the tests as usual, obtaining $r$ test statistics. \pause
\item For each test, use the significance level $\alpha/r$ instead of $\alpha$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Use the significance level $\alpha/r$ instead of $\alpha$}
\framesubtitle{Bonferroni Correction for $r$ Tests} \pause

Assuming all $r$ null hypotheses are true, the probability of rejecting at least one is \pause
\begin{eqnarray*}
Pr\left\{ \cup_{j=1}^r A_j \right\} \pause & \leq & \sum_{j=1}^r Pr\{A_j\} \\ \pause
& = & \sum_{j=1}^r \alpha/r \\ \pause
& = & \alpha
\end{eqnarray*} \pause
Just use critical value(s) for $\alpha/r$ instead of $\alpha$.
\end{frame}

\begin{frame}
\frametitle{Advantages and disadvantages of the Bonferroni correction}
%\framesubtitle{}
\begin{itemize}
\item Advantage: Flexibility --- applies to any collection of hypothesis tests. \pause
\item Advantage: Easy to do. \pause
\item Disadvantage: Must know what all the tests are before seeing the data. \pause
\item Disadvantage: A little conservative; \pause the true joint significance level is less than $\alpha$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Application to Studentized deleted residuals} \pause
%\framesubtitle{}
\begin{itemize}
\item There are $r=n$ tests, one for each observation $i = 1, \ldots, n$. \pause
\item Use the critical value $t_{\frac{\alpha}{2n}}(n-k-2)$. \pause
\item Even for large $n$ it is not overly conservative.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/302f16}{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/302f16}}

\end{frame}

\end{document}
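
Bonferroni Test for Outliers with R

# A self-contained sketch of the Bonferroni-corrected outlier test from the
# last slides. This example is an addition to these notes, not one of the
# original simulations; all names and numbers here are made up.
set.seed(444)
n = 50; k = 2
x1 = rnorm(n,50,10); x2 = rnorm(n,50,10)
y = 10 + x1 - x2 + rnorm(n,0,10)
y[1] = y[1] + 60                        # plant an outlier at observation 1
mod = lm(y ~ x1 + x2)
tcrit = qt(1 - 0.05/(2*n), df = n-k-2)  # critical value t_{alpha/(2n)}(n-k-2)
which(abs(rstudent(mod)) > tcrit)       # should flag observation 1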
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Residual Plots with R

Plot Residuals Against Independent Variables Not in the Equation
E[Y|X1, X2] = beta0 + beta1 X1 + beta2 X2
Fit model with just X1
Say beta2 < 0

rm(list=ls())

rmvn <- function(nn,mu,sigma)
# Returns an nn by kk matrix, rows are independent MVN(mu,sigma)
    {
    kk <- length(mu)
    dsig <- dim(sigma)
    if(dsig[1] != dsig[2]) stop("Sigma must be square.")
    if(dsig[1] != kk) stop("Sizes of sigma and mu are inconsistent.")
    ev <- eigen(sigma,symmetric=TRUE)
    sqrl <- diag(sqrt(ev$values))
    PP <- ev$vectors
    ZZ <- rnorm(nn*kk) ; dim(ZZ) <- c(kk,nn)
    rmvn <- t(PP%*%sqrl%*%ZZ+mu)
    rmvn
    } # End of function rmvn

set.seed(9999)
n = 50
b0 = 10; b1 = 1; b2 = -1; sig = 10
Phi = rbind(c(100,50),
            c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + epsilon
cor(cbind(X1,X2,Y))
mod1 = lm(Y~X1); residual = mod1$residuals #$
plot(X2,residual,xlab=expression(X[2]),
     ylab=expression(paste('Residual from model with just ',X[1])))
title(expression(paste('True model has both ', X[1], ' and ',X[2])))

##########

Plot Residuals Against Y-hat

set.seed(999)
n = 50
b0 = 10; b1 = 1; b2 = 1/10; sig = 10
Phi = rbind(c(100,50),
            c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
# X2 = pnorm(X2,50,10) * 80 # Uniform on (0,80) but correlated with X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + 10*sqrt(900-(X2-50)^2) + epsilon # sqrt needs |X2-50| <= 30
cor(cbind(X1,X2,Y))
mod3 = lm(Y~X1+X2); residual = mod3$residuals; Yhat = mod3$fitted.values
plot(Yhat,residual, ylab="Residual")
title(expression(paste('Suspect Curvilinear Relationship with one or more X variables')))

##########

Plot Residuals Against Independent Variables in the Equation
Detect Curvilinear Relationship

set.seed(999)
n = 50
b0 = 10; b1 = 1; b2 = 1/10; sig = 10
Phi = rbind(c(100,50),
            c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
# X2 = pnorm(X2,50,10) * 80 # Uniform on (0,80) but correlated with X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + 10*sqrt(900-(X2-50)^2) + epsilon # sqrt needs |X2-50| <= 30
cor(cbind(X1,X2,Y))
mod3 = lm(Y~X1+X2); residual = mod3$residuals #$
plot(X2,residual,xlab=expression(X[2]),
     ylab=expression(paste('Residual from model with E(Y|',bold(X),') = ',
                           beta[0]+beta[1]*X[1]+beta[2]*X[2])))
title(expression(paste('Detect Curvilinear Relationship with ',X[2])))
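##########

# A hedged follow-up to the chunk above, added to these notes: if the plot
# against X2 suggests curvature, one standard remedy is to add a polynomial
# term in X2 and look at the residuals again. The true curve above is a
# half-circle, so a quadratic only approximates it; mod3b is a made-up name.
mod3b = lm(Y ~ X1 + X2 + I(X2^2))
residual3b = mod3b$residuals #$
plot(X2,residual3b,xlab=expression(X[2]),ylab='Residual')
title('Residuals after adding a quadratic term in X2')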
Plot Residuals Against Independent Variables in the Equation
Detect Variance Increasing with X1

set.seed(9999)
n = 50
b0 = 10; b1 = 1; b2 = 1
Phi = rbind(c(100,50),
            c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
sig = seq(from=1,to=20,length.out=n)
sig = sig[rank(X1)] # Sig an increasing function of X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + epsilon
mod2 = lm(Y~X1+X2); Residual = mod2$residuals #$
plot(X1,Residual,xlab=expression(X[1]))
title(expression(paste('Detect Variance Increasing with ',X[1])))
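##########

# Two extra checks on the last simulation, added to these notes as a sketch:
# a normal quantile plot of the residuals (the slides say to look for serious
# departures from normality), and a plot of |residual| against X1, which can
# make variance that increases with X1 easier to see.
qqnorm(Residual); qqline(Residual)
plot(X1,abs(Residual),xlab=expression(X[1]),ylab='|Residual|')
title(expression(paste('Absolute Residuals versus ',X[1])))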