% Residuals for STA302
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
%\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usetheme{AnnArbor}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
\usepackage{comment}
\usepackage{alltt} % For colouring in verbatim environment
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout
\title{Regression Diagnostics\footnote{See last slide for copyright information.}}
\subtitle{STA302 Fall 2020}
\date{} % To suppress date

\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{comment}
\begin{frame}
\frametitle{Diagnostics: Sort of a medical check-up of the regression model} \pause
%\framesubtitle{}
\begin{enumerate}
\item Looking at residuals can sometimes reveal problems with the model or the data. \pause
\item Are some observations having a large influence?
\end{enumerate}
\end{frame}
\end{comment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Residuals and Hat Values}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$\widehat{\boldsymbol{\epsilon}}$ estimates $\boldsymbol{\epsilon}$.} \pause
%\framesubtitle{$y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$}
\pause
\begin{itemize}
\item If $\widehat{\boldsymbol{\epsilon}}$ does not act like $\boldsymbol{\epsilon}$ should\pause, investigate. \pause
\item Perhaps fix the model or the data.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$\widehat{\boldsymbol{\epsilon}}$ estimates $\boldsymbol{\epsilon}$?}
\framesubtitle{$y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$} \pause
\begin{itemize}
\item First of all, it's a little strange because $\boldsymbol{\epsilon}$ is random. \pause
% It's true that epsilon is unknown, but usually we estimate parameters that are fixed constants.
\item But they are analogous.
\begin{itemize}
\item $\widehat{\epsilon}_i$ are vertical distances of the $y_i$ from the estimated regression plane.
\item $\epsilon_i$ are vertical distances of the $y_i$ from the true regression plane.
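% A small commented-out R sketch of this correspondence (simulated data, so the true
% epsilon is known; all object names below are made up for illustration):
%   set.seed(1)
%   n <- 100
%   x1 <- rnorm(n); x2 <- rnorm(n)
%   epsilon <- rnorm(n, 0, 5)            # true errors
%   y <- 10 + 1*x1 - 1*x2 + epsilon      # true regression plane plus error
%   fit <- lm(y ~ x1 + x2)
%   epsilonhat <- residuals(fit)         # distances from the estimated plane
%   plot(epsilon, epsilonhat)            # estimated versus true errors
%   cor(epsilon, epsilonhat)             # typically close to one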
\end{itemize} \pause \item The vector of residuals is defined as \begin{eqnarray*} && \widehat{\boldsymbol{\epsilon}} = \mathbf{y} - \widehat{\mathbf{y}} = \mathbf{y} - \mathbf{X}\widehat{\boldsymbol{\beta}}\\ \pause &\Rightarrow& \mathbf{y} = \mathbf{X}\widehat{\boldsymbol{\beta}} + \widehat{\boldsymbol{\epsilon}} \\ \pause \mbox{Compare} && \mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon} \end{eqnarray*} \pause \item Is it a good estimate? \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Distribution: $\boldsymbol{\epsilon} \sim N_n(\mathbf{0},\sigma^2\mathbf{I}_n)$, $\widehat{\boldsymbol{\epsilon}} \sim N_n(\mathbf{0},\sigma^2(\mathbf{I}_n-\mathbf{H}))$} \pause %\framesubtitle{} \begin{itemize} \item Both are multivariate normal with expected value zero. \pause \item $\widehat{\epsilon}_i$ do not have equal variance. \item $\widehat{\epsilon}_i$ are not independent. \pause \item It's not as bad as it seems, because most of $\mathbf{H}$ goes to zero as $n \rightarrow \infty$. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Is $\widehat{\epsilon}_i$ close to $\epsilon_i$? Look at $\widehat{\boldsymbol{\epsilon}} - \boldsymbol{\epsilon}$.} \pause %\framesubtitle{} \begin{itemize} \item $\widehat{\boldsymbol{\epsilon}} - \boldsymbol{\epsilon}$ is multivariate normal. \pause \item $E(\widehat{\boldsymbol{\epsilon}} - \boldsymbol{\epsilon}) = \mathbf{0}-\mathbf{0} = \mathbf{0}$. \pause \end{itemize} \begin{eqnarray*} cov(\widehat{\boldsymbol{\epsilon}} - \boldsymbol{\epsilon}) & = & cov(\mathbf{y} - \widehat{\mathbf{y}} - \boldsymbol{\epsilon}) \\ \pause & = & cov(\mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon} - \widehat{\mathbf{y}} - \boldsymbol{\epsilon}) \\ \pause & = & cov(\mathbf{X} \boldsymbol{\beta} - \widehat{\mathbf{y}} ) \\ \pause & = & cov(- \widehat{\mathbf{y}} ) \\ \pause & = & cov(-\widehat{\mathbf{y}}, -\widehat{\mathbf{y}}) \\ \pause & = & cov(\widehat{\mathbf{y}} ) \\ \pause & = & \sigma^2\mathbf{H} \end{eqnarray*} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{$\widehat{\boldsymbol{\epsilon}} - \boldsymbol{\epsilon} \sim N_n(\mathbf{0},\sigma^2\mathbf{H})$} \pause %\framesubtitle{} \begin{itemize} \item Denoting $\mathbf{H}$ by $[h_{ij}]$, \pause $Var(\widehat{\epsilon}_i - \epsilon_i) = \sigma^2h_{ii}$. \pause \item Diagonal elements $h_{ii}$ of the hat matrix are sometimes called ``hat values." \pause \item Most of the hat values are small. Recall \pause \begin{eqnarray*} tr(\mathbf{H}) & = & tr\left( \mathbf{X}(\mathbf{X}^\prime\mathbf{X})^{-1} {\color{blue}\mathbf{X}^\prime} \right) \\ & = & tr\left( {\color{blue}\mathbf{X}^\prime}\mathbf{X} (\mathbf{X}^\prime\mathbf{X})^{-1} \right) \\ & = & tr\left( \mathbf{I}_{_{k+1}} \right) \\ & = & k+1 \end{eqnarray*} \pause \item So $\sum_{i=1}^n h_{ii} = k+1$ even as $n$ increases. \pause \item The average hat value goes to zero. \pause \item For large samples, $Var(\widehat{\epsilon}_i - \epsilon_i) = \sigma^2h_{ii}$ is very small most of the time, and $\widehat{\epsilon}_i$ is probably close to $\epsilon_i$. 
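% A quick numerical check of these facts (continuing the simulated example sketched
% earlier in the source comments; names are illustrative only):
%   X <- model.matrix(fit)                   # n x (k+1) matrix of predictors
%   H <- X %*% solve(t(X) %*% X) %*% t(X)    # hat matrix
%   sum(diag(H))                             # trace = k+1 (3 in this example)
%   max(abs(diag(H) - hatvalues(fit)))       # essentially zero: same hat values from lm
%   mean(hatvalues(fit))                     # average hat value (k+1)/n, small for large n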
\end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{How about Independence?} \pause %\framesubtitle{} \begin{itemize} \item $cov(\widehat{\boldsymbol{\epsilon}}) = \sigma^2(\mathbf{I}_n-\mathbf{H})$, so the residuals are not independent. \pause \item Let the $n \times 1$ vector $\mathbf{v}_j$ be all zeros except for a one in position $j$. \pause \item Construct a \emph{selection matrix}: the $n \times 2$ partitioned matrix $\mathbf{S} = (\mathbf{v}_i|\mathbf{v}_j)$. \pause \begin{displaymath} \mathbf{S}^\prime \mathbf{H} \mathbf{S} = \left( \begin{array}{cc} h_{ii} & h_{ij} \\ h_{ij} & h_{jj} \end{array} \right) = \mathbf{M} \end{displaymath} \pause \item $\mathbf{M}$ is non-negative definite because $\mathbf{a}^\prime \mathbf{Ma} \pause = \mathbf{a}^\prime \mathbf{S}^\prime \mathbf{H} \mathbf{Sa} \pause = (\mathbf{Sa})^\prime \mathbf{H(Sa)} \pause = \mathbf{v}^\prime \mathbf{Hv} \pause \geq 0$. \pause \item So the eigenvalues of $\mathbf{M}$ are $\geq 0$. \pause \item $ \Longrightarrow |\mathbf{M}| \pause = h_{ii}h_{jj} - h_{ij}^2 \geq 0$. \pause \item $ \Longrightarrow h_{ii}h_{jj} \geq h_{ij}^2$. \pause \item $ \Longrightarrow |h_{ij}| \leq \sqrt{h_{ii}h_{jj}}$ \item And $h_{ij} \rightarrow 0$ if either $h_{ii} \rightarrow 0$ or $h_{jj} \rightarrow 0$. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Conclusion: For large samples,} %\framesubtitle{} \begin{itemize} \item $\widehat{\epsilon}_i$ is a good approximation of $\epsilon_i$, as long as $h_{ii}$ is small. \item $\widehat{\epsilon}_i$ and $\widehat{\epsilon}_j$ are almost independent if either $h_{ii}$ is small or $h_{jj}$ is small (or both). \pause \item In this case, the $\widehat{\epsilon}_i$ should behave very much like the $\epsilon_i$ if the model is correct. \pause \item This is the basis of residual plots, where $\widehat{\epsilon}_i$ are treated as if they were $\epsilon_i$. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Another good thing about small hat values} \framesubtitle{Theorem 5.1 on p.~106 of Sen and Srivastava's \emph{Regression Analysis}} \pause If $\displaystyle \lim_{n \rightarrow\infty} \max_i h_{ii} = 0$, \pause then the distribution of $\widehat{\boldsymbol{\beta}}$ approaches a multivariate normal % \linebreak \linebreak $N_{k+1}(\boldsymbol{\beta},\sigma^2(\mathbf{X}^\prime \mathbf{X})^{-1})$, even if the distribution of the $\epsilon_i$ is not % \linebreak normal. \pause \vspace{10mm} In this case, tests and confidence intervals based on the normal distribution are roughly okay for large samples (details omitted). \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{What is a ``small" hat value?} \pause %\framesubtitle{} \begin{itemize} \item Because $\mathbf{H}$ is non-negative definite, $h_{ii} \geq 0$. \pause \item Because $\mathbf{I-H}$ is non-negative definite, $1-h_{ii} \geq 0 \pause \iff h_{ii} \leq 1$. \pause \item So mathematically, $0 \leq h_{ii} \leq 1$. \pause \item Rule of thumb:~\pause Worry about $h_{ii} > \frac{2(k+1)}{n}$ (Page 236). \pause \item Another rule of thumb (for multivariate normality of $\widehat{\boldsymbol{\beta}}$) is worry about $h_{ii} > 0.2$. \pause \item Or just look at a histogram of hat values (Page 236). 
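% One way to apply these rules of thumb in R (a sketch only; assumes a fitted lm
% object called fit with k explanatory variables):
%   h <- hatvalues(fit)
%   n <- length(h); kp1 <- length(coef(fit))   # kp1 = k+1
%   hist(h)                                    # look at the whole distribution
%   which(h > 2*kp1/n)                         # first rule of thumb
%   which(h > 0.2)                             # second rule of thumb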
\end{itemize}
\end{frame}

% Decent homework about this. 0 < h_ii (strictly) and in fact 1/n < h_ii. See text for proof of the latter. Unfortunately, it depends on the centered model, which we have not done yet. Also requires intercept. Maybe later in the course.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{What causes large $h_{ii}$ values?} \pause
%\framesubtitle{}
\begin{itemize}
\item They correspond to multivariate outliers in the $x$ variables. \pause
\item The hat value $h_{ii}$ is an increasing function of the distance between the vector $\mathbf{x}^\prime_i$ and the vector of sample means $(1,\overline{\mathbf{x}}_1, \overline{\mathbf{x}}_2, \ldots, \overline{\mathbf{x}}_k)^\prime$. \pause
\item See Theorem 9.2 (iii) on p.~231.
\end{itemize}
\end{frame}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Multivariate outliers can be hard to spot}
%\framesubtitle{}
\begin{center}
\includegraphics[width=3.1in]{Outlier}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Leverage}
\framesubtitle{Hat values $h_{ii}$ are sometimes called ``leverage" values}
\begin{center}
\includegraphics[width=3.1in]{Leverage}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Easy Moral of the Story} \pause
%\framesubtitle{}
\begin{itemize}
\item Start by checking for large hat values. \pause
\item Look for $h_{ii} > \frac{2(k+1)}{n}$ or $h_{ii} > 0.2$. \pause
\item Plots are useful -- maybe just a histogram. \pause
\item If hat values are big, look at the $x$ values. \pause
\item If the hat values are okay, start looking at residuals.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Residual Plots}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plotting residuals can be helpful} \pause
% \framesubtitle{}
\begin{itemize}
\item Against predicted $y$. \pause
\item Against explanatory variables not in the equation. \pause
\item Against explanatory variables in the equation. \pause
\item Against time. \pause
\item Look for serious departures from normality, outliers.
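% The plots in this list take only a line or two each in R (sketch; assumes a fitted
% lm object called fit, with x1 in the model and z a candidate variable not in the
% model -- both names hypothetical):
%   r <- residuals(fit)
%   plot(fitted(fit), r)        # against predicted y
%   plot(x1, r)                 # against an explanatory variable in the equation
%   plot(z, r)                  # against an explanatory variable not in the equation
%   plot(r, type = "b")         # against time order, if the data are time ordered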
\end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Plot Residuals Against Explanatory Variables Not in the Equation} \pause %\framesubtitle{} \begin{center} \includegraphics[width=2.8in]{resplot1} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Plot Residuals Against $\widehat{y}$} \pause %\framesubtitle{} \begin{center} \includegraphics[width=3in]{resplot2} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Plot Residuals Against Explanatory Variables in the Equation} \framesubtitle{Plot versus $X_1$ showed nothing} \pause \begin{center} \includegraphics[width=2.75in]{resplot3} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Plot Residuals Against Predicted $y$} \framesubtitle{Can show non-constant variance} \pause \begin{center} \includegraphics[width=3in]{resplot3b} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Plot Residuals Against Explanatory Variables in the Equation} \framesubtitle{Can show non-constant variance} \pause \begin{center} \includegraphics[width=2.75in]{resplot4} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Faraway has some suggestions} \framesubtitle{\emph{Linear models with R}} \pause \begin{itemize} \item Plot $\widehat{y}$ by $|\widehat{\epsilon}|$\pause, making change in spread easier to see. \pause \item Do a regression with $x=\widehat{y}$ and $y = |\widehat{\epsilon}|$, and look at the test of $H_0: \beta_1=0$. \pause \item If the model is correct, $\widehat{y}$ and $|\widehat{\epsilon}|$ should be independent. \pause \item The distribution theory behind the test does not quite work, but it can give a rough indication to supplement your inspection of the plots. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Plot Residuals Against Time, if the data are time ordered} \pause %\framesubtitle{} \begin{itemize} \item You really need to watch out for time ordered data. \pause \item Regression methods from this course may not be appropriate. \pause \item The problem is that $\mathbf{\epsilon}$ represents all other variables that are left out of the regression equation. \pause \item Some of them could be time dependent. \pause \item This would make the $\epsilon_i$ non-independent, possibly yielding misleading results. 
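% Faraway's suggestion above, as a rough R sketch (assumes a fitted lm object called
% fit; this parallels code in the notes at the end of this file):
%   yhat <- fitted(fit); absres <- abs(residuals(fit))
%   plot(yhat, absres)               # change in spread is easier to see
%   summary(lm(absres ~ yhat))       # rough check of H0: beta1 = 0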
\end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Plot Residuals Against Time} \framesubtitle{There should be no visible pattern} \pause \begin{center} \includegraphics[width=3in]{tsplot1a} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Plot Residuals Against Time} \framesubtitle{There should be no visible pattern} \begin{center} \includegraphics[width=3in]{tsplot1b} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Autocorrelated errors} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{It's not always so easy} %\framesubtitle{} \begin{columns} \column{0.5\textwidth} \includegraphics[width=2in]{tsplot1b} \column{0.5\textwidth} \begin{itemize} \item Looks like an increasing trend. We will include time in the model. \pause \item But it's not always so clear. \pause \item A test would be nice. \pause \item The key is that the higher $\widehat{\epsilon}_t$ is, the higher $\widehat{\epsilon}_{t+1}$ tends to be. \pause \item This is typical of many time series structures, not just trends. \end{itemize} \end{columns} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Lagged variables} \framesubtitle{Assume the observations (cases) are ordered in time} \pause \begin{itemize} \item The value of a variable lag one is the value of the variable one time period ago. \pause \item Like yesterday's high temperature. \pause \item The value of a variable lag six is the value of the variable six time periods ago. \pause \item Regression with lagged $x$ variables (and maybe un-lagged as well) is a natural thing to do. \pause \item If a lagged $x$ variable is related to $y$, it could be called a ``leading indicator." \pause \item Like a leading indicator of number of deaths from covid-19 could be number of covid-19 infections four weeks ago. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Lagged Residuals} %\framesubtitle{} \begin{center} \renewcommand{\arraystretch}{1.5} \begin{tabular}{cc} Residual & Residual Lag One \\ \hline $\widehat{\epsilon}_1$ & \texttt{NA} \\ $\widehat{\epsilon}_2$ & $\widehat{\epsilon}_1$ \\ $\widehat{\epsilon}_3$ & $\widehat{\epsilon}_2$ \\ $\widehat{\epsilon}_4$ & $\widehat{\epsilon}_3$ \\ \vdots & \vdots \\ $\widehat{\epsilon}_{n-1}$ & $\widehat{\epsilon}_{n-2}$ \\ $\widehat{\epsilon}_n$ & $\widehat{\epsilon}_{n-1}$ \\ \end{tabular} \renewcommand{\arraystretch}{1.0} \end{center} \pause Compute the sample correlation. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Sample autocorrelation} \pause %\framesubtitle{} \begin{itemize} \item Correlation between a variable and its lag one is called an \emph{autocorrelation}. \pause \item Specifically, the first order autocorrelation. \pause \item Correlation with lag 2 is the second order autocorrelation, etc. \pause \item The sample autocorrelation is an estimate of the population autocorrelation. \pause \item Of the $\epsilon_i$ values, not just the $\widehat{\epsilon}_i$. \pause \item Can reveal lack of independence. \pause \item The most common form is positive autocorrelation. 
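% Computing the first-order sample autocorrelation of the residuals directly, as in
% the table above (sketch; assumes time-ordered data and a fitted lm object called fit):
%   r <- residuals(fit)
%   n <- length(r)
%   rlag1 <- c(NA, r[-n])                  # residual lag one
%   cor(r, rlag1, use = "complete.obs")    # first-order autocorrelation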
\pause
\item The colder it was yesterday, the colder it will probably be today.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The Durbin-Watson Statistic}
\framesubtitle{Assuming the data are in time order}
{\LARGE
\begin{displaymath}
d = \frac{\sum_{i=2}^n(\hat{\epsilon}_i - \hat{\epsilon}_{i-1})^2}
         {\sum_{i=1}^n\hat{\epsilon}_i^2}
\end{displaymath} \pause
} % End size
\vspace{5mm}

If successive $\widehat{\epsilon}_i$ are too close together, $d$ will be small.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{More about Durbin-Watson}
% \framesubtitle{}
\begin{displaymath}
d = \frac{\sum_{i=2}^n(\hat{\epsilon}_i - \hat{\epsilon}_{i-1})^2}
         {\sum_{i=1}^n\hat{\epsilon}_i^2}
\end{displaymath}
\vspace{2mm} \pause
\begin{itemize}
\item $ d \approx 2(1-\widehat{\rho})$, where $\widehat{\rho}$ is the sample autocorrelation of the residuals. \pause
\item $d=2$ means zero autocorrelation. \pause
\item $0 \leq d \leq 4$. \pause
\item Small values of $d$ mean positive autocorrelation. \pause
\item Rule of thumb is worry if $d<1$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Positive autocorrelation} \pause
%\framesubtitle{}
When the $\epsilon_i$ values are positively autocorrelated, \pause
\begin{itemize}
\item $\widehat{\boldsymbol{\beta}}$ is still unbiased and consistent. \pause
\item But \emph{MSE} underestimates $\sigma^2$. \pause
\item Confidence intervals and prediction intervals are too narrow. \pause
\item Tests are too likely to reject a true null hypothesis. \pause
\item The Durbin-Watson test is really useful.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Back to the Example}
\framesubtitle{}
\begin{center}
\includegraphics[width=3in]{tsplot1a}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Original Model}
%\framesubtitle{}
{\footnotesize % or scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> tmod1 = lm(Y~X1+X2)
> # Only need to install package once
> # install.packages("lmtest", dependencies=TRUE)
> # Wow, a lot of stuff.
> library(lmtest)
> # help(dwtest)
> dwtest(tmod1)
}
        Durbin-Watson test

data: tmod1
DW = 0.74561, p-value = 1.106e-10
alternative hypothesis: true autocorrelation is greater than 0
\end{alltt}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Add Time to the Model} \pause
%\framesubtitle{}
{\scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> tmod2 = lm(Y~X1+X2+Time); Residual2 = residuals(tmod2)
> summary(tmod2)
}
Call:
lm(formula = Y ~ X1 + X2 + Time)

Residuals:
     Min       1Q   Median       3Q      Max
-13.5978  -4.4346   0.0868   3.8548  14.2158

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  3.89219    3.74819   1.038    0.302
X1           1.04889    0.07201  14.567   <2e-16 ***
X2          -0.99279    0.06939 -14.308   <2e-16 ***
Time         0.21564    0.02065  10.443   <2e-16 ***
---
Signif.
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 5.916 on 96 degrees of freedom Multiple R-squared: 0.7921, Adjusted R-squared: 0.7856 F-statistic: 121.9 on 3 and 96 DF, p-value: < 2.2e-16 \end{alltt} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Plot Residuals Against Time} %\framesubtitle{} {\scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> plot(Time,Residual2,ylab = 'Residual from Model 2') > lines(Time,Residual2) > tstring = expression(paste('Plot of time by residual from model with + E(y|',bold(x),') = ', beta[0]+beta[1]*x[1]+beta[2]*x[2]+beta[3]*t )) > title(tstring) } \end{alltt} } % End size \vspace{-8mm} \begin{center} \includegraphics[width=2.5in]{tsplot2} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Durbin-Watson} %\framesubtitle{} {\footnotesize % or scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> dwtest(tmod2) } Durbin-Watson test data: tmod2 DW = 1.5432, p-value = 0.007792 alternative hypothesis: true autocorrelation is greater than 0 \end{alltt} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Try a Cubic in Time} %\framesubtitle{} {\footnotesize % or scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> # Okay, maybe a cubic > Time2 = Time^2; Time3 = Time^3 > tmod3 = lm(Y~X1+X2+Time+Time2+Time3) > Residual3 = residuals(tmod3) > anova(tmod2,tmod3) } Analysis of Variance Table Model 1: Y ~ X1 + X2 + Time Model 2: Y ~ X1 + X2 + Time + Time2 + Time3 Res.Df RSS Df Sum of Sq F Pr(>F) 1 96 3359.9 2 94 2928.4 2 431.54 6.926 0.001563 ** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 \end{alltt} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Summary} %\framesubtitle{} {\scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> summary(tmod3) } Call: lm(formula = Y ~ X1 + X2 + Time + Time2 + Time3) Residuals: Min 1Q Median 3Q Max -12.3439 -3.6473 0.1622 3.4106 12.5547 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.183e+01 4.225e+00 2.801 0.006192 ** X1 1.081e+00 6.857e-02 15.770 < 2e-16 *** X2 -1.040e+00 6.692e-02 -15.539 < 2e-16 *** Time -5.288e-01 2.018e-01 -2.621 0.010238 * Time2 1.702e-02 4.614e-03 3.688 0.000378 *** Time3 -1.064e-04 2.993e-05 -3.556 0.000591 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 5.582 on 94 degrees of freedom Multiple R-squared: 0.8188, Adjusted R-squared: 0.8091 F-statistic: 84.94 on 5 and 94 DF, p-value: < 2.2e-16 \end{alltt} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Plot Residuals} %\framesubtitle{} {\scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> plot(Time,Residual3,ylab = 'Residual from Model 3') > lines(Time,Residual3) > tstring = expression(paste('E(y|',bold(x), + ') = ', beta[0]+beta[1]*x[1]+beta[2]*x[2]+beta[3]*t+beta[4]*t^2+beta[5]*t^3)) > title(tstring) } \end{alltt} } % End size \vspace{-6mm} \begin{center} \includegraphics[width=2.5in]{tsplot3} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Durbin-Watson Test} %\framesubtitle{} {\footnotesize % or scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}dwtest(tmod3) } Durbin-Watson test data: tmod3 DW = 1.7636, p-value = 0.06464 alternative hypothesis: true autocorrelation is greater than 0 \end{alltt} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Watch Out} %\framesubtitle{} \begin{itemize} \item Including time in the model is primitive, but effective if all you have is a trend. \pause \item You really do have to be careful about using ordinary least squares regression on time series data. \pause \item With positive autocorrelation, each observation tends to be close to the last one, and they can drift. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{center} \includegraphics[width=3.4in]{1} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{center} \includegraphics[width=3in]{2} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{center} \includegraphics[width=3in]{3} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{center} \includegraphics[width=3in]{4} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{center} \includegraphics[width=3in]{5} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Random walk} \framesubtitle{Sometimes called Drunkard's walk} \begin{itemize} \item Take a step left or right at random. \pause \item Steps could be of variable length. \pause \item Location at time $t$ depends on location at time $t-1$. \end{itemize} \pause {\LARGE \begin{displaymath} X_t = X_{t-1} + \epsilon_t \end{displaymath} \pause } % End size $\epsilon_1, \epsilon_2, \ldots$ all independent and identically distributed. 
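% A tiny simulation of the phenomenon illustrated on the next slides: two independent
% random walks can look strongly related (illustrative only):
%   set.seed(101)
%   x <- cumsum(rnorm(1000))    # random walk: X_t = X_{t-1} + epsilon_t
%   y <- cumsum(rnorm(1000))    # an independent random walk
%   cor(x, y)                   # often far from zero, despite independence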
\end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Correlations: 50 pairs of independent random walks, $n=1000$ steps} \framesubtitle{Need around $|r| = 0.13$ for significance} \pause {\footnotesize % or scriptsize \begin{verbatim} -0.28175 -0.22242 -0.32170 -0.45053 0.07866 0.59167 -0.27414 -0.82570 -0.62175 0.43537 0.84147 0.04103 -0.17502 -0.89710 -0.19116 -0.53865 -0.50889 0.42855 -0.91074 0.90577 0.22818 0.84834 -0.52501 0.82583 -0.06838 -0.00234 0.16084 0.81393 -0.07063 -0.09908 -0.38405 -0.28510 0.24850 0.12445 0.33509 0.33586 0.41241 -0.33482 -0.32021 -0.73808 0.14045 -0.03618 -0.67757 0.81121 -0.39379 -0.58832 -0.26866 0.16687 0.38541 0.12433 \end{verbatim} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{If you do ordinary regression on time series data} \pause %\framesubtitle{} \begin{itemize} \item Plot the residuals against time. \pause \item Look at the Durbin-Watson test. \pause \item Try to include relevant time-varying predictor variables. \pause \item Learn about genuine time series methods (STA457). \pause \item If you study time series, don't stick your nose up at univariate time series methods. Apply them to the residuals! \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Outlier detection} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Outlier detection} \pause %\framesubtitle{} \begin{itemize} \item Big residuals may be outliers. What's ``big?" \pause \item Consider standardizing. \pause \item But note that variances of $\widehat{\epsilon}_i$ are not all the same. \pause \item Semi-Studentized: Estimate $Var(\widehat{\epsilon}_i)$ and divide by square root of that: $\frac{\widehat{\epsilon}_i}{\sqrt{MSE\,(1-h_{i,i})}}$ \pause \item In R, this is produced with \texttt{rstandard}. % rstandard \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Studentized deleted residuals} \framesubtitle{The idea} \pause \begin{itemize} \item An outlier will make \emph{MSE} big. \pause \item In that case, the standardized (Semi-Studentized) residual $\frac{\widehat{\epsilon}_i}{\sqrt{MSE\,(1-h_{i,i})}}$ will be too small -- less noticeable. \pause \item So calculate $\widehat{y}$ for each observation based on all the other observations, but not that one. \pause Leave one out. \pause \item Predict each observed $y$ based on all the others, and assess error of prediction (divided by standard error). \pause \item Big values suggest that the expected value of $y_i$ is not what it should be. \pause \item Maybe that observation is from a different domain -- investigate. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Apply prediction interval technology} \pause %\framesubtitle{} \begin{displaymath} t = \frac{y_{0}-\mathbf{x}_{0}^\prime \widehat{\boldsymbol{\beta}}} {\sqrt{MSE(1+\mathbf{x}_{0}^\prime (\mathbf{X}^\prime \mathbf{X})^{-1}\mathbf{x}_{0})}} \sim t(n-k-1) \end{displaymath} \pause \begin{itemize} \item Note that $y_i$ is now being called $y_{0}$. \pause \item If the ``prediction" is too far off there is trouble. \pause \item Use $t$ as a test statistic. \pause \item Need to change the notation. 
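% Checking the standardization formula against R's rstandard() (sketch; assumes a
% fitted lm object called fit):
%   r <- residuals(fit); h <- hatvalues(fit)
%   mse <- sum(r^2) / df.residual(fit)                      # MSE
%   head(cbind(r / sqrt(mse * (1 - h)), rstandard(fit)))    # the two columns agree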
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Studentized deleted residual}
%\framesubtitle{}
\begin{displaymath}
t_i = \frac{y_i-\mathbf{x}_i^\prime \widehat{\boldsymbol{\beta}}_{(i)}} {\sqrt{MSE_{(i)}(1+\mathbf{x}_i^\prime (\mathbf{X}_{(i)}^\prime \mathbf{X}_{(i)})^{-1}\mathbf{x}_i)}} \sim t(n-k-2)
\end{displaymath} \pause
\begin{itemize}
\item In R, this is produced with \texttt{rstudent}. \pause
\item There is a more efficient formula. \pause
\item Use $t_i$ as a test statistic of $H_0: E(y_i) = \mathbf{x}_i^\prime \boldsymbol{\beta}$. \pause
\item If $H_0$ is rejected, investigate. \pause
\item We are doing $n$ tests. \pause
\item If all null hypotheses are true (no outliers), there is still a good chance of rejecting at least one $H_0$. \pause
\item Type I errors are very time consuming and disturbing. \pause
\item How about a Bonferroni correction?
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Bonferroni Correction for Multiple Tests}
%\framesubtitle{}
\begin{itemize}
\item Do the tests as usual, obtaining $n$ test statistics. \pause
\item For each test, use the significance level $\alpha/n$ instead of $\alpha$. \pause
\item Use the critical value $t_{\frac{\alpha}{2n}}(n-k-2)$. \pause
\item Even for large $n$ it is not overly conservative. \pause
\item If you locate an outlier, \textbf{investigate}!
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Normality}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Normality} \pause
%\framesubtitle{}
\begin{itemize}
\item Instead of checking the residuals for normality\pause, I like to check the Studentized deleted residuals (\texttt{rstudent}). \pause
\item Their variances are all equal. \pause
\item And for a healthy sample size, $t$ is almost $z$. \pause
\item Start with \texttt{hist()}.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{QQ Plots} \pause
%\framesubtitle{}
\begin{itemize}
\item Plot ordered values of a variable against the expected values of the order statistics under normality. \pause
\item If the distribution is normal, the plot should be approximately a straight line.
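% The Bonferroni-corrected outlier check with studentized deleted residuals, as a
% rough R sketch (assumes a fitted lm object called fit with k explanatory variables):
%   ti <- rstudent(fit)
%   n <- length(ti); kp1 <- length(coef(fit))        # kp1 = k+1
%   cutoff <- qt(1 - 0.05/(2*n), df = n - kp1 - 1)   # two-sided, alpha = 0.05 over n tests
%   which(abs(ti) > cutoff)                          # observations to investigate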
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{\texttt{qqnorm}} \pause
%\framesubtitle{}
{\footnotesize % or scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> help(qqnorm)
> x1 = rnorm(400)
> qqnorm(x1); qqline(x1)
}
\end{alltt}
} % End size
\begin{center}
\includegraphics[width=2.5in]{qq1}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{The Shapiro-Wilk Test for Normality} \pause
%\framesubtitle{}
{\footnotesize % or scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> help(shapiro.test)
> shapiro.test(x1)
}
        Shapiro-Wilk normality test

data: x1
W = 0.99574, p-value = 0.3527
\end{alltt}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Non-normal data} \pause
%\framesubtitle{}
{\footnotesize % or scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> x2 = rexp(400)
> qqnorm(x2); qqline(x2)
}
\end{alltt}
} % End size
\begin{center}
\includegraphics[width=2.5in]{qq2}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Test for Normality}
\framesubtitle{\texttt{x2} is exponential} \pause
{\footnotesize % or scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> shapiro.test(x2)
}
        Shapiro-Wilk normality test

data: x2
W = 0.81528, p-value < 2.2e-16
\end{alltt}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Influential Observations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Influential Observations} \pause
%\framesubtitle{}
\begin{itemize}
\item Based on the idea of leverage, look for large hat values $h_{ii}$. \pause
\item If $h_{ii} > 0.2$ or $h_{ii} > \frac{2(k+1)}{n}$, investigate. \pause
\item Other methods are based on leave-one-out technology.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Leave One Out} \pause
%\framesubtitle{}
\begin{itemize}
\item DFBETA = $\widehat{\boldsymbol{\beta}} - \widehat{\boldsymbol{\beta}}_{(i)} \pause = \frac{(\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{x}_i \widehat{\epsilon}_i} {1-h_{ii}}$. \pause
\item DFBETAS: Use $t_i$ instead of $\widehat{\epsilon}_i$. \pause
\item DFFIT = $\widehat{y}_i - \widehat{y}_{(i)}\pause = \frac{h_{ii}\widehat{\epsilon}_i}{1-h_{ii}}$. \pause
\item DFFITS: Use $t_i$ instead of $\widehat{\epsilon}_i$. \pause
\item Cook's distance: $D_i = \frac{\sum_{j=1}^n(\widehat{y}_j - \widehat{y}_{j(i)})^2} {MSE(k+1)} \pause = \left( \frac{1}{k+1}\right) t_i^2 \left(\frac{h_{ii}}{1-h_{ii}}\right)$. \pause
\item They say worry about $D_i>1$. \pause
\item[]
\item If any of these measures is a lot bigger than the others, investigate.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto.
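% The leave-one-out influence measures from the Leave One Out slide are all built
% into R (sketch; assumes a fitted lm object called fit):
%   dfbetas(fit)                        # DFBETAS, one column per coefficient
%   dffits(fit)                         # DFFITS
%   cooks.distance(fit)                 # Cook's distance D_i
%   which(cooks.distance(fit) > 1)      # the "worry about D_i > 1" rule of thumb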
It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}.
Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/302f20}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/302f20}}
\end{frame}

\end{document}

\begin{eqnarray*}
y_i & = & \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i \\
&& \\
y_i & = & \widehat{\beta}_0 + \widehat{\beta}_1 x_{i,1} + \cdots + \widehat{\beta}_{p-1} x_{i,p-1} + e_i
\end{eqnarray*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Example}

\end{frame}

{\LARGE
\begin{displaymath}

\end{displaymath}
} % End Size
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Produces Outlier.pdf
rmvn <- function(nn,mu,sigma)
# Returns an nn by kk matrix, rows are independent MVN(mu,sigma)
    {
    kk <- length(mu)
    dsig <- dim(sigma)
    if(dsig[1] != dsig[2]) stop("Sigma must be square.")
    if(dsig[1] != kk) stop("Sizes of sigma and mu are inconsistent.")
    ev <- eigen(sigma,symmetric=T)
    sqrl <- diag(sqrt(ev$values))
    PP <- ev$vectors
    ZZ <- rnorm(nn*kk) ; dim(ZZ) <- c(kk,nn)
    rmvn <- t(PP%*%sqrl%*%ZZ+mu)
    rmvn
    }# End of function rmvn

Sigma = rbind(c(1,.85), c(.85,1)); Sigma
set.seed(9999)
X = rmvn(100,c(0,0),Sigma)
plot(X,xlab = expression(paste(x[1])) ,ylab = expression(paste(x[2])))
points(1.5,-1.5,pch=16)
title("Hidden Outlier")

# Produces Leverage.pdf
x = c(X[,1],8); y = c(X[,2],-2)
plot(x,y,xlab = expression(paste(x[1])) ,ylab = expression(paste(x[2])))
points(8,-2,pch=16)
mod = lm(y~x)
lines(x,fitted.values(mod))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Residual Plots with R

Plot Residuals Against Independent Variables Not in the Equation
E[Y|X1, X2] = beta0 + beta1 X1 + beta2 X2
Fit model with just X1
Say beta2 < 0

rm(list=ls())
rmvn <- function(nn,mu,sigma)
# Returns an nn by kk matrix, rows are independent MVN(mu,sigma)
    {
    kk <- length(mu)
    dsig <- dim(sigma)
    if(dsig[1] != dsig[2]) stop("Sigma must be square.")
    if(dsig[1] != kk) stop("Sizes of sigma and mu are inconsistent.")
    ev <- eigen(sigma,symmetric=T)
    sqrl <- diag(sqrt(ev$values))
    PP <- ev$vectors
    ZZ <- rnorm(nn*kk) ; dim(ZZ) <- c(kk,nn)
    rmvn <- t(PP%*%sqrl%*%ZZ+mu)
    rmvn
    }# End of function rmvn

set.seed(9999)
n = 50
b0 = 10; b1 = 1; b2 = -1; sig = 10
Phi = rbind(c(100,50), c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + epsilon
cor(cbind(X1,X2,Y))
mod1 = lm(Y~X1); residual = mod1$residuals #$
plot(X2,residual,xlab=expression(X[2]),
     ylab=expression(paste('Residual from model with just ',X[1])))
title(expression(paste('True model has both ', X[1], ' and ',X[2])))

,,,,,,,,,,

set.seed(999)
n = 50
b0 = 10; b1 = 1; b2 = 1/10; sig = 10
Phi = rbind(c(100,50), c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
# X2 = pnorm(X2,50,10) * 80 # Uniform on (0,80) but correlated with X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + 10*sqrt(900-(X2-50)^2) + epsilon
cor(cbind(X1,X2,Y))
mod3 = lm(Y~X1+X2); residual = mod3$residuals; Yhat = mod3$fitted.values
plot(Yhat,residual, ylab="Residual")
title(expression(paste('Suspect Curvilinear Relationship with one or more X variables' )))

,,,,,,,,,,

Plot Residuals Against Independent Variables in the Equation
Detect
Curvilinear Relationship

set.seed(999)
n = 50
b0 = 10; b1 = 1; b2 = 1/10; sig = 10
Phi = rbind(c(100,50), c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
# X2 = pnorm(X2,50,10) * 80 # Uniform on (0,80) but correlated with X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + 10*sqrt(900-(X2-50)^2) + epsilon
cor(cbind(X1,X2,Y))
mod3 = lm(Y~X1+X2); residual = mod3$residuals #$
plot(X2,residual,xlab=expression(X[2]),
     ylab=expression(paste('Residual from model with E(Y|',bold(X), ') = ', beta[0]+beta[1]*X[1]+beta[2]*X[2] )))
title(expression(paste('Detect Curvilinear Relationship with ',X[2] )))

,,,,,,,,

Plot Residuals Against Independent Variables in the Equation: Detect Variance Increasing with X1

set.seed(9999)
n = 50
b0 = 10; b1 = 1; b2 = 1
Phi = rbind(c(100,50), c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
sig = seq(from=1,to=20,length.out=n)
sig = sig[rank(X1)] # Sig an increasing function of X1
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + epsilon
mod2 = lm(Y~X1+X2); Residual = mod2$residuals #$
# Try against y-hat
yhat = fitted(mod2)
plot(yhat,Residual,xlab='Predicted y', main = 'Detect non-constant variance')
# Try against X1
plot(X1,Residual,xlab=expression(X[1]))
title(expression(paste('Detect Variance Increasing with ',X[1] )))
plot(yhat,abs(Residual),xlab='Predicted y',ylab = 'Absolute residual')
summary(lm(abs(Residual) ~ yhat))

,,,,,,,,

Plot residuals against time
a) Straight trend
# E[Y|X1, X2] = beta0 + beta1 X1 + beta2 X2 + g(time)

rm(list=ls())
rmvn <- function(nn,mu,sigma)
# Returns an nn by kk matrix, rows are independent MVN(mu,sigma)
    {
    kk <- length(mu)
    dsig <- dim(sigma)
    if(dsig[1] != dsig[2]) stop("Sigma must be square.")
    if(dsig[1] != kk) stop("Sizes of sigma and mu are inconsistent.")
    ev <- eigen(sigma,symmetric=T)
    sqrl <- diag(sqrt(ev$values))
    PP <- ev$vectors
    ZZ <- rnorm(nn*kk) ; dim(ZZ) <- c(kk,nn)
    rmvn <- t(PP%*%sqrl%*%ZZ+mu)
    rmvn
    }# End of function rmvn

set.seed(9999)
n = 100
b0 = 10; b1 = 1; b2 = -1; sig = 5
Phi = rbind(c(100,50), c(50,100))
IV = rmvn(n,c(50,50),Phi)
X1 = IV[,1]; X2 = IV[,2]
Time = 1:n
gTime = 15*pnorm((Time-n/2)/10) # Nice asymptotic
plot(Time,gTime,type='l')
epsilon = rnorm(n,0,sig)
Y = b0 + b1*X1 + b2*X2 + gTime + epsilon
#Y = b0 + b1*X1 + b2*X2 + epsilon
cor(cbind(X1,X2,Time,Y))
tmod1 = lm(Y~X1+X2); Residual = residuals(tmod1)
plot(Time,Residual) # Makes tsplot1a.pdf
tstring = expression(paste('Plot of time by residual from model with E(y|',bold(x), ') = ', beta[0]+beta[1]*x[1]+beta[2]*x[2] ))
title(tstring)
lines(Time,Residual) # Makes tsplot1b.pdf

# herehere
# Might as well try to do something about it.
# The easiest thing is to put time in the model, but try D-W.
# install.packages("lmtest", dependencies=TRUE) # Only need to do this once
# Wow, a lot of stuff.
library(lmtest) # help(dwtest) dwtest(tmod1) # DW = 0.48529, p-value = 9.15e-15 tmod2 = lm(Y~X1+X2+Time); Residual2 = residuals(tmod2) summary(tmod2) plot(Time,Residual2,ylab = 'Residual from Model 2') lines(Time,Residual2) tstring = expression(paste('Plot of time by residual from model with E(y|',bold(x), ') = ', beta[0]+beta[1]*x[1]+beta[2]*x[2]+beta[3]*t )) title(tstring) dwtest(tmod2) # DW = 1.3595, p-value = 0.0004077 # Okay, maybe a cubic Time2 = Time^2; Time3 = Time^3 tmod3 = lm(Y~X1+X2+Time+Time2+Time3) Residual3 = residuals(tmod3) anova(tmod2,tmod3) summary(tmod3) plot(Time,Residual3,ylab = 'Residual from Model 3') lines(Time,Residual3) tstring = expression(paste('E(y|',bold(x), ') = ', beta[0]+beta[1]*x[1]+beta[2]*x[2]+beta[3]*t+beta[4]*t^2+beta[5]*t^3)) title(tstring) dwtest(tmod3) # DW = 1.971, p-value = 0.3204 # Pretty clean. Point out that # This is primitive but effective if all you have is a trend. # More commonly, there are omitted x variables that are changing systematically over time. # Including them will help a lot, if you can. # The Durbin-Watson test will help. # If you can't, or it does not do the job completely, may have to go to genuine time series regression methods -- STA457 # If you study time series, don't stick your nose up at univariate time series methods. Apply them to the residuals! %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Big hat values} \pause %\framesubtitle{} \begin{itemize} \item $\widehat{\epsilon}_i$ is a good approximation of $\epsilon_i$, as long as $h_{ii}$ is not too big. \pause \item What is a ``big" hat value? \pause \item Because $\mathbf{H}$ is non-negative definite, $h_{ii} \geq 0$. \pause \item Because $\mathbf{I-H}$ is non-negative definite, $1-h_{ii} \geq 0 \pause \iff h_{ii} \leq 1$. \pause \item So $0 \leq h_{ii} \leq 1$. \end{itemize} \end{frame} \item Diagonal elements $h_{ii}$ of the hat matrix are sometimes called ``hat values." \pause $\boldsymbol{\epsilon} \sim N_n(\mathbf{0},\sigma^2\mathbf{I}_n)$, while $\widehat{\boldsymbol{\epsilon}} \sim N_n(\mathbf{0},\sigma^2(\mathbf{I}_n-\mathbf{H}):$ \pause \begin{frame} \frametitle{$\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$} \framesubtitle{$y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$} \pause \begin{itemize} \item What is $\boldsymbol{\epsilon}$? \pause \item Everything else about $\mathbf{y}$ that's not in $\mathbf{X}$. \pause \item We wish we could study $\boldsymbol{\epsilon}$, but it's invisible. \pause \item \pause \item \end{itemize} \end{frame} \begin{eqnarray*} \mathbf{a}^\prime \mathbf{Ma} & = & \mathbf{a}^\prime \mathbf{S}^\prime \mathbf{H} \mathbf{Sa} \\ \pause & = & \mathbf{a}^\prime \mathbf{S}^\prime \mathbf{HH} \mathbf{Sa} \\ \pause & = & (\mathbf{Sa})^\prime \mathbf{H} \mathbf{Sa} \\ \pause & = & \\ \pause \end{eqnarray*}