% Weighted Least Squares and Generalized Least Squares for STA302
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{Berlin} % Displays sections on top
%\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
% \usetheme{AnnArbor} % Yellow and blue, good for large number of sections
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
\usepackage{comment}
\usepackage{alltt} % For colouring in verbatim environment
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout
\title{Weighted Least Squares and Generalized Least Squares\footnote{See last slide for copyright information.}}
\subtitle{STA302 Fall 2020}
\date{} % To suppress date

\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{comment}
\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}
\end{comment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Weighted and Generalized Least Squares}
\framesubtitle{An antidote to unequal variance (of a certain kind)}
\pause
Example: Aggregated data. Teaching evaluations.
\pause
Have
\begin{itemize}
\item Mean ratings $\overline{y}_1, \ldots, \overline{y}_m$ \pause
\item Number of students $n_1, \ldots, n_m$ \pause
\item Lots of predictor variables. \pause
\end{itemize}
\vspace{5mm}
{\LARGE
\begin{displaymath}
Var(\overline{y}_i) = \frac{\sigma^2}{n_i}
\end{displaymath}
} % End Size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Residual Plot}
%\framesubtitle{}
\begin{center}
\includegraphics[width=3in]{ResidualPlot}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Model: $\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$}
%\framesubtitle{}
\begin{eqnarray*}
cov(\boldsymbol{\epsilon}) &=& \left( \begin{array}{cccc}
\frac{\sigma^2}{n_1} & 0 & \cdots & 0 \\
0 & \frac{\sigma^2}{n_2} & \cdots & 0 \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0 & \cdots & \frac{\sigma^2}{n_{m}}
\end{array} \right) \\ \pause
&& \\ % && \\
&=& \sigma^2\left( \begin{array}{cccc}
\frac{1}{n_1} & 0 & \cdots & 0 \\
0 & \frac{1}{n_2} & \cdots & 0 \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0 & \cdots & \frac{1}{n_m}
\end{array} \right)
\end{eqnarray*}
\pause
Unknown $\sigma^2$ times a \emph{known} matrix.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Generalize}
\pause
%\framesubtitle{}
\begin{itemize}
\item $\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$
\item $cov(\boldsymbol{\epsilon}) = \sigma^2\mathbf{V}$ \pause
\item $\mathbf{V}$ is a \emph{known} symmetric positive definite matrix.
\pause
\item A good estimate of $\mathbf{V}$ can be substituted and everything works out for large samples.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Generalized Least Squares}
\framesubtitle{Transform the data.}
%{\LARGE
\begin{eqnarray*}
\mathbf{y} & = & \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}, \pause \mbox{ with } cov(\boldsymbol{\epsilon}) = \sigma^2\mathbf{V} \\ \pause
\Longrightarrow \mathbf{V}^{-\frac{1}{2}}\mathbf{y} \pause & = & \mathbf{V}^{-\frac{1}{2}}\mathbf{X} \boldsymbol{\beta} + \mathbf{V}^{-\frac{1}{2}}\boldsymbol{\epsilon} \\ \pause
\mathbf{y}^* & = & \mathbf{X}^* \boldsymbol{\beta} + \boldsymbol{\epsilon}^*
\end{eqnarray*}
\pause
%} % End size
Same $\boldsymbol{\beta}$.
\pause
\begin{eqnarray*}
cov(\boldsymbol{\epsilon}^*) & = & cov(\mathbf{V}^{-\frac{1}{2}}\boldsymbol{\epsilon}) \\ \pause
& = & \mathbf{V}^{-\frac{1}{2}}cov(\boldsymbol{\epsilon}) \mathbf{V}^{-\frac{1}{2}\prime} \\ \pause
& = & \mathbf{V}^{-\frac{1}{2}}(\sigma^2 \mathbf{V}) \mathbf{V}^{-\frac{1}{2}} \\ \pause
& = & \sigma^2 \mathbf{I}
\end{eqnarray*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Least Squares Estimate for the * Model is B.L.U.E.}
\framesubtitle{$\mathbf{y}^* = \mathbf{X}^* \boldsymbol{\beta} + \boldsymbol{\epsilon}^*$}
\pause
\begin{itemize}
\item By the Gauss-Markov Theorem, $\widehat{\boldsymbol{\beta}}^*$ will beat any other linear unbiased estimator based on the $\mathbf{y}^*$. \pause
\item $\mathbf{y}^* = \mathbf{V}^{-1/2} \mathbf{y}$. \pause
\item So any linear combination of the $\mathbf{y}$ is a linear combination of the $\mathbf{y}^*$. \pause
\begin{eqnarray*}
\mathbf{c}^\prime \mathbf{y} \pause & = & \mathbf{c}^\prime \mathbf{V}^{1/2} \mathbf{V}^{-1/2} \mathbf{y} \\ \pause
& = & \mathbf{c}^\prime \mathbf{V}^{1/2} \mathbf{y}^* \\ \pause
& = & \mathbf{c}_2^\prime \mathbf{y}^*, \mbox{ where } \mathbf{c}_2 = \mathbf{V}^{1/2} \mathbf{c}
\end{eqnarray*} \pause
\item And $\widehat{\boldsymbol{\beta}}^*$ beats it. It's B.L.U.E. for the original problem.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Generalized Least Squares}
\framesubtitle{$\mathbf{y}^* = \mathbf{X}^* \boldsymbol{\beta} + \boldsymbol{\epsilon}^*$}
\vspace{-5mm}
{\Large
\begin{eqnarray*}
\widehat{\boldsymbol{\beta}}^* \pause & = & (\mathbf{X}^{*\prime} \mathbf{X}^*)^{-1} \mathbf{X}^{*\prime}\mathbf{y}^* \\ \pause
& = & \left((\mathbf{V}^{-\frac{1}{2}}\mathbf{X})^\prime (\mathbf{V}^{-\frac{1}{2}}\mathbf{X})\right)^{-1} (\mathbf{V}^{-\frac{1}{2}}\mathbf{X})^\prime \mathbf{V}^{-\frac{1}{2}}\mathbf{y} \\ \pause
& = & (\mathbf{X}^\prime \mathbf{V}^{-\frac{1}{2}\prime} \mathbf{V}^{-\frac{1}{2}}\mathbf{X})^{-1} \mathbf{X}^\prime \mathbf{V}^{-\frac{1}{2}\prime} \mathbf{V}^{-\frac{1}{2}}\mathbf{y} \\ \pause
& = & (\mathbf{X}^\prime \mathbf{V}^{-1} \mathbf{X})^{-1} \mathbf{X}^\prime \mathbf{V}^{-1} \mathbf{y}
\end{eqnarray*}
\pause
} % End size
\vspace{-5mm}
\begin{itemize}
\item So it is not necessary to literally transform the data. \pause
\item Convenient expressions for tests and confidence intervals are only a homework problem away. \pause
\item $\widehat{\boldsymbol{\beta}}^*$ is called the ``generalized least squares" estimate of $\boldsymbol{\beta}$. \pause
\item If $\mathbf{V}$ is diagonal, it's called ``weighted least squares."
\end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Variance Proportional to $x_i$} \framesubtitle{$y_i = \beta_0 + \beta_1 x_i + \epsilon_i$} \pause {\LARGE \begin{displaymath} cov(\boldsymbol{\epsilon}) = \sigma^2\left( \begin{array}{cccc} x_1 & 0 & \cdots & 0 \\ 0 & x_2 & \cdots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \cdots & x_n \end{array} \right) \end{displaymath} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{First-order Autoregressive Time Series} \framesubtitle{Estimate $\rho$ with the first-order sample autocorrelation} {\Large \begin{displaymath} cov(\boldsymbol{\epsilon}) = \sigma^2 \left( \begin{array}{c c c c c c} 1 & \rho & \rho^2 & \rho^3 & \cdots & \rho^{n-1}\\ \rho & 1 & \rho & \rho^2 & \cdots & \rho^{n-2} \\ \rho^2 & \rho & 1 & \rho & \cdots & \rho^{n-3} \\ \rho^3 & \rho^2 & \rho & 1 & \cdots & \rho^{n-4} \\ \vdots & \vdots & \vdots & \vdots & \ddots & \vdots \\ \rho^{n-1} & \rho^{n-2}& \rho^{n-3} & \rho^{n-4} & \cdots & 1 \end{array} \right) \end{displaymath} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{An amazing scalar example with no independent variables} \pause %\framesubtitle{} \begin{itemize} \item $y_{ij} \stackrel{i.i.d.}{\sim} ~ ?(\mu,\sigma^2)$. \pause \item Have $\overline{y}_1, \ldots, \overline{y}_m$ \pause based on $n_1, \ldots, n_m$. \pause \item $\overline{y}_j \stackrel{\cdot}{\sim} N(\mu,\frac{\sigma^2}{n_j})$ by the Central Limit Theorem.\pause \item Want to estimate $\mu$. \pause \item A natural estimator is the mean of means: $\widehat{\mu}_1 = \frac{1}{m} \sum_{j=1}^m \overline{y}_j$. \pause \item $E(\widehat{\mu}_1) = \mu$, so it's unbiased. \pause \item $Var(\widehat{\mu}_1) = \frac{\sigma^2}{m^2}\sum_{j=1}^m \frac{1}{n_j}$. \pause Can we do better? \pause \item Noting that $\widehat{\mu}_1 = \sum_{j=1}^m \frac{1}{m}\overline{y}_j$ is a linear combination of the $\overline{y}_j$ with the weights adding to one \ldots \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Try Weighted Least Squares} %\framesubtitle{} \begin{itemize} \item[] $ \overline{y}_j = \mu + \epsilon_j$ with $E(\epsilon_j) = 0$ and $Var(\epsilon_j) = \frac{\sigma^2}{n_j}$ \pause \item[] It's a regression with $\beta_0=\mu$ and no explanatory variables. \pause \end{itemize} \begin{displaymath} cov(\boldsymbol{\epsilon}) = \sigma^2\left( \begin{array}{cccc} \frac{1}{n_1} & 0 & \cdots & 0 \\ 0 & \frac{1}{n_2} & \cdots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \cdots & \frac{1}{n_m} \end{array} \right) = \sigma^2 \mathbf{V} \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Scalar Calculation} %\framesubtitle{} \begin{eqnarray*} \overline{y}_j & = & \mu + \epsilon_j \\ \pause \Longrightarrow \sqrt{n_j} \, \overline{y}_j & = & \sqrt{n_j} \, \mu + \sqrt{n_j} \, \epsilon_j \\ \pause % &&\\ y_j^* &=& x_j^*\beta_1^* + \epsilon_j^* \end{eqnarray*} \pause \begin{itemize} \item It's another regression model. \item This time there is no intercept, and $\mu$ is the slope. 
\pause \end{itemize} \begin{eqnarray*} Var(\epsilon_j^*) & = & Var(\sqrt{n_j} \, \epsilon_j) \\ \pause & = & n_j Var(\epsilon_j) \\ \pause & = & n_j \frac{\sigma^2}{n_j} \\ \pause & = & \sigma^2 \end{eqnarray*} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Least Squares for Simple Regression through the Origin} \framesubtitle{$y_j^* = x_j^*\beta_1^* + \epsilon_j^*$, with $\beta_1^*=\mu$, $y_j^* = \sqrt{n_j} \, \overline{y}_j$ and $x_j^* = \sqrt{n_j}$} \pause \begin{eqnarray*} \widehat{\beta_1}^* & = & \frac{\sum_{j=1}^m x_j^*y_j^*}{\sum_{j=1}^m x_j^{*2}} \\ \pause & = & \frac{\sum_{j=1}^m \sqrt{n_j} \,\sqrt{n_j} \, \overline{y}_j} {\sum_{j=1}^m \sqrt{n_j}^2} \\ \pause & = & \frac{\sum_{j=1}^m n_j \overline{y}_j} {\sum_{j=1}^m n_j} \\ \pause & = & \sum_{j=1}^m \left( \frac{n_j} {\sum_{\ell=1}^m n_\ell} \right) \overline{y}_j \end{eqnarray*} \pause \begin{itemize} \item A linear combination of the $\overline{y}_j$; the weights add to one. \pause \item B.L.U.E. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{And not only that, but \ldots} %\framesubtitle{} \begin{eqnarray*} \widehat{\beta_1}^* & = & \frac{\sum_{j=1}^m n_j \overline{y}_j} {\sum_{j=1}^m n_j} \\ \pause & = & \frac{\sum_{j=1}^m n_j \frac{\sum_{i=1}^{n_j}y_{ij}}{n_j}} {\sum_{j=1}^m n_j} \\ \pause & = & \frac{\sum_{j=1}^m \sum_{i=1}^{n_j} y_{ij}} {\sum_{j=1}^m n_j} \pause = \frac{\sum_{j=1}^m \sum_{i=1}^{n_j} y_{ij}}{n} \end{eqnarray*} \pause % \vspace{2mm} \begin{itemize} \item So the B.L.U.E. of $\mu$ is just the sample mean of all the data. \pause \item One more comment is that $\widehat{\boldsymbol{\beta}}^* = (\mathbf{X}^\prime \mathbf{V}^{-1} \mathbf{X})^{-1} \mathbf{X}^\prime \mathbf{V}^{-1} \mathbf{y}$ yields the same expression. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Copyright Information} This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. 
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/302f20}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/302f20}}
\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Generating residual plot showing unequal variance (the ResidualPlot figure)
rm(list=ls())
sigsqx = 4; mux = 0                 # Variance and mean of the predictor
sigsq = 10; b0 = 10; b1 = 1         # Error variance and regression coefficients
nsim = 200                          # Number of simulated class sections
set.seed(9999)
n = round(runif(nsim,5,200)); hist(n)        # Class sizes
x = rnorm(nsim,mux,sqrt(sigsqx)); hist(x)    # Predictor variable
epsilon = rnorm(nsim,0,sqrt(sigsq/n))        # Var(epsilon_i) = sigma^2/n_i
y = b0 + b1*x + epsilon; plot(x,y)
mod = lm(y~x); summary(mod)                  # Ordinary least squares fit
Residual = residuals(mod)
plot(n,Residual, xlab = 'Number of Students',
     main='Residuals by Number of Students in Class')
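%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A possible R illustration (not from the lecture) of the weighted least squares fit for
aggregated data like the teaching evaluations: lm()'s weights argument takes weights
proportional to 1/Var(y_i), so weights = n matches Var(ybar_i) = sigma^2/n_i. All variable
names, parameter values and the single predictor below are invented for the sketch.

# Illustrative sketch only: WLS for group means with Var(ybar_j) = sigma^2/n_j
set.seed(302)
m = 50                                       # Number of class sections (made up)
n = round(runif(m, 10, 300))                 # Class sizes
x = rnorm(m, 0, 2)                           # One made-up predictor
ybar = 10 + 1*x + rnorm(m, 0, sqrt(10/n))    # Mean ratings, Var = sigma^2/n_j
wls = lm(ybar ~ x, weights = n)              # Weighted least squares: V = diag(1/n_j)
ols = lm(ybar ~ x)                           # Ordinary least squares, for comparison
summary(wls); summary(ols)
# For the slide with Var(epsilon_i) = sigma^2 * x_i, weights = 1/x would play the same role.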
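%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Another sketch of my own, not part of the slides: a numerical check that the generalized
least squares formula (X'V^{-1}X)^{-1} X'V^{-1} y agrees with ordinary least squares applied
to the transformed data y* = V^{-1/2} y and X* = V^{-1/2} X. The design matrix, V and the
parameter values are arbitrary; V is taken diagonal so that V^{-1/2} is easy to write down.

# Illustrative sketch only: GLS by the matrix formula vs. transforming the data
set.seed(1)
n = 100
X = cbind(1, rnorm(n))                        # Design matrix with intercept
V = diag(runif(n, 0.5, 3))                    # Known diagonal V
beta = c(2, 1)
y = X %*% beta + rnorm(n, 0, sqrt(diag(V)))   # Errors with cov(epsilon) = sigma^2 V, sigma^2 = 1
Vinvhalf = diag(1/sqrt(diag(V)))              # V^(-1/2)
ystar = Vinvhalf %*% y; Xstar = Vinvhalf %*% X
b.star = solve(t(Xstar) %*% Xstar, t(Xstar) %*% ystar)           # LS on transformed data
b.gls  = solve(t(X) %*% solve(V) %*% X, t(X) %*% solve(V) %*% y) # GLS formula
cbind(b.star, b.gls)                          # The two columns agree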
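%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For the first-order autoregressive slide, here is one way the estimated-V idea could look in
R (a sketch under made-up simulation settings, not code from the course): estimate rho by the
first-order sample autocorrelation of the ordinary least squares residuals, build V with
entries rho^|i-j|, and substitute it into the GLS formula.

# Illustrative sketch only: feasible GLS with AR(1) errors
set.seed(2)
n = 200
x = rnorm(n)
epsilon = as.numeric(arima.sim(model = list(ar = 0.6), n = n))  # AR(1) errors, true rho = 0.6
y = 1 + 2*x + epsilon
X = cbind(1, x)
olsmod = lm(y ~ x)
r = residuals(olsmod)
rho.hat = acf(r, plot = FALSE)$acf[2]         # First-order sample autocorrelation
V = rho.hat^abs(outer(1:n, 1:n, "-"))         # V[i,j] = rho^|i-j|
b.gls = solve(t(X) %*% solve(V) %*% X, t(X) %*% solve(V) %*% y)
cbind(coef(olsmod), b.gls)                    # Compare OLS and GLS estimates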
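%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A quick numerical companion to the scalar example (my own check; the group sizes and the
normal distribution below are arbitrary): the weighted least squares estimate
sum_j (n_j / sum_l n_l) ybar_j is exactly the sample mean of all the data, while the
unweighted mean of means generally differs.

# Illustrative sketch only: mean of means vs. the B.L.U.E. of mu
set.seed(3)
n = c(10, 40, 25, 100, 60)                    # Group sizes n_1, ..., n_m
groups = lapply(n, function(nj) rnorm(nj, mean = 50, sd = 8))  # Raw data y_ij
ybar = sapply(groups, mean)                   # Group means
mu.hat1 = mean(ybar)                          # Mean of means
mu.hat2 = sum((n/sum(n)) * ybar)              # Weighted least squares estimate
grand.mean = mean(unlist(groups))             # Mean of all the data
c(mu.hat1, mu.hat2, grand.mean)               # The last two are identical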