% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{Berlin} % Displays sections on top
% \usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
% \usetheme{Berkeley}
% \usetheme{AnnArbor} % CambridgeUS: Displays one section at a time. Good if there are a lot of sections or if they have long titles.
\usepackage{tikz} % For the projection picture
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
\usepackage{comment}
\usepackage{euscript} % for \EuScript
\usepackage{graphpap}
% \usepackage[scr=rsfs,cal=boondox]{mathalfa} % For \mathscr
% \definecolor{links}{HTML}{2A1B81}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{More Properties of Least Squares Estimation\footnote{See last slide for copyright information.}}
\subtitle{STA 302 Fall 2020}
\date{} % To suppress date

\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Reading in Rencher and Schaalje's \emph{Linear Models in Statistics}}
%\framesubtitle{}
Much of this material is in Section 7.3.2 (pp.~145--149), except
\begin{itemize}
\item The Gauss-Markov Theorem is done better here.
\item They discuss projections \emph{briefly} in Chapter 9.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Model: $\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$}
%\framesubtitle{}
where
\begin{itemize}
\item[] $\mathbf{X}$ is an $n \times (k+1)$ matrix of observed constants with linearly independent columns.
\item[] $\boldsymbol{\beta}$ is a $(k+1) \times 1$ vector of unknown constants (parameters).
\item[] $\boldsymbol{\epsilon}$ is an $n \times 1$ random vector with $E(\boldsymbol{\epsilon}) = \mathbf{0}$ and $cov(\boldsymbol{\epsilon}) = \sigma^2\mathbf{I}_n$.
\item[] $\sigma^2$ is an unknown constant.
\end{itemize}
\pause
The least squares estimator of $\boldsymbol{\beta}$ is
{\LARGE
\begin{displaymath}
\widehat{\boldsymbol{\beta}} = (\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime \mathbf{y}
\end{displaymath}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Unbiased Estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Unbiased Estimation}
\framesubtitle{$\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$}
{\LARGE
\begin{eqnarray*}
E\{\widehat{\boldsymbol{\beta}}\} \pause
& = & E\{(\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime \mathbf{y}\} \\ \pause
& = & (\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime E\{\mathbf{y}\} \\ \pause
& = & (\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime ~ \mathbf{X}\boldsymbol{\beta} \\ \pause
& = & \boldsymbol{\beta} \pause
\end{eqnarray*}
} % End size
for any $\boldsymbol{\beta} \in \mathbb{R}^{k+1}$\pause, so $\widehat{\boldsymbol{\beta}}$ is an unbiased estimator of $\boldsymbol{\beta}$.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Covariance matrix}
\framesubtitle{Using $cov(\mathbf{Aw}) = \mathbf{A}cov(\mathbf{w}) \mathbf{A}^\prime$}
{\LARGE
\begin{eqnarray*}
cov\left(\widehat{\boldsymbol{\beta}}\right) \pause
& = & cov\left((\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime \mathbf{y}\right) \\ \pause
& = & (\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime cov(\mathbf{y}) \left( (\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime \right)^\prime \\ \pause
& = & (\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime ~\sigma^2\mathbf{I}_n~ \mathbf{X}^{\prime\prime} \left( (\mathbf{X}^\prime \mathbf{X})^{-1} \right)^\prime \\ \pause
& = & \sigma^2(\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime \mathbf{X} (\mathbf{X}^\prime \mathbf{X})^{-1} \\ \pause
& = & \sigma^2(\mathbf{X}^\prime \mathbf{X})^{-1}
\end{eqnarray*}
} % End size
\pause
The fourth line uses $\mathbf{X}^{\prime\prime} = \mathbf{X}$ and the fact that $(\mathbf{X}^\prime \mathbf{X})^{-1}$ is symmetric, because $\mathbf{X}^\prime \mathbf{X}$ is symmetric.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{What are we estimating when we estimate $\boldsymbol{\beta}$?}
\framesubtitle{Human resources example: $y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 +\beta_3 x_3 + \epsilon$}
\pause
\begin{itemize}
\item $x_1 = $ University GPA.
\item $x_2 = $ Job interview score.
\item $x_3 = $ Test score.
\item $y~ = $ Percent salary increase after one year.
\end{itemize}
\pause
\begin{itemize}
\item $E(y) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 +\beta_3 x_3$. \pause
\item $\beta_1$, $\beta_2$ and $\beta_3$ are \emph{links} between the predictor variables and the expected value of the response variable. \pause
\item $\beta_0$ is for curve fitting -- no interpretation in this example. \pause
\item Question: Holding interview and test scores constant, how much does GPA matter? \pause
\end{itemize}
$E(y) = {\color{red}\beta_0+ \beta_2 x_2 +\beta_3 x_3} + \beta_1 x_1 $.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Gauss-Markov Theorem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Estimating linear combinations of $\beta$ values}
\framesubtitle{$y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 +\beta_3 x_3 + \epsilon$}
{\LARGE
\begin{displaymath}
\ell_0\beta_0 + \ell_1\beta_1 + \cdots + \ell_k\beta_k
\end{displaymath} \pause
} % End size
$x_1 = $ University GPA, $x_2 = $ Interview score, $x_3 = $ Test score.
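% --- R sketch (kept as a LaTeX comment so the slides still compile): a quick
% --- Monte Carlo check of the two results above, E(beta-hat) = beta and
% --- cov(beta-hat) = sigma^2 (X'X)^{-1}. The design matrix, beta, and sigma
% --- below are made-up values for illustration only.
% set.seed(101)
% n = 50; sigma = 4
% X = cbind(1, rnorm(n,100,15), rnorm(n,50,10)) # n x (k+1) design, k = 2
% beta = c(10, 1, -2)
% XtXinv = solve(t(X) %*% X)
% M = 10000                                     # Monte Carlo replications
% betahat = matrix(0, M, 3)
% for(j in 1:M)
%   {
%   y = X %*% beta + rnorm(n, 0, sigma)         # y = X beta + epsilon
%   betahat[j,] = XtXinv %*% t(X) %*% y         # least squares estimate
%   }
% colMeans(betahat)   # Should be close to beta: unbiasedness
% var(betahat)        # Should be close to sigma^2 * XtXinv
% sigma^2 * XtXinv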
\\ \pause
For fixed job interview score and test score, what's the connection between GPA and salary increase? \pause
{\Large
\begin{displaymath}
\boldsymbol{\ell}^\prime \boldsymbol{\beta} = (0~~~1~~~0~~~0)
\left(\begin{array}{c} \beta_0 \\ \beta_1 \\ \beta_2 \\ \beta_3 \end{array} \right) = \beta_1
\end{displaymath}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Another linear combination}
%\framesubtitle{}
What's the expected salary increase for a job candidate with a university GPA of 2.5, an interview score of 80\% and a test score of 70\%? \pause
{\LARGE
\begin{displaymath}
\boldsymbol{\ell}^\prime \boldsymbol{\beta} = (1~~~2.5~~~80~~~70)
\left(\begin{array}{c} \beta_0 \\ \beta_1 \\ \beta_2 \\ \beta_3 \end{array} \right)
\end{displaymath} \pause
} % End size
The estimated expected value is often used for prediction.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} % 2017 notes p. 17
\frametitle{Natural Estimator} \pause
%\framesubtitle{}
\begin{itemize}
\item The natural estimator of $\boldsymbol{\ell}^\prime \boldsymbol{\beta}$ is $\boldsymbol{\ell}^\prime \widehat{\boldsymbol{\beta}}$. \pause
\item It's unbiased: $E\{\boldsymbol{\ell}^\prime \widehat{\boldsymbol{\beta}}\} \pause = \boldsymbol{\ell}^\prime E\{\widehat{\boldsymbol{\beta}}\} \pause = \boldsymbol{\ell}^\prime \boldsymbol{\beta}$ \pause
\item For an unbiased estimator, small variance is good: it is the variance of the \emph{sampling distribution}. \pause
\end{itemize}
\begin{center}
\includegraphics[width=2.7in]{Variance-of-Estimator}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Linear Combination}
\begin{itemize}
\item The natural estimator of $\boldsymbol{\ell}^\prime \boldsymbol{\beta}$ is a linear combination of the $y_i$ values.
{\Large
\begin{displaymath}
\boldsymbol{\ell}^\prime \widehat{\boldsymbol{\beta}} = {\color{blue}\boldsymbol{\ell}^\prime (\mathbf{X}^\prime \mathbf{X})^{-1} \mathbf{X}^\prime } \mathbf{y} \pause
= {\color{blue} \mathbf{a}_0^\prime } \mathbf{y}
\end{displaymath} \pause
} % End size
\item Let $L = a_1y_1 + a_2y_2 + \cdots + a_ny_n$\pause ~be another linear combination of the $y_i$ values, with $E(L) = \boldsymbol{\ell}^\prime \boldsymbol{\beta}$ for every $\boldsymbol{\beta} \in \mathbb{R}^{k+1}$. \pause
\item If we can find such an $L$, unbiased, with $Var(L) < Var(\boldsymbol{\ell}^\prime \widehat{\boldsymbol{\beta}})$, then $L$ is a better estimator. \pause
\item The Gauss-Markov Theorem says this is impossible: every linear unbiased estimator $L$ of $\boldsymbol{\ell}^\prime \boldsymbol{\beta}$ has $Var(L) \geq Var(\boldsymbol{\ell}^\prime \widehat{\boldsymbol{\beta}})$. \pause
\item That is, $\boldsymbol{\ell}^\prime \widehat{\boldsymbol{\beta}}$ is the Best Linear Unbiased Estimator (BLUE) of $\boldsymbol{\ell}^\prime \boldsymbol{\beta}$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Projections}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$\widehat{\mathbf{y}} = \mathbf{X}\widehat{\boldsymbol{\beta}}$ is in $\EuScript{V}$, the column space of $\mathbf{X}$}
\framesubtitle{$\EuScript{V} = \{\mathbf{Xb} : \mathbf{b} \in \mathbb{R}^{k+1}\}$, a subspace of $\mathbb{R}^n$}

\begin{tikzpicture}[>=stealth] % Arrow head type
%\draw[help lines] (0,0) grid (8,6);
% Draw each vector, and then the label. Golden ratio is around 1.6 = 8/5
\draw (-.5,0) -- (9,0) ;
\draw (9,0) node[right] {$\EuScript{V}$};
\draw[thick, ->] (0,0) -- (0,5);
\draw (0,5) node[above] {$\widehat{\boldsymbol{\epsilon}}$};
\draw[thick, ->] (0,0) -- (8,0);
\draw (8,0) node[below] {$\widehat{\mathbf{y}}$};
\draw[thick, ->] (0,0) -- (8,5);
\draw (8,5) node[above right=-3pt] {$\mathbf{y}$};
%\draw (8.25,5.1) node {$\mathbf{y}$};
\draw[dashed] (8,0) -- (8,5);
\end{tikzpicture}

\vspace{4mm} \pause
$\widehat{\mathbf{y}} + \widehat{\boldsymbol{\epsilon}} = \widehat{\mathbf{y}} + (\mathbf{y}-\widehat{\mathbf{y}}) = \mathbf{y}$
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Projection Operator}
\framesubtitle{$\mathbf{H} = \mathbf{X}(\mathbf{X}^\prime\mathbf{X})^{-1}\mathbf{X}^\prime$}
\begin{itemize}
\item $\widehat{\mathbf{y}}$ is the projection of $\mathbf{y}$ onto $\EuScript{V}$.
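% --- R sketch (as a LaTeX comment; the data and weights are made up)
% --- illustrating the Gauss-Markov Theorem: the least squares weights
% --- a0 = X(X'X)^{-1} ell give a variance no larger than the weights of any
% --- other linear unbiased estimator, here an arbitrary weighted least
% --- squares estimator of the same ell'beta.
% set.seed(202)
% n = 30; sigma = 1
% X = cbind(1, rnorm(n,100,15))                           # simple regression: k = 1
% ell = c(0,1)                                            # target is beta1
% a0 = t( t(ell) %*% solve(t(X)%*%X) %*% t(X) )           # ell' betahat = a0'y
% w = runif(n, 0.5, 2); W = diag(w)                       # arbitrary positive weights
% a1 = t( t(ell) %*% solve(t(X)%*%W%*%X) %*% t(X) %*% W ) # L = a1'y, also unbiased
% sigma^2 * sum(a0^2)                                     # Var(ell' betahat)
% sigma^2 * sum(a1^2)                                     # Var(L): never smaller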
\pause
\item $\mathbf{H}$ is the projection operator: $\mathbf{Hy} = \widehat{\mathbf{y}}$. \pause
\item $\mathbf{H}$ sends any point in $\mathbb{R}^n$ to $\EuScript{V}$. \pause
\begin{itemize}
\item[] $\mathbf{Hp} \pause = \mathbf{X} {\color{blue}(\mathbf{X}^\prime\mathbf{X})^{-1}\mathbf{X}^\prime \mathbf{p} } \pause = \mathbf{X} {\color{blue}\mathbf{b}}$. \pause
\end{itemize}
\item The projection $\mathbf{Hp}$ is the point in $\EuScript{V}$ closest to $\mathbf{p}$. \pause
\item If $\mathbf{p} \in \EuScript{V}$ already, then $\mathbf{Hp}= \mathbf{p}$. \pause
\begin{itemize}
\item[] $\mathbf{H{\color{blue}p}} = \mathbf{X} (\mathbf{X}^\prime\mathbf{X})^{-1}\mathbf{X}^\prime {\color{blue}\mathbf{Xb}} \pause = \mathbf{Xb} = \mathbf{p}$.
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Picture suggests $\widehat{\boldsymbol{\epsilon}} \perp \widehat{\mathbf{y}}$}
%\framesubtitle{}
\begin{columns}
\column{0.5\textwidth}
\begin{tikzpicture}[>=stealth, scale=1/2] % Arrow head type, scale
%\draw[help lines] (0,0) grid (8,6);
% Draw each vector, and then the label. Golden ratio is around 1.6 = 8/5
\draw (-.5,0) -- (9,0) ;
\draw (9,0) node[right] {$\EuScript{V}$};
\draw[thick, ->] (0,0) -- (0,5);
\draw (0,5) node[above] {$\widehat{\boldsymbol{\epsilon}}$};
\draw[thick, ->] (0,0) -- (8,0);
\draw (8,0) node[below] {$\widehat{\mathbf{y}}$};
\draw[thick, ->] (0,0) -- (8,5);
\draw (8,5) node[above right=-3pt] {$\mathbf{y}$};
%\draw (8.25,5.1) node {$\mathbf{y}$};
\draw[dashed] (8,0) -- (8,5);
\end{tikzpicture}
\pause
\column{0.5\textwidth}
\begin{itemize}
\item In fact, $\widehat{\boldsymbol{\epsilon}} \perp \mathbf{v}$ for all $\mathbf{v} \in \EuScript{V}$. \pause
\begin{eqnarray*}
\mathbf{v}^{\prime\,} \widehat{\boldsymbol{\epsilon}}
& = & (\mathbf{Xb})^{\prime\,} \widehat{\boldsymbol{\epsilon}} \\ \pause
& = & \mathbf{b}^\prime\mathbf{X}^{\prime\,} \widehat{\boldsymbol{\epsilon}} \\ \pause
& = & \mathbf{b}^\prime \mathbf{0} \pause = 0 \pause
\end{eqnarray*}
\item Vectors $\mathbf{v} \in \EuScript{V}$ include
\begin{itemize}
\item $\widehat{\mathbf{y}} = \mathbf{X}\widehat{\boldsymbol{\beta}}$.
\item $E(\mathbf{y}) = \mathbf{X}\boldsymbol{\beta}$.
\item Every column of $\mathbf{X}$.
\end{itemize}
\end{itemize}
\end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Another way to arrive at the normal equations} \pause
%\framesubtitle{}
{\small
\begin{columns}
\column{0.5\textwidth}
\begin{tikzpicture}[>=stealth, scale=1/2] % Arrow head type, scale
%\draw[help lines] (0,0) grid (8,6);
% Draw each vector, and then the label. Golden ratio is around 1.6 = 8/5
\draw (-.5,0) -- (9,0) ;
\draw (9,0) node[right] {$\EuScript{V}$};
\draw[thick, ->] (0,0) -- (0,5);
\draw (0,5) node[above] {$\widehat{\boldsymbol{\epsilon}}$};
\draw[thick, ->] (0,0) -- (8,0);
\draw (8,0) node[below] {$\widehat{\mathbf{y}}$};
\draw[thick, ->] (0,0) -- (8,5);
\draw (8,5) node[above right=-3pt] {$\mathbf{y}$};
%\draw (8.25,5.1) node {$\mathbf{y}$};
\draw[dashed] (8,0) -- (8,5);
\end{tikzpicture}
\pause

\vspace{3mm}
\begin{itemize}
\item The least squares task is to minimize $Q = (\mathbf{y}-\mathbf{X}\boldsymbol{\beta})^\prime (\mathbf{y}-\mathbf{X}\boldsymbol{\beta})$. \pause
\item Find the $\mathbf{X}\boldsymbol{\beta}$ point in $\EuScript{V}$ that is closest to $\mathbf{y}$. Call it $\mathbf{X}\widehat{\boldsymbol{\beta}}$. \pause
\item Drop a perpendicular (normal) from $\mathbf{y}$ to $\EuScript{V}$.
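% --- R sketch (as a LaTeX comment, with made-up data): a numerical check of
% --- the hat matrix facts from the Projection Operator slide.
% set.seed(303)
% n = 10
% X = cbind(1, rnorm(n)); y = rnorm(n)
% H = X %*% solve(t(X) %*% X) %*% t(X)   # H = X(X'X)^{-1}X'
% yhat = H %*% y; ehat = y - yhat
% max(abs(H %*% H - H))                  # HH = H: projecting twice changes nothing
% max(abs(H %*% X - X))                  # Points already in V stay put: HX = X
% max(abs(t(X) %*% ehat))                # Residuals orthogonal to columns of X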
\pause
\end{itemize}
\column{0.5\textwidth}
\begin{itemize}
\item This perpendicular is parallel to $\mathbf{y}-\mathbf{X}\widehat{\boldsymbol{\beta}} = \widehat{\boldsymbol{\epsilon}}$. \pause
\item So $\mathbf{y}-\mathbf{X}\widehat{\boldsymbol{\beta}}$ is at right angles to the basis vectors of $\EuScript{V}$ (the columns of $\mathbf{X}$). \pause The inner products are all zero. \pause
\item That is, $\mathbf{X}^\prime (\mathbf{y}-\mathbf{X}\widehat{\boldsymbol{\beta}}) = \mathbf{0}$. \pause $\Rightarrow \mathbf{X}^\prime\mathbf{X}\widehat{\boldsymbol{\beta}} = \mathbf{X}^\prime \mathbf{y}$.
\item These are the ``normal equations.'' \pause
\item Wikipedia says ``In geometry, a normal is an object such as a line, ray, or vector that is perpendicular to a given object.''
\end{itemize}
\end{columns}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/302f20}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/302f20}}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}

% Possible homework
% Maximum likelihood HW: sigma^2 as part b.
% Expected value, covariance matrix of y-hat, epsilon-hat
% What is the closest point in V to epsilon-hat?
% Specializing ...
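# R sketch (data simulated from made-up parameter values): solve the normal
# equations X'X betahat = X'y directly and compare with lm().
set.seed(404)
n = 25
x1 = rnorm(n, 3, 0.4); x2 = rnorm(n, 75, 10)   # say, GPA and interview score
y = 5 + 2*x1 + 0.1*x2 + rnorm(n)
X = cbind(1, x1, x2)
betahat = solve(t(X) %*% X, t(X) %*% y)        # the normal equations
cbind(betahat, coefficients(lm(y ~ x1 + x2)))  # the two columns should agree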
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{comment}
# Picture of sampling distribution of ell-prime beta
rm(list=ls())
x = seq(from=-3,to=3,length=500)
y1 = dnorm(x,0,1); y2 = dnorm(x,0,1/2)
plot(x,y2,type='l', ann=F, axes=F, ylim = c(-.1,.8)); lines(x,y1,lty=2)
# axis(side=1, labels=F, at=c(-10,10)) # No labels, tick marks off the map
# Draw an axis line below
xx = c(-4,4); yy = c(0,0); lines(xx,yy)
text(0,-.05,'\U2113\U2032\U03B2') # ell prime beta in unicode

# Picture of probability zero
rm(list=ls())
x = seq(from=-3.5,to=3.5,length=500); y = dnorm(x); y2 = x*0
plot(x,y,type='l', ann=F, axes=F); lines(x,y2)
# Then I stretched it by hand
xx = c(1,1); yy = c(0,dnorm(1)); lines(xx,yy)
\end{comment}

# Generic scatterplot (not used yet this time)
rm(list=ls())
n = 100; beta0 = 0; beta1 = 1; sigma = 10
set.seed(9999)
x = rnorm(n,100,15); x = round(x)
y = beta0 + beta1*x + rnorm(n,0,sigma); y = round(y)
mod = lm(y ~ x); b = coefficients(mod)
xx = c(min(x),max(x)); yy = b[1] + b[2]*xx
tstring = expression(paste(hat(y),' = ',hat(beta)[0],' + ',hat(beta)[1],'x'))
plot(x,y,main=tstring); lines(xx,yy)
# Draw the residuals
for(i in 1:n)
  {
  xx = c(x[i],x[i]); yhat = b[1] + b[2]*x[i]; yy = c(y[i],yhat)
  lines(xx,yy,lty=2)
  }