% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
\usepackage{comment}
\usepackage{alltt}
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Random Vectors\footnote{See last slide for copyright information.}}
\subtitle{STA2053 Fall 2022}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Definitions and Basic Results}

\begin{frame}
\frametitle{Random Vectors and Matrices}
%\framesubtitle{}
A \emph{random matrix} is just a matrix of random variables. The joint probability distribution of its elements is called the distribution of the random matrix. Random matrices with just one column (say, $p \times 1$) may be called \emph{random vectors}.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Expected Value}
%\framesubtitle{}
The expected value of a matrix is defined as the matrix of expected values. \pause
Denoting the $p \times c$ random matrix $\mathbf{X}$ by $[X_{i,j}]$, \pause
\begin{displaymath}
E(\mathbf{X}) = [E(X_{i,j})].
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Immediately we have natural properties like}
%\framesubtitle{}
\begin{eqnarray}
E(\mathbf{X}+\mathbf{Y}) \pause
&=& E([X_{i,j}]+[Y_{i,j}]) \nonumber \\ \pause
&=& [E(X_{i,j}+Y_{i,j})] \nonumber \\ \pause
&=& [E(X_{i,j})+E(Y_{i,j})] \nonumber \\ \pause
&=& [E(X_{i,j})]+[E(Y_{i,j})] \nonumber \\ \pause
&=& E(\mathbf{X})+E(\mathbf{Y}). \nonumber
\end{eqnarray}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Moving a constant through the expected value sign}
\pause
Let $\mathbf{A} = [a_{i,j}]$ be an $r \times p$ matrix of constants, while $\mathbf{X}$ is still a $p \times c$ random matrix. \pause Then
\begin{eqnarray}
E(\mathbf{AX}) \pause
&=& E\left(\left[\sum_{k=1}^p a_{i,k}X_{k,j}\right]\right) \nonumber \\ \pause
&=& \left[E\left(\sum_{k=1}^p a_{i,k}X_{k,j}\right)\right] \nonumber \\ \pause
&=& \left[\sum_{k=1}^p a_{i,k}E(X_{k,j})\right] \nonumber \\ \pause
&=& \mathbf{A}E(\mathbf{X}). \nonumber \pause
\end{eqnarray}
Similar calculations yield $E(\mathbf{AXB}) = \mathbf{A}E(\mathbf{X})\mathbf{B}$.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Variance-Covariance Matrices}
\pause
Let $\mathbf{x}$ be a $p \times 1$ random vector with $E(\mathbf{x}) = \boldsymbol{\mu}$.
\pause The \emph{variance-covariance matrix} of $\mathbf{x}$ (sometimes just called the \emph{covariance matrix}), denoted by $cov(\mathbf{x})$, is defined as \pause \begin{displaymath} cov(\mathbf{x}) = E\left\{ (\mathbf{x}-\boldsymbol{\mu}) (\mathbf{x}-\boldsymbol{\mu})^\top\right\}. \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{$cov(\mathbf{x}) = E\left\{ (\mathbf{x}-\boldsymbol{\mu}) (\mathbf{x}-\boldsymbol{\mu})^\top\right\}$} {\scriptsize \begin{eqnarray} cov(\mathbf{x}) &=& E\left\{ \left( \begin{array}{c} X_1-\mu_1 \\ X_2-\mu_2 \\ X_3-\mu_3 \end{array} \right) \left( \begin{array}{c c c} X_1-\mu_1 & X_2-\mu_2 & X_3-\mu_3 \end{array} \right) \right\} \nonumber \\ \pause &=& E\left\{ \left( \begin{array}{l l l} (X_1-\mu_1)^2 & (X_1-\mu_1)(X_2-\mu_2) & (X_1-\mu_1)(X_3-\mu_3) \\ (X_2-\mu_2)(X_1-\mu_1) & (X_2-\mu_2)^2 & (X_2-\mu_2)(X_3-\mu_3) \\ (X_3-\mu_3)(X_1-\mu_1) & (X_3-\mu_3)(X_2-\mu_2) & (X_3-\mu_3)^2 \\ \end{array} \right) \right\} \nonumber \\ \nonumber \\ \pause &=& \left( \begin{array}{l l l} E\{(X_1-\mu_1)^2\} & E\{(X_1-\mu_1)(X_2-\mu_2)\} & E\{(X_1-\mu_1)(X_3-\mu_3)\} \\ E\{(X_2-\mu_2)(X_1-\mu_1)\} & E\{(X_2-\mu_2)^2\} & E\{(X_2-\mu_2)(X_3-\mu_3)\} \\ E\{(X_3-\mu_3)(X_1-\mu_1)\} & E\{(X_3-\mu_3)(X_2-\mu_2)\} & E\{(X_3-\mu_3)^2\} \\ \end{array} \right) \nonumber \\ \nonumber \\ \pause &=& \left( \begin{array}{l l l} Var(X_1) & Cov(X_1,X_2) & Cov(X_1,X_3) \\ Cov(X_1,X_2) & Var(X_2) & Cov(X_2,X_3) \\ Cov(X_1,X_3) & Cov(X_2,X_3) & Var(X_3) \\ \end{array} \right) . \nonumber \\ \nonumber \end{eqnarray} \pause So, the covariance matrix $cov(\mathbf{x})$ is a $p \times p$ symmetric matrix with variances on the main diagonal and covariances on the off-diagonals. } %V End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Matrix of covariances between two random vectors} \pause Let $\mathbf{x}$ be a $p \times 1$ random vector with $E(\mathbf{x}) = \boldsymbol{\mu}_x$ and let $\mathbf{y}$ be a $q \times 1$ random vector with $E(\mathbf{y}) = \boldsymbol{\mu}_y$. \pause The $p \times q$ matrix of covariances between the elements of $\mathbf{x}$ and the elements of $\mathbf{y}$ \pause is \begin{displaymath} cov(\mathbf{x,y}) \pause = E\left\{ (\mathbf{x}-\boldsymbol{\mu}_x) (\mathbf{y}-\boldsymbol{\mu}_y)^\top\right\}. \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Adding a constant has no effect} \framesubtitle{On variances and covariances} \pause \begin{itemize} \item $ cov(\mathbf{x} + \mathbf{a}) = cov(\mathbf{x})$ \pause \item $cov(\mathbf{x} + \mathbf{a},\mathbf{y} + \mathbf{b}) = cov(\mathbf{x},\mathbf{y})$ \pause \end{itemize} \vspace{10mm} These results are clear from the definitions: \begin{itemize} \item $cov(\mathbf{x}) = E\left\{ (\mathbf{x}-\boldsymbol{\mu}) (\mathbf{x}-\boldsymbol{\mu})^\top\right\}$ \item $cov(\mathbf{x,y}) = E\left\{ (\mathbf{x}-\boldsymbol{\mu}_x) (\mathbf{y}-\boldsymbol{\mu}_y)^\top\right\}$ \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Analogous to $Var(a\,X) = a^2\,Var(X)$} \pause Let $\mathbf{x}$ be a $p \times 1$ random vector with $E(\mathbf{x}) = \boldsymbol{\mu}$ and $cov(\mathbf{x}) = \boldsymbol{\Sigma}$\pause, while $\mathbf{A} = [a_{i,j}]$ is an $r \times p$ matrix of constants. 
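\pause Note that $\mathbf{Ax}$ is an $r \times 1$ random vector, and by the earlier result, $E(\mathbf{Ax}) = \mathbf{A}\boldsymbol{\mu}$.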
\pause Then
\begin{eqnarray*} \label{vax}
cov(\mathbf{Ax}) \pause
&=& E\left\{ (\mathbf{Ax}-\mathbf{A}\boldsymbol{\mu}) (\mathbf{Ax}-\mathbf{A}\boldsymbol{\mu})^\top \right\} \\ \pause
&=& E\left\{ \mathbf{A}(\mathbf{x}-\boldsymbol{\mu}) \left(\mathbf{A}(\mathbf{x}-\boldsymbol{\mu})\right)^\top \right\} \\ \pause
&=& E\left\{ \mathbf{A}(\mathbf{x}-\boldsymbol{\mu}) (\mathbf{x}-\boldsymbol{\mu})^\top \mathbf{A}^\top \right\} \nonumber \\ \pause
&=& \mathbf{A}E\left\{(\mathbf{x}-\boldsymbol{\mu}) (\mathbf{x}-\boldsymbol{\mu})^\top\right\} \mathbf{A}^\top \\ \pause
&=& \mathbf{A}cov(\mathbf{x}) \mathbf{A}^\top \nonumber \\ \pause
&=& \mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top.
\end{eqnarray*}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Multivariate Normal}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The Multivariate Normal Distribution}
\pause
The $p \times 1$ random vector $\mathbf{x}$ is said to have a \emph{multivariate normal distribution}, and we write $\mathbf{x} \sim N_p(\boldsymbol{\mu},\boldsymbol{\Sigma})$, if $\mathbf{x}$ has (joint) density \pause
\begin{displaymath}
f(\mathbf{x}) = \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}}
\exp\left( -\frac{1}{2} (\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right),
\end{displaymath}
\pause
where $\boldsymbol{\mu}$ is $p \times 1$ and $\boldsymbol{\Sigma}$ is $p \times p$ symmetric and positive definite.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$\boldsymbol{\Sigma}$ positive definite}
\framesubtitle{In the multivariate normal definition}
\pause
\begin{itemize}
\item Positive definite means that for any non-zero $p \times 1$ vector $\mathbf{a}$, we have $\mathbf{a}^\top \boldsymbol{\Sigma} \mathbf{a} > 0$. \pause
\item Since the one-dimensional random variable $Y=\sum_{i=1}^p a_i X_i$ may be written as $Y=\mathbf{a}^\top \mathbf{x}$ \pause and $Var(Y)=cov(\mathbf{a}^\top \mathbf{x})=\mathbf{a}^\top \boldsymbol{\Sigma} \mathbf{a}$\pause, it is natural to require that $\boldsymbol{\Sigma}$ be positive definite. \pause
\item All it means is that every non-zero linear combination of the elements of $\mathbf{x}$ has a positive variance. \pause
\item And recall $\boldsymbol{\Sigma}$ positive definite is equivalent to $\boldsymbol{\Sigma}^{-1}$ positive definite.
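\pause
\item For example, with $p=2$, $\mathbf{a}^\top \boldsymbol{\Sigma} \mathbf{a} = a_1^2 Var(X_1) + 2a_1a_2 Cov(X_1,X_2) + a_2^2 Var(X_2) = Var(a_1X_1 + a_2X_2)$, and the condition says this variance is positive unless $a_1=a_2=0$.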
\end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Analogies} \framesubtitle{(Multivariate normal reduces to the univariate normal when $p=1$)} \pause \begin{itemize} \item Univariate Normal \begin{itemize} \item $f(x) = \frac{1}{\sigma \sqrt{2\pi}} \exp \left\{-\frac{1}{2}\frac{(x-\mu)^2}{\sigma^2}\right\}$ \item $E(X)=\mu, Var(X) = \sigma^2$ \item $\frac{(X-\mu)^2}{\sigma^2} \sim \chi^2 (1)$ \end{itemize} \pause \vspace{3mm} \item Multivariate Normal \begin{itemize} \item $f(\mathbf{x}) = \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left\{ -\frac{1}{2} (\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right\}$ \item $E(\mathbf{x})= \boldsymbol{\mu}$, $cov(\mathbf{x}) = \boldsymbol{\Sigma}$ \item $(\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu}) \sim \chi^2 (p)$ \end{itemize} \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{More properties of the multivariate normal} \pause % \begin{itemize} \item If $\mathbf{c}$ is a vector of constants, $\mathbf{x}+\mathbf{c} \sim N(\mathbf{c}+\boldsymbol{\mu},\boldsymbol{\Sigma})$ \pause \item If $\mathbf{A}$ is a matrix of constants, \pause $\mathbf{Ax} \sim N(\mathbf{A}\boldsymbol{\mu},\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top)$ \pause \item Linear combinations of multivariate normals are multivariate normal. \pause \item All the marginals (dimension less than $p$) of $\mathbf{x}$ are (multivariate) normal\pause, but it is possible in theory to have a collection of univariate normals whose joint distribution is not multivariate normal. \pause \item For the multivariate normal, zero covariance implies independence. \pause The multivariate normal is the only continuous distribution with this property. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{An easy example} \framesubtitle{If you do it the easy way} \pause Let $\mathbf{x}= (X_1,X_2,X_3)^\top$ be multivariate normal with \pause \begin{displaymath} \boldsymbol{\mu} = \left( \begin{array}{c} 1 \\ 0 \\ 6 \end{array} \right) \mbox{ and } \boldsymbol{\Sigma} = \left( \begin{array}{c c c} 2 & 1 & 0 \\ 1 & 4 & 0 \\ 0 & 0 & 2 \end{array} \right) . \end{displaymath} \pause Let $Y_1=X_1+X_2$ and $Y_2=X_2+X_3$. Find the joint distribution of $Y_1$ and $Y_2$. 
% Just for fun, check it with sage: %mu = vector(QQ,[1,0,6]).column() # QQ is the rational field %Sigma = matrix(QQ,[[2,1,0],[1,4,0],[0,0,2]]) %A = matrix(QQ,[[1,1,0],[0,1,1]]) %mu2 = A*mu; show(mu2) %Sigma2 = A*Sigma*A.transpose(); show(Sigma2) \end{frame} \begin{frame} \frametitle{In matrix terms} \pause $Y_1=X_1+X_2$ and $Y_2=X_2+X_3$ means $\mathbf{y} = \mathbf{Ax}$ \pause \vspace{10mm} \begin{displaymath} \left( \begin{array}{c} Y_1 \\ Y_2 \end{array} \right) = \left( \begin{array}{c c c} 1 & 1 & 0 \\ 0 & 1 & 1 \end{array} \right) \left( \begin{array}{c} X_1 \\ X_2 \\ X_3 \end{array} \right) \end{displaymath} \pause \vspace{10mm} $\mathbf{y} = \mathbf{Ax} \sim N(\mathbf{A}\boldsymbol{\mu},\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top)$ \end{frame} \begin{frame}[fragile] \frametitle{You could do it by hand, but} \pause %\framesubtitle{} \begin{verbatim} > mu = cbind(c(1,0,6)) > Sigma = rbind( c(2,1,0), + c(1,4,0), + c(0,0,2) ) > A = rbind( c(1,1,0), + c(0,1,1) ); A > A %*% mu # E(Y) [,1] [1,] 1 [2,] 6 > A %*% Sigma %*% t(A) # cov(Y) [,1] [,2] [1,] 8 5 [2,] 5 6 \end{verbatim} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Regression} %\framesubtitle{} \begin{itemize} \item[] $\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$, with $\boldsymbol{\epsilon} \sim N_n(\mathbf{0},\sigma^2\mathbf{I}_n)$. \pause \item[] So $\mathbf{y} \sim N_n(\mathbf{X}\boldsymbol{\beta},\sigma^2\mathbf{I}_n)$. \pause \item[] $\widehat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} \pause = \mathbf{Ay}$. \pause \item[] So $\widehat{\boldsymbol{\beta}}$ is multivariate normal. \pause \item[] Just calculate the mean and covariance matrix. \pause \end{itemize} % \vspace{3mm} \begin{eqnarray*} E(\widehat{\boldsymbol{\beta}}) &=& E\left((\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} \right) \\ \pause &=& (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top E(\mathbf{y}) \\ \pause &=& (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{X}\boldsymbol{\beta} \\ \pause &=& \boldsymbol{\beta} \end{eqnarray*} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Covariance matrix of $\widehat{\boldsymbol{\beta}}$} \framesubtitle{Using $cov(\mathbf{Aw}) = \mathbf{A}cov(\mathbf{w}) \mathbf{A}^\top$} \begin{eqnarray*} cov(\widehat{\boldsymbol{\beta}}) &=& cov\left((\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} \right) \\ \pause &=& (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top {\color{red} cov(\mathbf{y}) } \left( (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \right)^\top \\ \pause &=& (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top {\color{red} \sigma^2\mathbf{I}_n} \mathbf{X} (\mathbf{X}^\top \mathbf{X})^{-1\top} \\ \pause &=& \sigma^2 (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top\mathbf{X} (\mathbf{X}^\top \mathbf{X})^{-1} \\ \pause &=& \sigma^2 (\mathbf{X}^\top \mathbf{X})^{-1} \end{eqnarray*} \vspace{5mm}\pause {\Large So $\widehat{\boldsymbol{\beta}} \sim N_p\left(\boldsymbol{\beta}, \sigma^2 (\mathbf{X}^\top \mathbf{X})^{-1}\right)$. 
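\pause In particular, $Var(\widehat{\beta}_j)$ is $\sigma^2$ times the corresponding diagonal element of $(\mathbf{X}^\top \mathbf{X})^{-1}$.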
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Example: showing $(\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu}) \sim \chi^2 (p)$}
\framesubtitle{Where $\mathbf{x} \sim N(\boldsymbol{\mu},\boldsymbol{\Sigma})$}
\pause
\begin{eqnarray*}
\mathbf{y} = \mathbf{x}-\boldsymbol{\mu} & \sim & N\left(\mathbf{0},\ \boldsymbol{\Sigma}\right) \\ \pause
\mathbf{z} = \boldsymbol{\Sigma}^{-\frac{1}{2}} \mathbf{y} & \sim & N\left(\mathbf{0}, \boldsymbol{\Sigma}^{-\frac{1}{2}} \boldsymbol{\Sigma} \boldsymbol{\Sigma}^{-\frac{1}{2}} \right) \\ \pause
& = & N\left(\mathbf{0}, \boldsymbol{\Sigma}^{-\frac{1}{2}} \boldsymbol{\Sigma}^{\frac{1}{2}} ~ \boldsymbol{\Sigma}^{\frac{1}{2}} \boldsymbol{\Sigma}^{-\frac{1}{2}} \right) \\ \pause
& = & N\left(\mathbf{0}, \mathbf{I}\right) \pause
\end{eqnarray*}
So $\mathbf{z}$ is a vector of $p$ independent standard normals\pause, and
\begin{eqnarray*}
(\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu}) & = & \mathbf{y}^\top \boldsymbol{\Sigma}^{-1} \mathbf{y} \\ \pause
& = & \left( \boldsymbol{\Sigma}^{-\frac{1}{2}} \mathbf{y} \right)^\top \boldsymbol{\Sigma}^{-\frac{1}{2}} \mathbf{y} \\ \pause
& = & \mathbf{z}^\top \mathbf{z} \\ \pause
& = & \sum_{j=1}^p Z_j^2 \pause \sim \chi^2(p) ~~~ \blacksquare
\end{eqnarray*}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Last in section
\begin{frame}
\frametitle{Multivariate normal likelihood}
\framesubtitle{For reference}
\pause
{\footnotesize
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) &=& \prod_{i=1}^n \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left\{ -\frac{1}{2} (\mathbf{x}_i-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}_i-\boldsymbol{\mu})\right\} \\ \pause
&&\\
&=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\}\pause,
\end{eqnarray*}
} where $\boldsymbol{\widehat{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}}) (\mathbf{x}_i-\overline{\mathbf{x}})^\top $ is the sample variance-covariance matrix.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Delta Method}

\begin{frame}
\frametitle{The Multivariate Delta Method}
\framesubtitle{An application}
\pause
{\small
The univariate delta method says that if $\sqrt{n}\left( T_n- \theta \right) \stackrel{d}{\rightarrow} T$, \pause then $\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} g^\prime(\theta) \, T$. \pause
For example, the CLT yields $\sqrt{n}\left( \overline{X}_n- \mu \right) \stackrel{d}{\rightarrow} X\sim N(0,\sigma^2)$, \pause so $\sqrt{n}\left( g(\overline{X}_n)-g(\mu) \right) \stackrel{d}{\rightarrow} g^\prime(\mu)X \sim N(0,\, g^\prime(\mu)^2 \, \sigma^2)$. \pause \vspace{3mm}
In the multivariate delta method, $\mathbf{t}_n$ and $\mathbf{t}$ are $d$-dimensional random vectors.
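\pause For example, the multivariate Central Limit Theorem gives $\sqrt{n}\left( \overline{\mathbf{x}}_n- \boldsymbol{\mu} \right) \stackrel{d}{\rightarrow} \mathbf{t} \sim N(\mathbf{0},\boldsymbol{\Sigma})$, where $\boldsymbol{\Sigma} = cov(\mathbf{x}_i)$.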
\pause \vspace{3mm} The function $g: \mathbb{R}^d \rightarrow \mathbb{R}^k$ is a vector of functions: \pause \begin{displaymath} g(x_1, \ldots, x_d) = \left( \begin{array}{c} g_1(x_1, \ldots, x_d) \\ \vdots \\ g_k(x_1, \ldots, x_d) \end{array} \right) \end{displaymath} \pause \vspace{3mm} $g^\prime(\theta)$ is replaced by a matrix of partial derivatives (a Jacobian): \pause \begin{center} \.{g}$(x_1, \ldots, x_d) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$ \pause like $\left(\begin{array}{ccc} \frac{\partial g_1}{\partial x_1} & \frac{\partial g_1}{\partial x_2} & \frac{\partial g_1}{\partial x_3} \\ \frac{\partial g_2}{\partial x_1} & \frac{\partial g_2}{\partial x_2} & \frac{\partial g_2}{\partial x_3} \\ \end{array}\right)$. \end{center} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{The Delta Method} \framesubtitle{Univariate and multivariate} The univariate delta method says that if $\sqrt{n}\left( T_n- \theta \right) \stackrel{d}{\rightarrow} T$, then $\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} g^\prime(\theta) \, T$. \vspace{5mm} The multivariate delta method says that if $\sqrt{n}(\mathbf{t}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{t}$, then $\sqrt{n}(g(\mathbf{t}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mbox{\.{g}} (\boldsymbol{\theta}) \mathbf{t}$, \pause \vspace{5mm} where \.{g}$(x_1, \ldots, x_d) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$ \pause \vspace{5mm} In particular, if $ \mathbf{t} \sim N(\mathbf{0},\mathbf{\Sigma})$\pause, then \begin{center} $\sqrt{n}(g(\mathbf{t}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mathbf{y} \sim N(\mathbf{0}, \mbox{\.{g}}(\boldsymbol{\theta})\,\boldsymbol{\Sigma}\,\mbox{\.{g}}(\boldsymbol{\theta}) ^\top)$. \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Testing a non-linear hypothesis} %\framesubtitle{} Consider the regression model $y_i = \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2} + \epsilon_i$. \pause \begin{itemize} \item[] There is a standard $F$-test for $H_0: \mathbf{L}\boldsymbol{\beta} = \mathbf{h}$. \pause \item[] So testing whether $\beta_1=0$ \underline{and} $\beta_2=0$ is easy. \pause \item[] But what about testing whether $\beta_1=0$ \underline{or} $\beta_2=0$ (or both)? \pause \item[] If $H_0: \beta_1\beta_2 = 0$ is rejected, it means that \emph{both} regression coefficients are non-zero. \pause \item[] Can't test non-linear null hypotheses like this with standard tools. \pause \item[] But if the sample size is large we can use the delta method. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{The asymptotic distribution of $\widehat{\beta}_1\widehat{\beta}_2$} \pause %\framesubtitle{} {\small The multivariate delta method says that if $\sqrt{n}(\mathbf{t}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{t}$, then $\sqrt{n}(g(\mathbf{t}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mbox{\.{g}} (\boldsymbol{\theta}) \mathbf{t}$, \pause \vspace{3mm} Know $\widehat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} \sim N_p\left(\boldsymbol{\beta}, \sigma^2 (\mathbf{X}^\top \mathbf{X})^{-1}\right)$. 
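\pause \vspace{3mm} Multiplying $\widehat{\boldsymbol{\beta}}_n-\boldsymbol{\beta}$ by $\sqrt{n}$ gives $\sqrt{n}(\widehat{\boldsymbol{\beta}}_n-\boldsymbol{\beta}) \sim N\left(\mathbf{0}, \ \sigma^2 \left(\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1}\right)$ exactly, for every $n$.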
\pause \vspace{3mm}
So $\sqrt{n}(\widehat{\boldsymbol{\beta}}_n-\boldsymbol{\beta}) \stackrel{d}{\rightarrow} \mathbf{t} \sim N(\mathbf{0},\boldsymbol{\Sigma})$\pause, where $\boldsymbol{\Sigma} = \pause \lim_{n \rightarrow \infty} \sigma^2 \left(\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1}$. \pause \vspace{3mm}
Let $g(\boldsymbol{\beta}) = \beta_1\beta_2$. Then \pause
\begin{eqnarray*}
\sqrt{n}(g(\widehat{\boldsymbol{\beta}}_n)-g(\boldsymbol{\beta})) \pause
& = & \sqrt{n}( \widehat{\beta}_1\widehat{\beta}_2 - \beta_1\beta_2) \\ \pause
& \stackrel{d}{\rightarrow} & \mbox{\.{g}} (\boldsymbol{\beta}) \mathbf{t} \\ \pause
& = & T \pause \sim \pause N(0,\mbox{\.{g}}(\boldsymbol{\beta}) \boldsymbol{\Sigma} \mbox{\.{g}}(\boldsymbol{\beta})^\top)
\end{eqnarray*}
\pause
We will say $\widehat{\beta}_1\widehat{\beta}_2$ is asymptotically $N\left(\beta_1\beta_2,\frac{1}{n}\mbox{\.{g}}(\boldsymbol{\beta}) \, \boldsymbol{\Sigma} \, \mbox{\.{g}}(\boldsymbol{\beta})^\top\right)$. \pause \vspace{4mm}
Need \.{g}$(\boldsymbol{\beta})$.
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{\.{g}$(x_1, \ldots, x_d) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$}
\pause
%\framesubtitle{}
$g(\beta_0,\beta_1,\beta_2) = \beta_1\beta_2$ \pause so $d=3$ and $k=1$. \pause
\begin{eqnarray*}
\mbox{\.{g}}(\beta_0,\beta_1,\beta_2) & = & (\frac{\partial g}{\partial \beta_0}, \frac{\partial g}{\partial \beta_1}, \frac{\partial g}{\partial \beta_2}) \\ \pause
& = & (0, \pause \beta_2, \pause \beta_1 )
\end{eqnarray*}
\pause \vspace{5mm}
So $\widehat{\beta}_1\widehat{\beta}_2 \stackrel{\cdot}{\sim} \pause N\left(\beta_1\beta_2,~\frac{1}{n}(0, \beta_2, \beta_1 ) {\LARGE\boldsymbol{\Sigma}} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right) \right)$.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Need the standard error}
\pause
%\framesubtitle{}
We have $\widehat{\beta}_1\widehat{\beta}_2 \stackrel{\cdot}{\sim} N\left(\beta_1\beta_2,\frac{1}{n}(0, \beta_2, \beta_1 ) {\LARGE\boldsymbol{\Sigma}} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right) \right)$. \pause \vspace{5mm}
\begin{itemize}
\item[] Denote the asymptotic variance by $\frac{1}{n}(0, \beta_2, \beta_1 ) {\LARGE\boldsymbol{\Sigma}} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right) = v$. \pause
\item[] If we knew $v$ we could compute $Z = \frac{\widehat{\beta}_1\widehat{\beta}_2 - \beta_1\beta_2}{\sqrt{v}}$ \pause
\item[] And use it in tests and confidence intervals. \pause
\item[] Need to estimate $v$ consistently.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Standard error}
\framesubtitle{Estimated standard deviation of $\widehat{\beta}_1\widehat{\beta}_2$}
\pause
\begin{displaymath}
v = \frac{1}{n}(0, \beta_2, \beta_1 ) {\LARGE\boldsymbol{\Sigma}} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right)
\end{displaymath}
\pause
where $\boldsymbol{\Sigma} = \lim_{n \rightarrow \infty} \sigma^2 \left(\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1}$. \pause
\begin{itemize}
\item[] Estimate $\beta_1$ and $\beta_2$ with $\widehat{\beta}_1$ and $\widehat{\beta}_2$. \pause
\item[] Estimate $\sigma^2$ with $MSE = \mathbf{e}^\top\mathbf{e}/(n-p)$.
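\pause
\item[] Here $\mathbf{e} = \mathbf{y} - \mathbf{X}\widehat{\boldsymbol{\beta}}$ is the vector of residuals.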
\pause \item[] Approximate $\frac{1}{n}\boldsymbol{\Sigma}$ with \pause \end{itemize} \begin{eqnarray*} \frac{1}{n} MSE \left(\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1} \pause & = & MSE \left(n\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1} \\ \pause & = & MSE \left(\mathbf{X}^\top \mathbf{X}\right)^{-1} \end{eqnarray*} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{$\widehat{v}$ approximates $v$} %\framesubtitle{} {\LARGE \begin{eqnarray*} v &=& \frac{1}{n}(0, \beta_2, \beta_1 ) {\LARGE\boldsymbol{\Sigma}} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right) \\ &~&\\ \widehat{v} &=& MSE \, (0, \widehat{\beta_2}, \widehat{\beta_1} ) \left(\mathbf{X}^\top \mathbf{X}\right)^{-1} \left(\begin{array}{c} 0 \\ \widehat{\beta_2} \\ \widehat{\beta_1} \end{array} \right) \end{eqnarray*} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Test statistic for $H_0:\beta_1\beta_2=0$} \pause %\framesubtitle{} {\LARGE \begin{displaymath} Z = \frac{\widehat{\beta}_1\widehat{\beta}_2 - 0}{\sqrt{\widehat{v}}} \end{displaymath} \pause } % End size where \begin{displaymath} \widehat{v} = (0, \widehat{\beta_2}, \widehat{\beta_1} ) {\color{red} MSE \left(\mathbf{X}^\top \mathbf{X}\right)^{-1} } % End color \left(\begin{array}{c} 0 \\ \widehat{\beta_2} \\ \widehat{\beta_1} \end{array} \right) \end{displaymath} \pause \vspace{5mm} Note {\color{red}$MSE\left(\mathbf{X}^\top \mathbf{X}\right)^{-1}$} is produced by R's \texttt{vcov} function. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Simulated Data} %\framesubtitle{} {\footnotesize % or scriptsize \begin{verbatim} rm(list=ls()); options(scipen=999) source('https://www.utstat.toronto.edu/brunner/Rfunctions/rmvn.txt') set.seed(9999) n = 200; sigma=1; beta0=4; beta1=0.2; beta2 = 0.1; phi12 = 0.5 Phi = rbind(c(1,phi12), c(phi12,1)) # Simulate epsilon = rnorm(n) X = rmvn(n,c(1,2),Phi) x1 = X[,1]; x2 = X[,2] y = beta0 + beta1*x1 + beta2*x2 + epsilon \end{verbatim} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Fit the Model} %\framesubtitle{} {\footnotesize % or scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> mod = lm(y ~ x1 + x2); summary(mod) } Call: lm(formula = y ~ x1 + x2) Residuals: Min 1Q Median 3Q Max -2.4491 -0.5762 -0.1361 0.6414 2.8680 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 4.04777 0.15188 26.651 <0.0000000000000002 *** x1 0.20145 0.08527 2.362 0.0191 * x2 0.09102 0.08482 1.073 0.2846 --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.9879 on 197 degrees of freedom Multiple R-squared: 0.06584, Adjusted R-squared: 0.05636 F-statistic: 6.942 on 2 and 197 DF, p-value: 0.00122 \end{alltt} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{$Z = \frac{\widehat{\beta}_1\widehat{\beta}_2 - 0}{\sqrt{\widehat{v}}}$} \framesubtitle{ $\widehat{v} = (0, \widehat{\beta_2}, \widehat{\beta_1} ) {\color{red} MSE \left(\mathbf{X}^\top \mathbf{X}\right)^{-1} } % End color \left(\begin{array}{c} 0 \\ \widehat{\beta_2} \\ \widehat{\beta_1} \end{array} \right)$} {\footnotesize % or scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}betahat = coefficients(mod); betahat } (Intercept) x1 x2 4.04776866 0.20145026 0.09101697 {\color{blue}> gdot = rbind(c(0,betahat[3],betahat[2])); gdot } x2 x1 [1,] 0 0.09101697 0.2014503 {\color{blue}> Red = vcov(mod); Red } (Intercept) x1 x2 (Intercept) 0.023068331 0.001025739 -0.010024480 x1 0.001025739 0.007271354 -0.004035879 x2 -0.010024480 -0.004035879 0.007194646 {\color{blue}> vhat = as.numeric( gdot %*% Red %*% t(gdot) ) > z = betahat[2]*betahat[3]/sqrt(vhat); z } x1 1.283067 \end{alltt} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Copyright Information} This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. 
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/brunner/oldclass/2053f22}
{\small\texttt{http://www.utstat.toronto.edu/brunner/oldclass/2053f22}}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{comment}

rm(list=ls()); options(scipen=999)
source('https://www.utstat.toronto.edu/brunner/Rfunctions/rmvn.txt')
set.seed(9999)
n = 200; sigma=1; beta0=4; beta1=0.2; beta2 = 0.1; phi12 = 0.5
Phi = rbind(c(1,phi12), c(phi12,1))
# Simulate
epsilon = rnorm(n)
X = rmvn(n,c(1,2),Phi)
x1 = X[,1]; x2 = X[,2]
y = beta0 + beta1*x1 + beta2*x2 + epsilon
mod = lm(y ~ x1 + x2); summary(mod)
betahat = coefficients(mod); betahat
gdot = rbind(c(0,betahat[3],betahat[2])); gdot
Red = vcov(mod); Red
vhat = as.numeric( gdot %*% Red %*% t(gdot) )
z = betahat[2]*betahat[3]/sqrt(vhat); z

\end{comment}

\end{document}