% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Random Vectors\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2018}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Background Reading: Rencher and Schaalje's \emph{Linear Models in Statistics}}
\begin{itemize}
\item Chapter 3 on Random Vectors and Matrices
\item Chapter 4 on the Multivariate Normal Distribution
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Definitions and Basic Results}

\begin{frame}
\frametitle{Random Vectors and Matrices}
%\framesubtitle{}
A \emph{random matrix} is just a matrix of random variables.
The joint probability distribution of its elements is the distribution of the random matrix.
Random matrices with just one column (say, $p \times 1$) may be called \emph{random vectors}.
\end{frame}

\begin{frame}
\frametitle{Expected Value}
%\framesubtitle{}
The expected value of a matrix is defined as the matrix of expected values. \pause
Denoting the $p \times c$ random matrix $\mathbf{X}$ by $[X_{i,j}]$, \pause
\begin{displaymath}
E(\mathbf{X}) = [E(X_{i,j})].
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Immediately we have natural properties like}
\pause
%\framesubtitle{}
\begin{eqnarray}
E(\mathbf{X}+\mathbf{Y}) \pause &=& E([X_{i,j}]+[Y_{i,j}]) \nonumber \\ \pause
&=& [E(X_{i,j}+Y_{i,j})] \nonumber \\ \pause
&=& [E(X_{i,j})+E(Y_{i,j})] \nonumber \\ \pause
&=& [E(X_{i,j})]+[E(Y_{i,j})] \nonumber \\ \pause
&=& E(\mathbf{X})+E(\mathbf{Y}). \nonumber
\end{eqnarray}
\end{frame}

\begin{frame}
\frametitle{Moving a constant through the expected value sign}
\pause
Let $\mathbf{A} = [a_{i,j}]$ be an $r \times p$ matrix of constants, while $\mathbf{X}$ is still a $p \times c$ random matrix. \pause
Then
\begin{eqnarray}
E(\mathbf{AX}) \pause &=& E\left(\left[\sum_{k=1}^p a_{i,k}X_{k,j}\right]\right) \nonumber \\ \pause
&=& \left[E\left(\sum_{k=1}^p a_{i,k}X_{k,j}\right)\right] \nonumber \\ \pause
&=& \left[\sum_{k=1}^p a_{i,k}E(X_{k,j})\right] \nonumber \\ \pause
&=& \mathbf{A}E(\mathbf{X}). \nonumber \pause
\end{eqnarray}
Similar calculations yield $E(\mathbf{AXB}) = \mathbf{A}E(\mathbf{X})\mathbf{B}$.
\end{frame}

\begin{frame}
\frametitle{Variance-Covariance Matrices}
\pause
Let $\mathbf{X}$ be a $p \times 1$ random vector with $E(\mathbf{X}) = \boldsymbol{\mu}$. \pause
The \emph{variance-covariance matrix} of $\mathbf{X}$ \pause (sometimes just called the \emph{covariance matrix})\pause, denoted by $cov(\mathbf{X})$, is defined as \pause
\begin{displaymath}
cov(\mathbf{X}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\top\right\}.
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{$cov(\mathbf{X}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\top\right\}$}
\pause
{\scriptsize
\begin{eqnarray}
cov(\mathbf{X}) &=& E\left\{ \left( \begin{array}{c} X_1-\mu_1 \\ X_2-\mu_2 \\ X_3-\mu_3 \end{array} \right) \left( \begin{array}{c c c} X_1-\mu_1 & X_2-\mu_2 & X_3-\mu_3 \end{array} \right) \right\} \nonumber \\ \pause
&=& E\left\{ \left( \begin{array}{l l l}
(X_1-\mu_1)^2 & (X_1-\mu_1)(X_2-\mu_2) & (X_1-\mu_1)(X_3-\mu_3) \\
(X_2-\mu_2)(X_1-\mu_1) & (X_2-\mu_2)^2 & (X_2-\mu_2)(X_3-\mu_3) \\
(X_3-\mu_3)(X_1-\mu_1) & (X_3-\mu_3)(X_2-\mu_2) & (X_3-\mu_3)^2 \\
\end{array} \right) \right\} \nonumber \\ \nonumber \\ \pause
&=& \left( \begin{array}{l l l}
E\{(X_1-\mu_1)^2\} & E\{(X_1-\mu_1)(X_2-\mu_2)\} & E\{(X_1-\mu_1)(X_3-\mu_3)\} \\
E\{(X_2-\mu_2)(X_1-\mu_1)\} & E\{(X_2-\mu_2)^2\} & E\{(X_2-\mu_2)(X_3-\mu_3)\} \\
E\{(X_3-\mu_3)(X_1-\mu_1)\} & E\{(X_3-\mu_3)(X_2-\mu_2)\} & E\{(X_3-\mu_3)^2\} \\
\end{array} \right) \nonumber \\ \nonumber \\ \pause
&=& \left( \begin{array}{l l l}
Var(X_1) & Cov(X_1,X_2) & Cov(X_1,X_3) \\
Cov(X_1,X_2) & Var(X_2) & Cov(X_2,X_3) \\
Cov(X_1,X_3) & Cov(X_2,X_3) & Var(X_3) \\
\end{array} \right) . \nonumber \\ \nonumber
\end{eqnarray} \pause
So, the covariance matrix $cov(\mathbf{X})$ is a $p \times p$ symmetric matrix with variances on the main diagonal and covariances on the off-diagonals.
} % End size
\end{frame}

\begin{frame}
\frametitle{Matrix of covariances between two random vectors}
\pause
Let $\mathbf{X}$ be a $p \times 1$ random vector with $E(\mathbf{X}) = \boldsymbol{\mu}_x$ and let $\mathbf{Y}$ be a $q \times 1$ random vector with $E(\mathbf{Y}) = \boldsymbol{\mu}_y$. \pause
The $p \times q$ matrix of covariances between the elements of $\mathbf{X}$ and the elements of $\mathbf{Y}$ \pause is
\begin{displaymath}
cov(\mathbf{X,Y}) \pause = E\left\{ (\mathbf{X}-\boldsymbol{\mu}_x) (\mathbf{Y}-\boldsymbol{\mu}_y)^\top\right\}.
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Adding a constant has no effect}
\framesubtitle{On variances and covariances}
\pause
\begin{itemize}
\item $ cov(\mathbf{X} + \mathbf{a}) = cov(\mathbf{X})$ \pause
\item $cov(\mathbf{X} + \mathbf{a},\mathbf{Y} + \mathbf{b}) = cov(\mathbf{X},\mathbf{Y})$ \pause
\end{itemize}
\vspace{10mm}
These results are clear from the definitions: \pause
\begin{itemize}
\item $cov(\mathbf{X}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\top\right\}$ \pause
\item $cov(\mathbf{X,Y}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}_x) (\mathbf{Y}-\boldsymbol{\mu}_y)^\top\right\}$
\end{itemize} \pause
\vspace{10mm}
Sometimes it is useful to let $\mathbf{a} = -\boldsymbol{\mu}_x$ and $\mathbf{b} = -\boldsymbol{\mu}_y$.
\end{frame}

\begin{frame}
\frametitle{Analogous to $Var(a\,X) = a^2\,Var(X)$}
\pause
Let $\mathbf{X}$ be a $p \times 1$ random vector with $E(\mathbf{X}) = \boldsymbol{\mu}$ and $cov(\mathbf{X}) = \boldsymbol{\Sigma}$\pause, while $\mathbf{A} = [a_{i,j}]$ is an $r \times p$ matrix of constants.
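% A quick R check of the "adding a constant has no effect" fact above, by
% simulation. This is only an illustrative sketch: the seed, the numbers, and
% the use of MASS::mvrnorm are arbitrary choices, not part of the slides.
% set.seed(101)
% Sigma = rbind( c(2,1,0), c(1,4,0), c(0,0,2) )
% X = MASS::mvrnorm(n = 10000, mu = c(1,0,6), Sigma = Sigma)  # rows = observations
% a = c(10, -3, 5)
% Xplusa = sweep(X, 2, a, "+")         # add the constant vector a to every row
% max(abs( var(Xplusa) - var(X) ))     # zero up to rounding error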
\pause Then
\begin{eqnarray*} \label{vax}
cov(\mathbf{AX}) \pause &=& E\left\{ (\mathbf{AX}-\mathbf{A}\boldsymbol{\mu}) (\mathbf{AX}-\mathbf{A}\boldsymbol{\mu})^\top \right\} \\ \pause
&=& E\left\{ \mathbf{A}(\mathbf{X}-\boldsymbol{\mu}) \left(\mathbf{A}(\mathbf{X}-\boldsymbol{\mu})\right)^\top \right\} \\ \pause
&=& E\left\{ \mathbf{A}(\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\top \mathbf{A}^\top \right\} \nonumber \\ \pause
&=& \mathbf{A}E\{(\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\top\} \mathbf{A}^\top \\ \pause
&=& \mathbf{A}cov(\mathbf{X}) \mathbf{A}^\top \nonumber \\ \pause
&=& \mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top
\end{eqnarray*}
\end{frame}

\section{Multivariate Normal}

\begin{frame}
\frametitle{The Multivariate Normal Distribution}
\pause
The $p \times 1$ random vector $\mathbf{X}$ is said to have a \emph{multivariate normal distribution}, \pause and we write $\mathbf{X} \sim N_p(\boldsymbol{\mu},\boldsymbol{\Sigma})$, \pause if $\mathbf{X}$ has (joint) density \pause
\begin{displaymath}
f(\mathbf{x}) = \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left( -\frac{1}{2} (\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right),
\end{displaymath} \pause
where $\boldsymbol{\mu}$ is $p \times 1$ and $\boldsymbol{\Sigma}$ is $p \times p$ symmetric and positive definite.
\end{frame}

\begin{frame}
\frametitle{$\boldsymbol{\Sigma}$ positive definite}
\framesubtitle{In the multivariate normal definition}
\pause
\begin{itemize}
\item Positive definite means that for any non-zero $p \times 1$ vector $\mathbf{a}$, we have $\mathbf{a}^\top \boldsymbol{\Sigma} \mathbf{a} > 0$. \pause
\item Since the one-dimensional random variable $Y=\sum_{i=1}^p a_i X_i$ may be written as $Y=\mathbf{a}^\top \mathbf{X}$ \pause and $Var(Y)=cov(\mathbf{a}^\top \mathbf{X})=\mathbf{a}^\top \boldsymbol{\Sigma} \mathbf{a}$\pause, it is natural to require that $\boldsymbol{\Sigma}$ be positive definite. \pause
\item All it means is that every non-zero linear combination of $\mathbf{X}$ values has a positive variance. \pause
\item And recall $\boldsymbol{\Sigma}$ positive definite is equivalent to $\boldsymbol{\Sigma}^{-1}$ positive definite.
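% Positive definiteness of a symmetric matrix can be checked in R through its
% eigenvalues. A small sketch with an illustrative covariance matrix:
% Sigma = rbind( c(2,1,0), c(1,4,0), c(0,0,2) )
% eigen(Sigma)$values          # all strictly positive, so Sigma is positive definite
% eigen(solve(Sigma))$values   # the eigenvalues of the inverse are positive too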
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Analogies}
\framesubtitle{(Multivariate normal reduces to the univariate normal when $p=1$)}
\pause
\begin{itemize}
\item Univariate Normal
\begin{itemize}
\item $f(x) = \frac{1}{\sigma \sqrt{2\pi}} \exp \left\{-\frac{1}{2}\frac{(x-\mu)^2}{\sigma^2}\right\}$
\item $E(X)=\mu, Var(X) = \sigma^2$
\item $\frac{(X-\mu)^2}{\sigma^2} \sim \chi^2 (1)$
\end{itemize} \pause
\vspace{3mm}
\item Multivariate Normal
\begin{itemize}
\item $f(\mathbf{x}) = \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left\{ -\frac{1}{2} (\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right\}$
\item $E(\mathbf{X})= \boldsymbol{\mu}$, $cov(\mathbf{X}) = \boldsymbol{\Sigma}$
\item $(\mathbf{X}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)$
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{More properties of the multivariate normal}
\pause
%
\begin{itemize}
\item If $\mathbf{c}$ is a vector of constants, \pause $\mathbf{X}+\mathbf{c} \sim N(\mathbf{c}+\boldsymbol{\mu},\boldsymbol{\Sigma})$ \pause
\item If $\mathbf{A}$ is a matrix of constants, \pause $\mathbf{AX} \sim N(\mathbf{A}\boldsymbol{\mu},\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top)$ \pause
\item Linear combinations of multivariate normals are multivariate normal. \pause
\item All the marginals (dimension less than $p$) of $\mathbf{X}$ are (multivariate) normal\pause, but it is possible in theory to have a collection of univariate normals whose joint distribution is not multivariate normal. \pause
\item For the multivariate normal, zero covariance implies independence. \pause The multivariate normal is the only continuous distribution with this property.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An easy example}
\framesubtitle{If you do it the easy way}
\pause
Let $\mathbf{X}= (X_1,X_2,X_3)^\top$ be multivariate normal with \pause
\begin{displaymath}
\boldsymbol{\mu} = \left( \begin{array}{c} 1 \\ 0 \\ 6 \end{array} \right) \mbox{ and }
\boldsymbol{\Sigma} = \left( \begin{array}{c c c} 2 & 1 & 0 \\ 1 & 4 & 0 \\ 0 & 0 & 2 \end{array} \right) .
\end{displaymath} \pause
Let $Y_1=X_1+X_2$ and $Y_2=X_2+X_3$. Find the joint distribution of $Y_1$ and $Y_2$.
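% The answer can also be checked by simulation in R (a rough numerical check
% only; the seed and sample size are arbitrary):
% library(MASS)
% set.seed(9999)
% Sigma = rbind( c(2,1,0), c(1,4,0), c(0,0,2) )
% A = rbind( c(1,1,0), c(0,1,1) )
% X = mvrnorm(n = 100000, mu = c(1,0,6), Sigma = Sigma)
% Y = X %*% t(A)       # each row is (Y1, Y2)
% colMeans(Y)          # approximately (1, 6)
% var(Y)               # approximately rbind( c(8,5), c(5,6) )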
% Just for fun, check it with sage:
%mu = vector(QQ,[1,0,6]).column() # QQ is the rational field
%Sigma = matrix(QQ,[[2,1,0],[1,4,0],[0,0,2]])
%A = matrix(QQ,[[1,1,0],[0,1,1]])
%mu2 = A*mu; show(mu2)
%Sigma2 = A*Sigma*A.transpose(); show(Sigma2)
\end{frame}

\begin{frame}
\frametitle{In matrix terms}
\pause
$Y_1=X_1+X_2$ and $Y_2=X_2+X_3$ means $\mathbf{Y} = \mathbf{AX}$ \pause
\vspace{10mm}
\begin{displaymath}
\left( \begin{array}{c} Y_1 \\ Y_2 \end{array} \right) =
\left( \begin{array}{c c c} 1 & 1 & 0 \\ 0 & 1 & 1 \end{array} \right)
\left( \begin{array}{c} X_1 \\ X_2 \\ X_3 \end{array} \right)
\end{displaymath} \pause
\vspace{10mm}
$\mathbf{Y} = \mathbf{AX} \sim N(\mathbf{A}\boldsymbol{\mu},\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top)$
\end{frame}

\begin{frame}[fragile]
\frametitle{You could do it by hand, but}
\pause
%\framesubtitle{}
\begin{verbatim}
> mu = cbind(c(1,0,6))
> Sigma = rbind( c(2,1,0),
+                c(1,4,0),
+                c(0,0,2) )
> A = rbind( c(1,1,0),
+            c(0,1,1) ); A
> A %*% mu # E(Y)
     [,1]
[1,]    1
[2,]    6
> A %*% Sigma %*% t(A) # cov(Y)
     [,1] [,2]
[1,]    8    5
[2,]    5    6
\end{verbatim}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Regression}
%\framesubtitle{}
\begin{itemize}
\item[] $\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$, with $\boldsymbol{\epsilon} \sim N_n(\mathbf{0},\sigma^2\mathbf{I}_n)$. \pause
\item[] So $\mathbf{y} \sim N_n(\mathbf{X}\boldsymbol{\beta},\sigma^2\mathbf{I}_n)$. \pause
\item[] $\widehat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} \pause = \mathbf{Ay}$. \pause
\item[] So $\widehat{\boldsymbol{\beta}}$ is multivariate normal. \pause
\item[] Just calculate the mean and covariance matrix. \pause
\end{itemize}
% \vspace{3mm}
\begin{eqnarray*}
E(\widehat{\boldsymbol{\beta}}) \pause &=& E\left((\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} \right) \\ \pause
&=& (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top E(\mathbf{y}) \\ \pause
&=& (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{X}\boldsymbol{\beta} \\ \pause
&=& \boldsymbol{\beta}
\end{eqnarray*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Covariance matrix of $\widehat{\boldsymbol{\beta}}$}
\framesubtitle{Using $cov(\mathbf{Aw}) = \mathbf{A}cov(\mathbf{w}) \mathbf{A}^\top$}
\begin{eqnarray*}
cov(\widehat{\boldsymbol{\beta}}) \pause &=& cov\left((\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} \right) \\ \pause
&=& (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top {\color{red} cov(\mathbf{y}) } \left( (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \right)^\top \\ \pause
&=& (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top {\color{red} \sigma^2\mathbf{I}_n} \mathbf{X} (\mathbf{X}^\top \mathbf{X})^{-1\top} \\ \pause
&=& \sigma^2 (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top\mathbf{X} (\mathbf{X}^\top \mathbf{X})^{-1} \\ \pause
&=& \sigma^2 (\mathbf{X}^\top \mathbf{X})^{-1}
\end{eqnarray*}
\vspace{5mm}\pause
{\Large
So $\widehat{\boldsymbol{\beta}} \sim N_p\left(\boldsymbol{\beta}, \sigma^2 (\mathbf{X}^\top \mathbf{X})^{-1}\right)$
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{A couple of things to prove}
\begin{itemize}
\item $(\mathbf{X}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)$
\vspace{10mm}
\item $\overline{X}$ and $S^2$ independent under normal random sampling.
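% The sampling distribution of betahat derived above can be explored by
% simulation in R. A minimal sketch with a made-up design matrix (all numbers
% are illustrative):
% set.seed(32448)
% n = 50; X = cbind(1, rnorm(n), rnorm(n))     # fixed design with an intercept
% beta = c(1, 0.5, -0.5); sigma = 2
% betahat = replicate(10000, {
%   y = X %*% beta + rnorm(n, 0, sigma)
%   c( solve( t(X) %*% X, t(X) %*% y ) )       # (X'X)^{-1} X'y as a plain vector
% })
% var(t(betahat))                # empirical covariance matrix of betahat
% sigma^2 * solve( t(X) %*% X )  # theoretical sigma^2 (X'X)^{-1}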
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Recall the square root matrix}
\pause
The covariance matrix $\boldsymbol{\Sigma}$ is a real symmetric matrix, so we have the spectral decomposition \pause
\begin{eqnarray*}
\boldsymbol{\Sigma} & = & \mathbf{P}\boldsymbol{\Lambda}\mathbf{P}^\top \\ \pause
& = & \mathbf{P}\boldsymbol{\Lambda}^{1/2}\boldsymbol{\Lambda}^{1/2} \mathbf{P}^\top \\ \pause
& = & \mathbf{P}\boldsymbol{\Lambda}^{1/2} \, \mathbf{I} \, \boldsymbol{\Lambda}^{1/2} \mathbf{P}^\top \\ \pause
& = & \mathbf{P}\boldsymbol{\Lambda}^{1/2} \mathbf{P}^\top~~\mathbf{P} \boldsymbol{\Lambda}^{1/2} \mathbf{P}^\top \\ \pause
& = & ~~~\boldsymbol{\Sigma}^{1/2}~~~~~~~\boldsymbol{\Sigma}^{1/2}
\end{eqnarray*} \pause
\vspace{10mm}
So $\boldsymbol{\Sigma}^{1/2} = \mathbf{P}\boldsymbol{\Lambda}^{1/2} \mathbf{P}^\top$
\end{frame}

\begin{frame}
\frametitle{Square root of an inverse}
\framesubtitle{Positive definite $\Rightarrow$ Positive eigenvalues $\Rightarrow$ Inverse exists}
\pause
$\mathbf{P}\boldsymbol{\Lambda}^{-1/2}\mathbf{P}^\top ~\cdot~ \mathbf{P}\boldsymbol{\Lambda}^{-1/2}\mathbf{P}^\top = \mathbf{P}\boldsymbol{\Lambda}^{-1} \mathbf{P}^\top = \boldsymbol{\Sigma}^{-1}$, \pause
\vspace{5mm}
so
\vspace{5mm}
$\left(\boldsymbol{\Sigma}^{-1} \right)^{1/2} = \mathbf{P}\boldsymbol{\Lambda}^{-1/2} \mathbf{P}^\top$. \pause
\vspace{10mm}
It's easy to show
\begin{itemize}
\item $\left(\boldsymbol{\Sigma}^{-1} \right)^{1/2}$ is the inverse of $\boldsymbol{\Sigma}^{1/2}$ \pause
\item Justifying the notation $\boldsymbol{\Sigma}^{-1/2}$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Now we can show $(\mathbf{X}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)$}
\framesubtitle{Where $\mathbf{X} \sim N(\boldsymbol{\mu},\boldsymbol{\Sigma})$}
\pause
\begin{eqnarray*}
\mathbf{Y} = \mathbf{X}-\boldsymbol{\mu} & \sim & N\left(\mathbf{0},\ \boldsymbol{\Sigma}\right) \\ \pause
\mathbf{Z} = \boldsymbol{\Sigma}^{-\frac{1}{2}} \mathbf{Y} & \sim & N\left(\mathbf{0}, \boldsymbol{\Sigma}^{-\frac{1}{2}} \boldsymbol{\Sigma} \boldsymbol{\Sigma}^{-\frac{1}{2}} \right) \\ \pause
& = & N\left(\mathbf{0}, \boldsymbol{\Sigma}^{-\frac{1}{2}} \boldsymbol{\Sigma}^{\frac{1}{2}} ~ \boldsymbol{\Sigma}^{\frac{1}{2}} \boldsymbol{\Sigma}^{-\frac{1}{2}} \right) \\ \pause
& = & N\left(\mathbf{0}, \mathbf{I}\right) \pause
\end{eqnarray*}
So $\mathbf{Z}$ is a vector of $p$ independent standard normals\pause, and
\begin{displaymath}
\mathbf{Y}^\top \boldsymbol{\Sigma}^{-1} \mathbf{Y} \pause = \mathbf{Z}^\top \mathbf{Z} \pause = \sum_{j=1}^p Z_j^2 \pause \sim \chi^2(p) \pause ~~~~~~~~~~ \blacksquare
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{$\overline{X}$ and $S^2$ independent}
Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} N(\mu,\sigma^2)$.
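% The spectral square root from the slides above is easy to compute in R.
% A short sketch with an illustrative Sigma:
% Sigma = rbind( c(2,1,0), c(1,4,0), c(0,0,2) )
% eig = eigen(Sigma)                    # Sigma = P Lambda P'
% P = eig$vectors
% SigmaHalf      = P %*% diag(sqrt(eig$values))   %*% t(P)
% SigmaMinusHalf = P %*% diag(1/sqrt(eig$values)) %*% t(P)
% SigmaHalf %*% SigmaHalf               # recovers Sigma
% SigmaHalf %*% SigmaMinusHalf          # the identity matrix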
\pause
\begin{displaymath}
\begin{array}{lcl}
\mathbf{X} = \left( \begin{array}{c} X_1 \\ \vdots \\ X_n \end{array} \right) \sim N\left(\mu\mathbf{1},\sigma^2\mathbf{I} \right)
&~~~~&
\pause
\mathbf{Y} = \left( \begin{array}{c} X_1-\overline{X} \\ \vdots \\ X_{n-1}-\overline{X} \\\\ \overline{X} \end{array} \right) \pause = \mathbf{AX}
\end{array}
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{$\mathbf{Y} = \mathbf{AX}$}
\framesubtitle{In more detail}
\begin{displaymath}
% \mathbf{AX} =
\left( \begin{array}{rrcrr}
1-\frac{1}{n} & -\frac{1}{n} & \cdots & -\frac{1}{n} & -\frac{1}{n} \\
 & & & & \\
-\frac{1}{n} & 1-\frac{1}{n} & \cdots & -\frac{1}{n} & -\frac{1}{n} \\
\vdots\, & \vdots\, & \vdots\, & \vdots\, & \vdots\, \\
-\frac{1}{n} & -\frac{1}{n} & \cdots &1-\frac{1}{n} & -\frac{1}{n} \\
 & & & & \\
\frac{1}{n} & \frac{1}{n} & \cdots & \frac{1}{n} & \frac{1}{n} \\
\end{array} \right)
\left( \begin{array}{c} X_1 \\ \\ X_2 \\ \vdots \\ X_{n-1} \\ \\ X_n \end{array} \right) =
\left( \begin{array}{c} X_1-\overline{X} \\ \\ X_2-\overline{X} \\ \vdots \\ X_{n-1}-\overline{X} \\\\ \overline{X} \end{array} \right)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{The argument}
\begin{displaymath}
\mathbf{Y} = \mathbf{AX} =
\left( \begin{array}{c} X_1-\overline{X} \\ \vdots \\ X_{n-1}-\overline{X} \\\\ \overline{X} \end{array} \right) =
\left( \begin{array}{c} \\\\ \mathbf{Y}_2 \\\\ \hline \\ \overline{X} \end{array} \right)
\end{displaymath} \pause
\begin{itemize}
\item $\mathbf{Y}$ is multivariate normal. \pause
\item $Cov\left(\overline{X},(X_j-\overline{X})\right)=0$ (Exercise) \pause
\item So $\overline{X}$ and $\mathbf{Y}_2$ are independent. \pause
\item So $\overline{X}$ and $S^2 = g(\mathbf{Y}_2)$ are independent. ~~$\blacksquare$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Leads to the $t$ distribution}
\pause
%\framesubtitle{}
If
\begin{itemize}
\item $Z \sim N(0,1)$ and \pause
\item $Y \sim \chi^2(\nu)$ and \pause
\item $Z$ and $Y$ are independent, then \pause
\end{itemize}
\begin{displaymath}
T = \frac{Z}{\sqrt{Y/\nu}} \pause \sim t(\nu)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Random sample from a normal distribution}
\pause
Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} N(\mu,\sigma^2)$.
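% The independence of Xbar and S^2 claimed above can be illustrated by
% simulation in R (this only shows near-zero correlation, not independence
% itself; the numbers are arbitrary):
% set.seed(4444)
% sims = replicate(20000, { x = rnorm(10, mean = 5, sd = 2); c(mean(x), var(x)) })
% cor(sims[1, ], sims[2, ])    # close to zero under normal sampling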
\pause Then
\begin{itemize}
\item $\frac{\sqrt{n}(\overline{X}-\mu)}{\sigma} \pause = \frac{(\overline{X}-\mu)}{\sigma/\sqrt{n}} \pause \sim N(0,1)$ \pause and
\item $\frac{(n-1)S^2}{\sigma^2} \pause \sim \chi^2(n-1)$ \pause and
\item These quantities are independent\pause, so \pause
\begin{eqnarray*}
T & = & \frac{\sqrt{n}(\overline{X}-\mu)/\sigma} {\sqrt{\frac{(n-1)S^2}{\sigma^2}/(n-1)}} \\ \pause
&&\\
& = & \frac{\sqrt{n}(\overline{X}-\mu)}{S} \pause \sim t(n-1)
\end{eqnarray*}
\end{itemize}
\end{frame}

% Last in section
\begin{frame}
\frametitle{Multivariate normal likelihood}
\framesubtitle{For reference}
\pause
{\footnotesize
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) &=& \prod_{i=1}^n \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left\{ -\frac{1}{2} (\mathbf{x}_i-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}_i-\boldsymbol{\mu})\right\} \\ \pause
&&\\
&=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\}\pause,
\end{eqnarray*}
} where $\boldsymbol{\widehat{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}}) (\mathbf{x}_i-\overline{\mathbf{x}})^\top $ is the sample variance-covariance matrix.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Delta Method}

\begin{frame}
\frametitle{The Multivariate Delta Method}
\framesubtitle{An application}
\pause
The univariate delta method says that if $\sqrt{n}\left( T_n- \theta \right) \stackrel{d}{\rightarrow} T$, \pause then $\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} g^\prime(\theta) \, T$. \pause
\vspace{3mm}
In the multivariate delta method, $\mathbf{T}_n$ and $\mathbf{T}$ are $d$-dimensional random vectors. \pause
\vspace{3mm}
The function $g: \mathbb{R}^d \rightarrow \mathbb{R}^k$ \pause is a vector of functions: \pause
\begin{displaymath}
g(x_1, \ldots, x_d) = \pause \left( \begin{array}{c} g_1(x_1, \ldots, x_d) \\ \vdots \\ g_k(x_1, \ldots, x_d) \end{array} \right)
\end{displaymath} \pause
\vspace{3mm}
$g^\prime(\theta)$ is replaced by a matrix of partial derivatives (a Jacobian): \pause
\begin{center}
\.{g}$(x_1, \ldots, x_d) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$ \pause like
$\left(\begin{array}{ccc} \frac{\partial g_1}{\partial x_1} & \frac{\partial g_1}{\partial x_2} & \frac{\partial g_1}{\partial x_3} \\ \frac{\partial g_2}{\partial x_1} & \frac{\partial g_2}{\partial x_2} & \frac{\partial g_2}{\partial x_3} \\ \end{array}\right)$.
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{The Delta Method}
\framesubtitle{Univariate and multivariate}
The univariate delta method says that if $\sqrt{n}\left( T_n- \theta \right) \stackrel{d}{\rightarrow} T$, then $\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} g^\prime(\theta) \, T$.
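% The two forms of the likelihood above can be checked against each other
% numerically in R. A sketch that assumes the mvtnorm package is available;
% the data and the (mu, Sigma) used for evaluation are purely illustrative.
% set.seed(777)
% X = MASS::mvrnorm(200, mu = c(1,0,6), Sigma = rbind(c(2,1,0),c(1,4,0),c(0,0,2)))
% n = nrow(X); p = ncol(X); xbar = colMeans(X); Sigmahat = var(X) * (n-1)/n
% mu0 = c(0,0,0); Sigma0 = diag(3)
% ll1 = sum( mvtnorm::dmvnorm(X, mean = mu0, sigma = Sigma0, log = TRUE) )
% ll2 = -n/2 * log(det(Sigma0)) - n*p/2 * log(2*pi) -
%       n/2 * ( sum(diag(Sigmahat %*% solve(Sigma0))) +
%               t(xbar - mu0) %*% solve(Sigma0) %*% (xbar - mu0) )
% c(ll1, as.numeric(ll2))      # the two log likelihoods agree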
\pause
\vspace{5mm}
The multivariate delta method says that if $\sqrt{n}(\mathbf{T}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{T}$, then $\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mbox{\.{g}} (\boldsymbol{\theta}) \mathbf{T}$, \pause
\vspace{5mm}
where \.{g}$(x_1, \ldots, x_d) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$. \pause
\vspace{5mm}
In particular, if $ \mathbf{T} \sim N(\mathbf{0},\mathbf{\Sigma})$\pause, then
\begin{center}
$\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \pause \mathbf{Y} \sim N(\mathbf{0}, \mbox{\.{g}}(\boldsymbol{\theta})\mathbf{\Sigma}\mbox{\.{g}}(\boldsymbol{\theta}) ^\top)$.
\end{center}
\end{frame}

\begin{frame}
\frametitle{Testing a non-linear hypothesis}
%\framesubtitle{}
Consider the regression model $y_i = \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2} + \epsilon_i$. \pause
\begin{itemize}
\item[] There is a standard $F$-test for $H_0: \mathbf{L}\boldsymbol{\beta} = \mathbf{h}$. \pause
\item[] So testing whether $\beta_1=0$ \underline{and} $\beta_2=0$ is easy. \pause
\item[] But what about testing whether $\beta_1=0$ \underline{or} $\beta_2=0$ (or both)? \pause
\item[] If $H_0: \beta_1\beta_2 = 0$ is rejected, it means that \emph{both} regression coefficients are non-zero. \pause
\item[] We can't test non-linear null hypotheses like this with the standard tools. \pause
\item[] But if the sample size is large we can use the delta method.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{The asymptotic distribution of $\widehat{\beta}_1\widehat{\beta}_2$}
\pause
%\framesubtitle{}
{\small
The multivariate delta method says that if $\sqrt{n}(\mathbf{T}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{T}$, then $\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mbox{\.{g}} (\boldsymbol{\theta}) \mathbf{T}$. \pause
\vspace{3mm}
We know $\widehat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} \sim N_p\left(\boldsymbol{\beta}, \sigma^2 (\mathbf{X}^\top \mathbf{X})^{-1}\right)$. \pause
\vspace{3mm}
So $\sqrt{n}(\widehat{\boldsymbol{\beta}}_n-\boldsymbol{\beta}) \stackrel{d}{\rightarrow} \mathbf{T} \sim N(\mathbf{0},\boldsymbol{\Sigma})$\pause, where $\boldsymbol{\Sigma} = \pause \lim_{n \rightarrow \infty} \sigma^2 \left(\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1}$. \pause
\vspace{3mm}
Let $g(\boldsymbol{\beta}) = \beta_1\beta_2$. Then \pause
\begin{eqnarray*}
\sqrt{n}(g(\widehat{\boldsymbol{\beta}}_n)-g(\boldsymbol{\beta})) & = & \sqrt{n}( \widehat{\beta}_1\widehat{\beta}_2 - \beta_1\beta_2) \\ \pause
& \stackrel{d}{\rightarrow} & \mbox{\.{g}} (\boldsymbol{\beta}) \mathbf{T} \\ \pause
& = & T \pause \sim \pause N(0,\mbox{\.{g}}(\boldsymbol{\beta}) \boldsymbol{\Sigma} \mbox{\.{g}}(\boldsymbol{\beta})^\top)
\end{eqnarray*} \pause
We will say $\widehat{\beta}_1\widehat{\beta}_2$ is asymptotically $N\left(\beta_1\beta_2,\frac{1}{n}\mbox{\.{g}}(\boldsymbol{\beta}) \boldsymbol{\Sigma} \mbox{\.{g}}(\boldsymbol{\beta})^\top\right)$. \pause
\vspace{4mm}
Need \.{g}$(\boldsymbol{\beta})$.
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{\.{g}$(x_1, \ldots, x_d) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$}
\pause
%\framesubtitle{}
$g(\beta_0,\beta_1,\beta_2) = \beta_1\beta_2$ \pause so $d=3$ and $k=1$. \pause
\begin{eqnarray*}
\mbox{\.{g}}(\beta_0,\beta_1,\beta_2) & = & \left(\frac{\partial g}{\partial \beta_0}, \frac{\partial g}{\partial \beta_1}, \frac{\partial g}{\partial \beta_2}\right) \\ \pause
& = & (0, \pause \beta_2, \pause \beta_1 )
\end{eqnarray*} \pause
\vspace{5mm}
So $\widehat{\beta}_1\widehat{\beta}_2 \stackrel{\cdot}{\sim} \pause N\left(\beta_1\beta_2,\frac{1}{n}(0, \beta_2, \beta_1 ) \boldsymbol{\Sigma} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right) \right)$.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Need the standard error}
\pause
%\framesubtitle{}
We have $\widehat{\beta}_1\widehat{\beta}_2 \stackrel{\cdot}{\sim} N\left(\beta_1\beta_2,\frac{1}{n}(0, \beta_2, \beta_1 ) \boldsymbol{\Sigma} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right) \right)$. \pause
\vspace{5mm}
\begin{itemize}
\item[] Denote the asymptotic variance by $\frac{1}{n}(0, \beta_2, \beta_1 ) \boldsymbol{\Sigma} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right) \pause = v$. \pause
\item[] If we knew $v$ \pause we could compute $Z = \frac{\widehat{\beta}_1\widehat{\beta}_2 - \beta_1\beta_2}{\sqrt{v}}$ \pause
\item[] And use it in tests and confidence intervals. \pause
\item[] Need to estimate $v$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Standard error}
\framesubtitle{Estimated standard deviation of $\widehat{\beta}_1\widehat{\beta}_2$}
\pause
\begin{displaymath}
v = \frac{1}{n}(0, \beta_2, \beta_1 ) \boldsymbol{\Sigma} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right)
\end{displaymath} \pause
where $\boldsymbol{\Sigma} = \lim_{n \rightarrow \infty} \sigma^2 \left(\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1}$. \pause
\begin{itemize}
\item[] Estimate $\beta_1$ and $\beta_2$ with $\widehat{\beta}_1$ and $\widehat{\beta}_2$ \pause
\item[] Estimate $\sigma^2$ with $MSE = \mathbf{e}^\top\mathbf{e}/(n-p)$.
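% The test statistic assembled on the next slides can be computed in R from a
% fitted regression. A hedged sketch, assuming an lm object called mod from
% something like mod = lm(y ~ x1 + x2); the object and variable names are
% illustrative only.
% betahat = coef(mod)                          # (b0hat, b1hat, b2hat)
% gdot = c(0, betahat[3], betahat[2])          # gradient of g(beta) = beta1*beta2
% vhat = as.numeric( t(gdot) %*% vcov(mod) %*% gdot )   # vcov(mod) = MSE (X'X)^{-1}
% Z = as.numeric( betahat[2] * betahat[3] / sqrt(vhat) )
% pvalue = 2 * (1 - pnorm(abs(Z)))             # two-sided test of H0: beta1 beta2 = 0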
\pause
\item[] Approximate $\frac{1}{n}\boldsymbol{\Sigma}$ with \pause
\end{itemize}
\begin{eqnarray*}
MSE \frac{1}{n} \left(\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1} \pause & = & MSE \left(n\frac{1}{n}\mathbf{X}^\top \mathbf{X}\right)^{-1} \\ \pause
& = & MSE \left(\mathbf{X}^\top \mathbf{X}\right)^{-1}
\end{eqnarray*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{$\widehat{v}$ approximates $v$}
\pause
%\framesubtitle{}
{\LARGE
\begin{eqnarray*}
v &=& \frac{1}{n}(0, \beta_2, \beta_1 ) \boldsymbol{\Sigma} \left(\begin{array}{c} 0 \\ \beta_2 \\ \beta_1 \end{array} \right) \\ \pause
&~&\\
\widehat{v} &=& MSE \, (0, \widehat{\beta}_2, \widehat{\beta}_1 ) \left(\mathbf{X}^\top \mathbf{X}\right)^{-1} \left(\begin{array}{c} 0 \\ \widehat{\beta}_2 \\ \widehat{\beta}_1 \end{array} \right)
\end{eqnarray*}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Test statistic for $H_0:\beta_1\beta_2=0$}
\pause
%\framesubtitle{}
{\LARGE
\begin{displaymath}
Z = \frac{\widehat{\beta}_1\widehat{\beta}_2 - 0}{\sqrt{\widehat{v}}}
\end{displaymath} \pause
} % End size
where
\begin{displaymath}
\widehat{v} = (0, \widehat{\beta}_2, \widehat{\beta}_1 ) {\color{red} MSE \left(\mathbf{X}^\top \mathbf{X}\right)^{-1} } % End color
\left(\begin{array}{c} 0 \\ \widehat{\beta}_2 \\ \widehat{\beta}_1 \end{array} \right)
\end{displaymath} \pause
\vspace{5mm}
Note $MSE\left(\mathbf{X}^\top \mathbf{X}\right)^{-1}$ is produced by R's \texttt{vcov} function.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf18} {\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf18}}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\end{document}