\documentclass[serif]{beamer} % Get Computer Modern math font.
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Random Vectors Part One\footnote{See last slide for copyright information.}}
\subtitle{STA431 Winter/Spring 2013}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Definitions and Basic Results}

\begin{frame}
\frametitle{Random Vectors and Matrices}
%\framesubtitle{}
A \emph{random matrix} is just a matrix of random variables. The joint probability distribution of its elements is the distribution of the random matrix. Random matrices with just one column (say, $p \times 1$) may be called \emph{random vectors}.
\end{frame}

\begin{frame}
\frametitle{Expected Value}
%\framesubtitle{}
The expected value of a matrix is defined as the matrix of expected values. Denoting the $p \times c$ random matrix $\mathbf{X}$ by $[X_{i,j}]$,
\begin{displaymath}
E(\mathbf{X}) = [E(X_{i,j})].
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Immediately we have natural properties like}
%\framesubtitle{}
\begin{eqnarray}
E(\mathbf{X}+\mathbf{Y}) &=& E([X_{i,j}]+[Y_{i,j}]) \nonumber \\
&=& [E(X_{i,j}+Y_{i,j})] \nonumber \\
&=& [E(X_{i,j})+E(Y_{i,j})] \nonumber \\
&=& [E(X_{i,j})]+[E(Y_{i,j})] \nonumber \\
&=& E(\mathbf{X})+E(\mathbf{Y}). \nonumber
\end{eqnarray}
\end{frame}

\begin{frame}
\frametitle{Moving a constant through the expected value sign}
Let $\mathbf{A} = [a_{i,j}]$ be an $r \times p$ matrix of constants, while $\mathbf{X}$ is still a $p \times c$ random matrix. Then
\begin{eqnarray}
E(\mathbf{AX}) &=& E\left(\left[\sum_{k=1}^p a_{i,k}X_{k,j}\right]\right) \nonumber \\
&=& \left[E\left(\sum_{k=1}^p a_{i,k}X_{k,j}\right)\right] \nonumber \\
&=& \left[\sum_{k=1}^p a_{i,k}E(X_{k,j})\right] \nonumber \\
&=& \mathbf{A}E(\mathbf{X}). \nonumber
\end{eqnarray}
Similar calculations yield $E(\mathbf{AXB}) = \mathbf{A}E(\mathbf{X})\mathbf{B}$.
\end{frame}

\begin{frame}
\frametitle{Variance-Covariance Matrices}
Let $\mathbf{X}$ be a $p \times 1$ random vector with $E(\mathbf{X}) = \boldsymbol{\mu}$. The \emph{variance-covariance matrix} of $\mathbf{X}$ (sometimes just called the \emph{covariance matrix}), denoted by $V(\mathbf{X})$, is defined as
\begin{displaymath}
V(\mathbf{X}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\prime\right\}.
\end{displaymath} \end{frame} \begin{frame} \frametitle{$V(\mathbf{X}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\prime\right\}$} {\scriptsize \begin{eqnarray} V(\mathbf{X}) &=& E\left\{ \left[ \begin{array}{c} X_1-\mu_1 \\ X_2-\mu_2 \\ X_3-\mu_3 \end{array} \right] \left[ \begin{array}{c c c} X_1-\mu_1 & X_2-\mu_2 & X_3-\mu_3 \end{array} \right] \right\} \nonumber \\ &=& E\left\{ \left[ \begin{array}{l l l} (X_1-\mu_1)^2 & (X_1-\mu_1)(X_2-\mu_2) & (X_1-\mu_1)(X_3-\mu_3) \\ (X_2-\mu_2)(X_1-\mu_1) & (X_2-\mu_2)^2 & (X_2-\mu_2)(X_3-\mu_3) \\ (X_3-\mu_3)(X_1-\mu_1) & (X_3-\mu_3)(X_2-\mu_2) & (X_3-\mu_3)^2 \\ \end{array} \right] \right\} \nonumber \\ \nonumber \\ &=& \left[ \begin{array}{l l l} E\{(X_1-\mu_1)^2\} & E\{(X_1-\mu_1)(X_2-\mu_2)\} & E\{(X_1-\mu_1)(X_3-\mu_3)\} \\ E\{(X_2-\mu_2)(X_1-\mu_1)\} & E\{(X_2-\mu_2)^2\} & E\{(X_2-\mu_2)(X_3-\mu_3)\} \\ E\{(X_3-\mu_3)(X_1-\mu_1)\} & E\{(X_3-\mu_3)(X_2-\mu_2)\} & E\{(X_3-\mu_3)^2\} \\ \end{array} \right] \nonumber \\ \nonumber \\ &=& \left[ \begin{array}{l l l} V(X_1) & Cov(X_1,X_2) & Cov(X_1,X_3) \\ Cov(X_1,X_2) & V(X_2) & Cov(X_2,X_3) \\ Cov(X_1,X_3) & Cov(X_2,X_3) & V(X_3) \\ \end{array} \right] . \nonumber \\ \nonumber \end{eqnarray} So, the covariance matrix $V(\mathbf{X})$ is a $p \times p$ symmetric matrix with variances on the main diagonal and covariances on the off-diagonals. } \end{frame} \begin{frame} \frametitle{Analogous to $Var(a\,X) = a^2\,Var(X)$} Let $\mathbf{X}$ be a $p \times 1$ random vector with $E(\mathbf{X}) = \boldsymbol{\mu}$ and $V(\mathbf{X}) = \boldsymbol{\Sigma}$, while $\mathbf{A} = [a_{i,j}]$ is an $r \times p$ matrix of constants. Then \begin{eqnarray*} \label{vax} V(\mathbf{AX}) &=& E\left\{ (\mathbf{AX}-\mathbf{A}\boldsymbol{\mu}) (\mathbf{AX}-\mathbf{A}\boldsymbol{\mu})^\prime \right\} \\ &=& E\left\{ \mathbf{A}(\mathbf{X}-\boldsymbol{\mu}) \left(\mathbf{A}(\mathbf{X}-\boldsymbol{\mu})\right)^\prime \right\} \\ &=& E\left\{ \mathbf{A}(\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\prime \mathbf{A}^\prime \right\} \nonumber \\ &=& \mathbf{A}E\{(\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\prime\} \mathbf{A}^\prime \\ &=& \mathbf{A}V(\mathbf{X}) \mathbf{A}^\prime \nonumber \\ &=& \mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\prime \end{eqnarray*} \end{frame} \begin{frame} \frametitle{Matrix of covariances between two random vectors} Let $\mathbf{X}$ be a $p \times 1$ random vector with $E(\mathbf{X}) = \boldsymbol{\mu}_x$ and let $\mathbf{Y}$ be a $q \times 1$ random vector with $E(\mathbf{Y}) = \boldsymbol{\mu}_y$. The $p \times q$ matrix of covariances between the elements of $\mathbf{X}$ and the elements of $\mathbf{Y}$ is \begin{displaymath} C(\mathbf{X,Y}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}_x) (\mathbf{Y}-\boldsymbol{\mu}_y)^\prime\right\}. 
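\end{displaymath}
\end{frame}

\begin{frame}[fragile]
\frametitle{Optional: checking $V(\mathbf{AX}) = \mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\prime$ by simulation}
Not a proof, just a quick numerical sanity check in base R. The seed, sample size,
$\boldsymbol{\Sigma}$ and $\mathbf{A}$ below are arbitrary choices, not part of the development.
{\footnotesize
\begin{verbatim}
set.seed(431)                      # arbitrary seed
n     <- 100000                    # arbitrary (large) simulation size
Sigma <- rbind( c(4,1,1),
                c(1,9,2),
                c(1,2,5) )         # any positive definite matrix
A     <- rbind( c(1,0,1),
                c(0,2,-1) )        # any 2 x 3 matrix of constants
Z     <- matrix(rnorm(n*3), n, 3)  # rows have expected value 0 and V = I
X     <- Z %*% chol(Sigma)         # rows of X now have V(X) = Sigma
var(X %*% t(A))                    # sample version of V(AX) ...
A %*% Sigma %*% t(A)               # ... should be close to this
\end{verbatim}
} % End size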
\end{frame}

\begin{frame}
\frametitle{Adding a constant has no effect}
\framesubtitle{On variances and covariances}
It's clear from the definitions
\begin{itemize}
\item $V(\mathbf{X}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\prime\right\}$
\item $C(\mathbf{X,Y}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}_x) (\mathbf{Y}-\boldsymbol{\mu}_y)^\prime\right\}$
\end{itemize}
that
\begin{itemize}
\item $ V(\mathbf{X} + \mathbf{a}) = V(\mathbf{X})$
\item $C(\mathbf{X} + \mathbf{a},\mathbf{Y} + \mathbf{b}) = C(\mathbf{X},\mathbf{Y})$
\end{itemize}
\vspace{5mm}
For example, $E(\mathbf{X} + \mathbf{a}) = \boldsymbol{\mu} + \mathbf{a}$, so
\begin{eqnarray*}
V(\mathbf{X} + \mathbf{a}) & = & E\left\{ (\mathbf{X}+\mathbf{a}-(\boldsymbol{\mu}+\mathbf{a})) (\mathbf{X}+\mathbf{a}-(\boldsymbol{\mu}+\mathbf{a}))^\prime\right\} \\
& = & E\left\{ (\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\prime\right\} \\
& = & V(\mathbf{X})
\end{eqnarray*}
\end{frame}

\section{The Centering Rule}

\begin{frame}
\frametitle{The Centering Rule}
\framesubtitle{Using $V(\mathbf{X} + \mathbf{a}) = V(\mathbf{X})$}
Often, variance and covariance calculations can be simplified by subtracting off constants first. Denote the \emph{centered} version of $\mathbf{X}$ by $\stackrel{c}{\mathbf{X}} = \mathbf{X} - E(\mathbf{X})$, so that
\begin{itemize}
\item $E(\stackrel{c}{\mathbf{X}})=\mathbf{0}$ and
\item $V(\stackrel{c}{\mathbf{X}}) = E(\stackrel{c}{\mathbf{X}}\stackrel{c}{\mathbf{X}} \stackrel{\prime}{\vphantom{~}}) = V(\mathbf{X})$
\end{itemize}
The centering rule is a general version of this.
\end{frame}

\begin{frame}
\frametitle{Linear combinations}
%\framesubtitle{}
\begin{eqnarray*}
\mathbf{L} & = & \mathbf{A}_1\mathbf{X}_1 + \cdots + \mathbf{A}_m\mathbf{X}_m + \mathbf{b} \\
\stackrel{c}{\mathbf{L}} & = & \mathbf{A}_1 \stackrel{c}{\mathbf{X}}_1 + \cdots + \mathbf{A}_m \stackrel{c}{\mathbf{X}}_m,\mbox{ where} \\
\stackrel{c}{\mathbf{X}}_j & = & \mathbf{X}_j - E(\mathbf{X}_j) \mbox{ for } j=1,\ldots,m.
\end{eqnarray*}
\vspace{5mm}
The centering rule says
\vspace{5mm}
\begin{eqnarray*}
V(\mathbf{L}) & = & V(\stackrel{c}{\mathbf{L}}) \\
C(\mathbf{L}_1,\mathbf{L}_2) & = & C(\stackrel{c}{\mathbf{L}}_1,\stackrel{c}{\mathbf{L}}_2)
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Example: $V(\mathbf{X}+\mathbf{Y})$}
\framesubtitle{Using the centering rule}
\begin{eqnarray*}
V(\mathbf{X}+\mathbf{Y}) & = & V(\stackrel{c}{\mathbf{X}}+\stackrel{c}{\mathbf{Y}}) \\
& = & E\left\{ (\stackrel{c}{\mathbf{X}}+\stackrel{c}{\mathbf{Y}}) (\stackrel{c}{\mathbf{X}}+\stackrel{c}{\mathbf{Y}})^\prime \right\} \\
& = & E\left\{ (\stackrel{c}{\mathbf{X}}+\stackrel{c}{\mathbf{Y}}) (\stackrel{c}{\mathbf{X}}\stackrel{\prime}{\vphantom{~}} + \stackrel{c}{\mathbf{Y}}\stackrel{\prime}{\vphantom{~}}) \right\} \\
& = & E(\stackrel{c}{\mathbf{X}}\stackrel{c}{\mathbf{X}} \stackrel{\prime}{\vphantom{~}}) + E(\stackrel{c}{\mathbf{Y}}\stackrel{c}{\mathbf{Y}} \stackrel{\prime}{\vphantom{~}}) + E(\stackrel{c}{\mathbf{X}}\stackrel{c}{\mathbf{Y}} \stackrel{\prime}{\vphantom{~}}) + E(\stackrel{c}{\mathbf{Y}}\stackrel{c}{\mathbf{X}} \stackrel{\prime}{\vphantom{~}}) \\
& = & V(\mathbf{X}) + V(\mathbf{Y}) + C(\mathbf{X},\mathbf{Y}) + C(\mathbf{Y},\mathbf{X})
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Example: $Cov(\overline{X},X_j-\overline{X}) = 0$}
\framesubtitle{Scalar calculation using the centering rule}
Let $X_1, \ldots, X_n$ be a random sample from a distribution with mean $\mu$ and variance $\sigma^2$.
Since $\overline{X}$ and $X_j-\overline{X}$ are both linear combinations, {\small \begin{eqnarray*} Cov(\overline{X},X_j-\overline{X}) & = & Cov( \stackrel{c}{\overline{X}}, \stackrel{c}{X}_j-\stackrel{c}{\overline{X}}) \\ & = & E\left( \stackrel{c}{\overline{X}} (\stackrel{c}{X}_j-\stackrel{c}{\overline{X}})\right) \\ & = & E\left( \stackrel{c}{X}_j \stackrel{c}{\overline{X}}\right) - E\left(\stackrel{c}{\overline{X}}^2\right) \\ & = & E\left( \stackrel{c}{X}_j\frac{1}{n} \sum_{i=1}^n \stackrel{c}{X}_i\right) - Var\left(\stackrel{c}{\overline{X}}\right) \\ & = & {\color{red}E\left(\frac{1}{n} \sum_{i=1}^n \stackrel{c}{X}_i \stackrel{c}{X}_j\right) - Var\left(\overline{X}\right) }\\ % Continued on next slide \end{eqnarray*} } % End size \end{frame} \begin{frame} \frametitle{Calculation continued} %{\tiny \begin{eqnarray*} & = & {\color{red}E\left(\frac{1}{n} \sum_{i=1}^n \stackrel{c}{X}_i \stackrel{c}{X}_j\right) - Var\left(\overline{X}\right) }\\ & = & \frac{1}{n} \sum_{i=1}^n E\left(\stackrel{c}{X}_i \stackrel{c}{X}_j\right) - \frac{\sigma^2}{n} \\ & = & \frac{1}{n} E\left(\stackrel{c}{X}_j^2\right) + \frac{1}{n} \sum_{i \neq j} E\left(\stackrel{c}{X}_i\right) E\left(\stackrel{c}{X}_j\right) - \frac{\sigma^2}{n} \\ & = & \frac{1}{n} Var\left(\stackrel{c}{X}_j\right) - \frac{\sigma^2}{n} \\ & = & \frac{1}{n} Var(X_j) - \frac{\sigma^2}{n} \\ & = & \frac{\sigma^2}{n} - \frac{\sigma^2}{n} \\ & = & 0 \end{eqnarray*} %} % End size \end{frame} \section{Multivariate Normal} \begin{frame} \frametitle{The Multivariate Normal Distribution} The $p \times 1$ random vector $\mathbf{X}$ is said to have a \emph{multivariate normal distribution}, and we write $\mathbf{X} \sim N(\boldsymbol{\mu},\boldsymbol{\Sigma})$, if $\mathbf{X}$ has (joint) density \begin{displaymath} f(\mathbf{x}) = \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left[ -\frac{1}{2} (\mathbf{x}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right], \end{displaymath} where $\boldsymbol{\mu}$ is $p \times 1$ and $\boldsymbol{\Sigma}$ is $p \times p$ symmetric and positive definite. \end{frame} \begin{frame} \frametitle{$\boldsymbol{\Sigma}$ positive definite} \begin{itemize} \item Positive definite means that for any non-zero $p \times 1$ vector $\mathbf{a}$, we have $\mathbf{a}^\prime \boldsymbol{\Sigma} \mathbf{a} > 0$. \item Since the one-dimensional random variable $Y=\sum_{i=1}^p a_i X_i$ may be written as $Y=\mathbf{a}^\prime \mathbf{X}$ and $Var(Y)=V(\mathbf{a}^\prime \mathbf{X})=\mathbf{a}^\prime \boldsymbol{\Sigma} \mathbf{a}$, it is natural to require that $\boldsymbol{\Sigma}$ be positive definite. \item All it means is that every non-zero linear combination of $\mathbf{X}$ values has a positive variance. \item And recall $\boldsymbol{\Sigma}$ positive definite is equivalent to $\boldsymbol{\Sigma}^{-1}$ positive definite. 
\end{itemize} \end{frame} \begin{frame} \frametitle{Analogies} \framesubtitle{(Multivariate normal reduces to the univariate normal when $p=1$)} \begin{itemize} \item Univariate Normal \begin{itemize} \item $f(x) = \frac{1}{\sigma \sqrt{2\pi}} \exp \left[-\frac{1}{2}\frac{(x-\mu)^2}{\sigma^2}\right]$ \item $E(X)=\mu, V(X) = \sigma^2$ \item $\frac{(X-\mu)^2}{\sigma^2} \sim \chi^2 (1)$ \end{itemize} \vspace{3mm} \item Multivariate Normal \begin{itemize} \item $f(\mathbf{x}) = \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left[ -\frac{1}{2} (\mathbf{x}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right]$ \item $E(\mathbf{X})= \boldsymbol{\mu}$, $V(\mathbf{X}) = \boldsymbol{\Sigma}$ \item $(\mathbf{X}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)$ \end{itemize} \end{itemize} \end{frame} \begin{frame} \frametitle{More properties of the multivariate normal} % \begin{itemize} \item If $\mathbf{c}$ is a vector of constants, $\mathbf{X}+\mathbf{c} \sim N(\mathbf{c}+\boldsymbol{\mu},\boldsymbol{\Sigma})$ \item If $\mathbf{A}$ is a matrix of constants, $\mathbf{AX} \sim N(\mathbf{A}\boldsymbol{\mu},\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\prime)$ \item Linear combinations of multivariate normals are multivariate normal. \item All the marginals (dimension less than $p$) of $\mathbf{X}$ are (multivariate) normal, but it is possible in theory to have a collection of univariate normals whose joint distribution is not multivariate normal. \item For the multivariate normal, zero covariance implies independence. The multivariate normal is the only continuous distribution with this property. \end{itemize} \end{frame} \begin{frame} \frametitle{An easy example} \framesubtitle{If you do it the easy way} Let $\mathbf{X}= (X_1,X_2,X_3)^\prime$ be multivariate normal with \begin{displaymath} \boldsymbol{\mu} = \left[ \begin{array}{c} 1 \\ 0 \\ 6 \end{array} \right] \mbox{ and } \boldsymbol{\Sigma} = \left[ \begin{array}{c c c} 2 & 1 & 0 \\ 1 & 4 & 0 \\ 0 & 0 & 2 \end{array} \right] . \end{displaymath} Let $Y_1=X_1+X_2$ and $Y_2=X_2+X_3$. Find the joint distribution of $Y_1$ and $Y_2$. 
% Just for fun, check it with sage: %mu = vector(QQ,[1,0,6]).column() # QQ is the rational field %Sigma = matrix(QQ,[[2,1,0],[1,4,0],[0,0,2]]) %A = matrix(QQ,[[1,1,0],[0,1,1]]) %mu2 = A*mu; show(mu2) %Sigma2 = A*Sigma*A.transpose(); show(Sigma2) \end{frame} \begin{frame} \frametitle{In matrix terms} $Y_1=X_1+X_2$ and $Y_2=X_2+X_3$ means $\mathbf{Y} = \mathbf{AX}$ \vspace{10mm} \begin{displaymath} \left[ \begin{array}{c} Y_1 \\ Y_2 \end{array} \right] = \left[ \begin{array}{c c c} 1 & 1 & 0 \\ 0 & 1 & 1 \end{array} \right] \left[ \begin{array}{c} X_1 \\ X_2 \\ X_3 \end{array} \right] \end{displaymath} \vspace{10mm} $\mathbf{Y} = \mathbf{AX} \sim N(\mathbf{A}\boldsymbol{\mu},\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\prime)$ \end{frame} \begin{frame}[fragile] \frametitle{You could do it by hand, but} %\framesubtitle{} \begin{verbatim} > mu = cbind(c(1,0,6)) > Sigma = rbind( c(2,1,0), + c(1,4,0), + c(0,0,2) ) > A = rbind( c(1,1,0), + c(0,1,1) ); A > A %*% mu # E(Y) [,1] [1,] 1 [2,] 6 > A %*% Sigma %*% t(A) # V(Y) [,1] [,2] [1,] 8 5 [2,] 5 6 \end{verbatim} \end{frame} \begin{frame} \frametitle{Multivariate normal likelihood} {\footnotesize \begin{eqnarray*} L(\boldsymbol{\mu,\Sigma}) &=& \prod_{i=1}^n \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left\{ -\frac{1}{2} (\mathbf{x}_i-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{x}_i-\boldsymbol{\mu})\right\} \\ &&\\ &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp\left\{ -\frac{1}{2} \sum_{i=1}^n (\mathbf{x}_i-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{x}_i-\boldsymbol{\mu})\right\} \\ &&\\ &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\}, \end{eqnarray*} } where $\boldsymbol{\widehat{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}}) (\mathbf{x}_i-\overline{\mathbf{x}})^\prime $ is the sample variance-covariance matrix. 
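\end{frame}

\begin{frame}[fragile]
\frametitle{Optional: checking the trace identity numerically}
The identity used in the last line is verified on the next two slides. A quick check
in base R can also be reassuring; the simulated data, seed and sample size below are
arbitrary, and any $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ would do.
{\footnotesize
\begin{verbatim}
set.seed(431)                          # arbitrary simulated data
n <- 50; p <- 3
mu    <- c(1, 0, 6)
Sigma <- rbind( c(2,1,0), c(1,4,0), c(0,0,2) )
X     <- sweep(matrix(rnorm(n*p),n,p) %*% chol(Sigma), 2, mu, "+")
xbar  <- colMeans(X)
Sigmahat <- var(X) * (n-1)/n           # divisor n, as on the slide
direct   <- sum( apply(X, 1, function(x)
                 t(x-mu) %*% solve(Sigma) %*% (x-mu)) )
shortcut <- n * ( sum(diag(Sigmahat %*% solve(Sigma))) +
                 t(xbar-mu) %*% solve(Sigma) %*% (xbar-mu) )
c(direct, shortcut)                    # the two numbers agree
\end{verbatim}
} % End size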
\end{frame}

\begin{frame}
\frametitle{Showing the details}
\framesubtitle{For the multivariate normal likelihood}
Adding and subtracting $\overline{\mathbf{x}}$ in $\sum_{i=1}^n (\mathbf{x}_i-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{x}_i-\boldsymbol{\mu})$ and writing $\mathbf{a}_i = \mathbf{x}_i-\overline{\mathbf{x}}$, $\mathbf{b} = \overline{\mathbf{x}}-\boldsymbol{\mu}$, we get
\begin{eqnarray*}
& & \sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}} + \overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1} (\mathbf{x}_i-\overline{\mathbf{x}} + \overline{\mathbf{x}}-\boldsymbol{\mu}) \\
& = & \sum_{i=1}^n (\mathbf{a}_i+\mathbf{b})^\prime \boldsymbol{\Sigma}^{-1} (\mathbf{a}_i+\mathbf{b}) \\
& = & \sum_{i=1}^n \left( \mathbf{a}_i^\prime \boldsymbol{\Sigma}^{-1} \mathbf{a}_i + \mathbf{a}_i^\prime \boldsymbol{\Sigma}^{-1} \mathbf{b} + \mathbf{b}^\prime \boldsymbol{\Sigma}^{-1} \mathbf{a}_i + \mathbf{b}^\prime \boldsymbol{\Sigma}^{-1} \mathbf{b} \right) \\
& = & \left(\sum_{i=1}^n \mathbf{a}_i^\prime \boldsymbol{\Sigma}^{-1} \mathbf{a}_i \right) + 0 + 0 + n \, \mathbf{b}^\prime \boldsymbol{\Sigma}^{-1} \mathbf{b} \\
& = & {\color{red} \sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}})^\prime \boldsymbol{\Sigma}^{-1} (\mathbf{x}_i-\overline{\mathbf{x}}) } ~+~ n \, (\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \\
\end{eqnarray*}
The cross-product terms vanish because $\sum_{i=1}^n \mathbf{a}_i = \mathbf{0}$.
\end{frame}

\begin{frame}
\frametitle{Continuing the calculation}
{\small
\begin{eqnarray*}
{\color{red} \sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}})^\prime \boldsymbol{\Sigma}^{-1} (\mathbf{x}_i-\overline{\mathbf{x}}) }%End color
& = & \sum_{i=1}^n tr\left\{(\mathbf{x}_i-\overline{\mathbf{x}})^\prime {\color{blue} \boldsymbol{\Sigma}^{-1} (\mathbf{x}_i-\overline{\mathbf{x}})}%End color
\right\} \\
& = & \sum_{i=1}^n tr\left\{ {\color{blue} \boldsymbol{\Sigma}^{-1} (\mathbf{x}_i-\overline{\mathbf{x}})}%End color
(\mathbf{x}_i-\overline{\mathbf{x}})^\prime \right\} \\
& = & tr \left\{\sum_{i=1}^n \boldsymbol{\Sigma}^{-1} (\mathbf{x}_i-\overline{\mathbf{x}}) (\mathbf{x}_i-\overline{\mathbf{x}})^\prime \right\} \\
& = & tr \left\{ \boldsymbol{\Sigma}^{-1}\sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}}) (\mathbf{x}_i-\overline{\mathbf{x}})^\prime \right\} \\
& = & n \, tr \left\{ \boldsymbol{\Sigma}^{-1} \frac{1}{n}\sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}}) (\mathbf{x}_i-\overline{\mathbf{x}})^\prime \right\} \\
& = & n \, tr \left( \boldsymbol{\Sigma}^{-1} \boldsymbol{\widehat{\Sigma}} \right) \\
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Substituting \ldots}
{\small
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp\left\{ -\frac{1}{2} \sum_{i=1}^n (\mathbf{x}_i-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{x}_i-\boldsymbol{\mu})\right\} \\
&&\\
&=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\},
\end{eqnarray*}
\vspace{10mm}
where $\boldsymbol{\widehat{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}}) (\mathbf{x}_i-\overline{\mathbf{x}})^\prime $ is the sample variance-covariance matrix.
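} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Optional: evaluating the likelihood in R}
The next slide argues that for any fixed positive definite $\boldsymbol{\Sigma}$,
the likelihood is largest at $\boldsymbol{\mu} = \overline{\mathbf{x}}$. Here is a
small numerical illustration (not a proof); the simulated data are arbitrary, as before.
{\footnotesize
\begin{verbatim}
set.seed(431)                              # arbitrary data, as before
Sigma <- rbind( c(2,1,0), c(1,4,0), c(0,0,2) ); n <- 50
X <- sweep(matrix(rnorm(n*3),n,3) %*% chol(Sigma), 2, c(1,0,6), "+")
loglike <- function(mu, Sigma, X) {        # log of L(mu, Sigma)
  qf <- apply(X, 1, function(x) t(x-mu) %*% solve(Sigma) %*% (x-mu))
  -nrow(X)/2*log(det(Sigma)) - nrow(X)*ncol(X)/2*log(2*pi) - sum(qf)/2
}
loglike(colMeans(X), Sigma, X)             # at mu = sample mean vector
loglike(colMeans(X)+c(0.2,0,0), Sigma, X)  # any other mu gives less
\end{verbatim}
} % End size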
\end{frame}

\begin{frame}
\frametitle{Maximizing the likelihood over $\boldsymbol{\mu}$ for any positive definite $\boldsymbol{\Sigma}$ without calculus}
{\small
\begin{displaymath}
L(\boldsymbol{\mu,\Sigma}) = |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\}
\end{displaymath}
} % End size
\begin{itemize}
\item Take the log and maximize $-(\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu})$, the only part that involves $\boldsymbol{\mu}$.
\item That is, minimize $(\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu})$.
\item Because $\boldsymbol{\Sigma}$ is positive definite, so is $\boldsymbol{\Sigma}^{-1}$. % Assign as homework.
\item Thus $(\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) > 0$ for $\overline{\mathbf{x}}-\boldsymbol{\mu} \neq \mathbf{0}$,
\item And equal to zero only when $\boldsymbol{\mu}=\overline{\mathbf{x}}$.
\item So that's where the likelihood has its maximum, for each $\boldsymbol{\Sigma}$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Showing $(\mathbf{X}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)$}
\begin{eqnarray*}
\mathbf{Y} = \mathbf{X}-\boldsymbol{\mu} & \sim & N\left(\mathbf{0},\ \boldsymbol{\Sigma}\right) \\
\mathbf{Z} = \boldsymbol{\Sigma}^{-\frac{1}{2}} \mathbf{Y} & \sim & N\left(\mathbf{0}, \boldsymbol{\Sigma}^{-\frac{1}{2}} \boldsymbol{\Sigma} \boldsymbol{\Sigma}^{-\frac{1}{2}} \right) \\
& = & N\left(\mathbf{0}, \boldsymbol{\Sigma}^{-\frac{1}{2}} \boldsymbol{\Sigma}^{\frac{1}{2}} ~ \boldsymbol{\Sigma}^{\frac{1}{2}} \boldsymbol{\Sigma}^{-\frac{1}{2}} \right) \\
& = & N\left(\mathbf{0}, \mathbf{I}\right)
\end{eqnarray*}
So $\mathbf{Z}$ is a vector of $p$ independent standard normals, and
\begin{displaymath}
\mathbf{Y}^\prime \boldsymbol{\Sigma}^{-1} \mathbf{Y} = \mathbf{Z}^\prime \mathbf{Z} = \sum_{j=1}^p Z_j^2 \sim \chi^2(p) ~~~~~~~~~~ \blacksquare
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{$\overline{X}$ and $S^2$ are independent}
\begin{displaymath}
\begin{array}{lcl}
\mathbf{X} = \left( \begin{array}{c} X_1 \\ \vdots \\ X_n \end{array} \right) \sim N\left(\mu\mathbf{1},\sigma^2\mathbf{I} \right)
&~~~~&
\mathbf{Y} = \left( \begin{array}{c} X_1-\overline{X} \\ \vdots \\ X_{n-1}-\overline{X} \\\\ \overline{X} \end{array} \right) = \mathbf{AX}
\end{array}
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{$\mathbf{Y} = \mathbf{AX}$}
\framesubtitle{In more detail}
\begin{displaymath}
% \mathbf{AX} =
\left( \begin{array}{rrcrr}
1-\frac{1}{n} & -\frac{1}{n} & \cdots & -\frac{1}{n} & -\frac{1}{n} \\
& & & & \\
-\frac{1}{n} & 1-\frac{1}{n} & \cdots & -\frac{1}{n} & -\frac{1}{n} \\
\vdots\, & \vdots\, & \vdots\, & \vdots\, & \vdots\, \\
-\frac{1}{n} & -\frac{1}{n} & \cdots &1-\frac{1}{n} & -\frac{1}{n} \\
& & & & \\
\frac{1}{n} & \frac{1}{n} & \cdots & \frac{1}{n} & \frac{1}{n} \\
\end{array} \right)
\left( \begin{array}{c} X_1 \\ \\ X_2 \\ \vdots \\ X_{n-1} \\ \\ X_n \end{array} \right)
=
\left( \begin{array}{c} X_1-\overline{X} \\ \\ X_2-\overline{X} \\ \vdots \\ X_{n-1}-\overline{X} \\\\ \overline{X} \end{array} \right)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{The argument}
\begin{displaymath}
\mathbf{Y} = \mathbf{AX} = \left( \begin{array}{c}
X_1-\overline{X} \\ \vdots \\ X_{n-1}-\overline{X} \\\\ \overline{X}
\end{array} \right)
=
\left( \begin{array}{c} \\\\ \mathbf{Y}_2 \\\\ \hline \\ \overline{X} \end{array} \right)
\end{displaymath}
\begin{itemize}
\item $\mathbf{Y}$ is multivariate normal.
\item $Cov\left(\overline{X},(X_j-\overline{X})\right)=0$ (Exercise)
\item So $\overline{X}$ and $\mathbf{Y}_2$ are independent.
\item So $\overline{X}$ and $S^2 = g(\mathbf{Y}_2)$ are independent. ~~$\blacksquare$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Leads to the $t$ distribution}
%\framesubtitle{}
If
\begin{itemize}
\item $Z \sim N(0,1)$ and
\item $Y \sim \chi^2(\nu)$ and
\item $Z$ and $Y$ are independent, then
\end{itemize}
\begin{displaymath}
T = \frac{Z}{\sqrt{Y/\nu}} \sim t(\nu)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Random sample from a normal distribution}
Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} N(\mu,\sigma^2)$. Then
\begin{itemize}
\item $\frac{\sqrt{n}(\overline{X}-\mu)}{\sigma} \sim N(0,1)$ and
\item $\frac{(n-1)S^2}{\sigma^2} \sim \chi^2(n-1)$ and
\item These quantities are independent, so
\begin{eqnarray*}
T & = & \frac{\sqrt{n}(\overline{X}-\mu)/\sigma} {\sqrt{\frac{(n-1)S^2}{\sigma^2}/(n-1)}} \\
&&\\
& = & \frac{\sqrt{n}(\overline{X}-\mu)}{S} \sim t(n-1)
\end{eqnarray*}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}.
Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s13}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s13}}
\end{frame}

\end{document}