% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
% \usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Background\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2015}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Matrices}

\begin{frame}
\frametitle{Matrices}
\begin{itemize}
\item $\mathbf{A} = [a_{ij}]$ \pause
\item Transpose: $\mathbf{A}^\top = [a_{ji}]$ \pause
\item Multiplication: $\mathbf{AB} \neq \mathbf{BA}$ in general \pause
\item $(\mathbf{AB})^\top = \mathbf{B}^\top\mathbf{A}^\top$ \pause
\item Inverse of a \emph{square} matrix: $\mathbf{A}^{-1}\mathbf{A} = \mathbf{AA}^{-1} = \mathbf{I}$ \pause
\item $(\mathbf{A}^{-1})^\top = (\mathbf{A}^\top)^{-1}$ \pause
\item Positive definite: for a $p \times p$ matrix $\mathbf{A}$, $\mathbf{v}^\top \mathbf{A} \mathbf{v} > 0$ for all $p \times 1$ vectors $\mathbf{v} \neq \mathbf{0}$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Trace of a square matrix: Sum of the diagonal elements}
\begin{displaymath}
tr(\mathbf{A}) = \sum_{i=1}^n a_{i,i}
\end{displaymath} \pause
\vspace{10mm}
\begin{itemize}
\item Of course $tr(\mathbf{A}+\mathbf{B}) = tr(\mathbf{A}) + tr(\mathbf{B})$, \pause
\item $tr(\mathbf{A}) = tr(\mathbf{A}^\top)$, etc. \pause
\item But less obviously, even though $\mathbf{AB} \neq \mathbf{BA}$, \pause
\item $tr(\mathbf{AB}) = tr(\mathbf{BA})$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{$tr(\mathbf{AB}) = tr(\mathbf{BA})$}
Let $\mathbf{A}$ be an $r \times p$ matrix and $\mathbf{B}$ be a $p \times r$ matrix, so that the product matrices $\mathbf{AB}$ and $\mathbf{BA}$ are both defined. \pause
\begin{eqnarray*}
tr(\mathbf{AB}) &=& \sum_{i=1}^r \left(\sum_{k=1}^p a_{i,k}b_{k,i} \right) \\ \pause
&=& \sum_{k=1}^p \left(\sum_{i=1}^r b_{k,i}a_{i,k} \right) \\ \pause
&=& tr(\mathbf{BA})
\end{eqnarray*}
\end{frame}

\section{Random Vectors}

\begin{frame}
\frametitle{Random vectors}
\framesubtitle{Expected values and variance-covariance matrices}
\begin{itemize}
\item $E(\mathbf{X}) = [E(X_{i,j})]$ \pause
\item $E(\mathbf{X}+\mathbf{Y}) = E(\mathbf{X})+E(\mathbf{Y})$ \pause
\item $E(\mathbf{AXB}) = \mathbf{A}E(\mathbf{X})\mathbf{B}$ \pause
\item $V(\mathbf{X}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}) (\mathbf{X}-\boldsymbol{\mu})^\top\right\}$ \pause
\item $V(\mathbf{AX}) = \mathbf{A}V(\mathbf{X})\mathbf{A}^\top$ \pause
\item $C(\mathbf{X,Y}) = E\left\{ (\mathbf{X}-\boldsymbol{\mu}_x) (\mathbf{Y}-\boldsymbol{\mu}_y)^\top\right\}$ \pause
\item $V(\mathbf{X+a}) = V(\mathbf{X})$ \pause
\item $C(\mathbf{X+a,Y+b}) = C(\mathbf{X,Y})$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The Centering Rule}
\framesubtitle{Based on $V(\mathbf{X} + \mathbf{a}) = V(\mathbf{X})$}

Often, variance and covariance calculations can be simplified by subtracting off constants first.
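\pause
For example, $V(\mathbf{AX} + \mathbf{b}) = V(\mathbf{AX}) = \mathbf{A}V(\mathbf{X})\mathbf{A}^\top$: the added constant $\mathbf{b}$ contributes nothing.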
\pause
Denote the \emph{centered} version of $\mathbf{X}$ by $\stackrel{c}{\mathbf{X}} = \mathbf{X} - E(\mathbf{X})$, \pause so that
\begin{itemize}
\item $E(\stackrel{c}{\mathbf{X}})=\mathbf{0}$ \pause and
\item $V(\stackrel{c}{\mathbf{X}}) = E(\stackrel{c}{\mathbf{X}}\stackrel{c}{\mathbf{X}} \stackrel{\top}{\vphantom{r}}) = V(\mathbf{X})$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Linear combinations}
\framesubtitle{These are matrices, but they could be scalars}
\begin{eqnarray*}
\mathbf{L}_{~} & = & \mathbf{A}_1\mathbf{X}_1 + \cdots + \mathbf{A}_m\mathbf{X}_m + \mathbf{b} \\ \pause
\stackrel{c}{\mathbf{L}}_{~} & = & \mathbf{A}_1 \stackrel{c}{\mathbf{X}}_1 + \cdots + \mathbf{A}_m \stackrel{c}{\mathbf{X}}_m, \mbox{ where} \\ \pause
\stackrel{c}{\mathbf{X}}_j & = & \mathbf{X}_j - E(\mathbf{X}_j) \mbox{ for } j=1,\ldots,m.
\end{eqnarray*} \pause
% \vspace{5mm}
The centering rule says \pause
\begin{eqnarray*}
V(\mathbf{L}) & = & E(\stackrel{c}{\mathbf{L}}\stackrel{c}{\mathbf{L}} \stackrel{\top}{\vphantom{r}}) \\ \pause
C(\mathbf{L}_1,\mathbf{L}_2) & = & E(\stackrel{c}{\mathbf{L}}_1\,\stackrel{c}{\mathbf{L}} \stackrel{\top}{\vphantom{r}_2}) \pause
\end{eqnarray*}
\vspace{5mm}
In words: To calculate variances and covariances of linear combinations, one may simply discard added constants, center all the random vectors, and take expected values of products.
\end{frame}

\begin{frame}
\frametitle{Example: $V(\mathbf{X}+\mathbf{Y})$}
\framesubtitle{Using the centering rule}
\begin{eqnarray*}
V(\mathbf{X}+\mathbf{Y}) & = & E\left\{(\stackrel{c}{\mathbf{X}}+\stackrel{c}{\mathbf{Y}}) (\stackrel{c}{\mathbf{X}}+\stackrel{c}{\mathbf{Y}})^\top\right\} \\ \pause
& = & E\left\{(\stackrel{c}{\mathbf{X}}+\stackrel{c}{\mathbf{Y}}) (\stackrel{c}{\mathbf{X}}\stackrel{\top}{\vphantom{~}} + \stackrel{c}{\mathbf{Y}}\stackrel{\top}{\vphantom{~}})\right\} \\ \pause
& = & E(\stackrel{c}{\mathbf{X}}\stackrel{c}{\mathbf{X}} \stackrel{\top}{\vphantom{~}}) + E(\stackrel{c}{\mathbf{Y}}\stackrel{c}{\mathbf{Y}} \stackrel{\top}{\vphantom{~}}) + E(\stackrel{c}{\mathbf{X}}\stackrel{c}{\mathbf{Y}} \stackrel{\top}{\vphantom{~}}) + E(\stackrel{c}{\mathbf{Y}}\stackrel{c}{\mathbf{X}} \stackrel{\top}{\vphantom{~}}) \\ \pause
& = & V(\mathbf{X}) + V(\mathbf{Y}) + C(\mathbf{X},\mathbf{Y}) + C(\mathbf{Y},\mathbf{X})
\end{eqnarray*} \pause
\begin{itemize}
\item Does $C(\mathbf{X},\mathbf{Y}) = C(\mathbf{Y},\mathbf{X})$? \pause
\item Does $C(\mathbf{X},\mathbf{Y}) = C(\mathbf{Y},\mathbf{X})^\top$?
\end{itemize}
\end{frame}

\section{Multivariate Normal}

\begin{frame}
\frametitle{The Multivariate Normal Distribution}
The $p \times 1$ random vector $\mathbf{X}$ is said to have a \emph{multivariate normal distribution}, and we write $\mathbf{X} \sim N(\boldsymbol{\mu},\boldsymbol{\Sigma})$, if $\mathbf{X}$ has (joint) density
\begin{displaymath}
f(\mathbf{x}) = \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}}
\exp\left[ -\frac{1}{2} (\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right],
\end{displaymath}
where $\boldsymbol{\mu}$ is $p \times 1$ and $\boldsymbol{\Sigma}$ is $p \times p$ symmetric and positive definite.
\end{frame}

\begin{frame}
\frametitle{$\boldsymbol{\Sigma}$ positive definite}
\begin{itemize}
\item Positive definite means that for any non-zero $p \times 1$ vector $\mathbf{a}$, we have $\mathbf{a}^\top \boldsymbol{\Sigma} \mathbf{a} > 0$.
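\pause
\item For example, if $p=2$ and $\boldsymbol{\Sigma} = \left( \begin{smallmatrix} 2 & 1 \\ 1 & 2 \end{smallmatrix} \right)$, then $\mathbf{a}^\top \boldsymbol{\Sigma} \mathbf{a} = 2a_1^2 + 2a_1a_2 + 2a_2^2 = a_1^2 + a_2^2 + (a_1+a_2)^2 > 0$ whenever $\mathbf{a} \neq \mathbf{0}$.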
\pause
\item Since the one-dimensional random variable $Y=\sum_{i=1}^p a_i X_i$ may be written as $Y=\mathbf{a}^\top \mathbf{X}$ and $Var(Y) = V(\mathbf{a}^\top \mathbf{X}) = \mathbf{a}^\top \boldsymbol{\Sigma} \mathbf{a}$, it is natural to require that $\boldsymbol{\Sigma}$ be positive definite. \pause
\item All it means is that every linear combination $\mathbf{a}^\top \mathbf{X}$ with $\mathbf{a} \neq \mathbf{0}$ has a positive variance. \pause
\item And recall $\boldsymbol{\Sigma}$ positive definite is equivalent to $\boldsymbol{\Sigma}^{-1}$ positive definite.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Analogies}
% \framesubtitle{()}
The multivariate normal reduces to the univariate normal when $p=1$.
\vspace{4mm}
\begin{itemize} \pause
\item Univariate Normal
\begin{itemize}
\item $f(x) = \frac{1}{\sigma \sqrt{2\pi}} \exp \left[-\frac{1}{2}\frac{(x-\mu)^2}{\sigma^2}\right]$ \pause
\item $E(X)=\mu$, $Var(X) = \sigma^2$ \pause
\item $\frac{(X-\mu)^2}{\sigma^2} \sim \chi^2 (1)$ \pause
\end{itemize}
\vspace{3mm}
\item Multivariate Normal
\begin{itemize}
\item $f(\mathbf{x}) = \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}} \exp\left[ -\frac{1}{2} (\mathbf{x}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu})\right]$ \pause
\item $E(\mathbf{X})= \boldsymbol{\mu}$, $V(\mathbf{X}) = \boldsymbol{\Sigma}$ \pause
\item $(\mathbf{X}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)$
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{More properties of the multivariate normal}
%
\begin{itemize}
\item If $\mathbf{c}$ is a vector of constants, $\mathbf{X}+\mathbf{c} \sim N(\mathbf{c}+\boldsymbol{\mu},\boldsymbol{\Sigma})$ \pause
\item If $\mathbf{A}$ is a matrix of constants, $\mathbf{AX} \sim N(\mathbf{A}\boldsymbol{\mu},\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top)$ \pause
\item Linear combinations of multivariate normals are multivariate normal. \pause
\item All the marginals (dimension less than $p$) of $\mathbf{X}$ are (multivariate) normal. \pause
\item For the multivariate normal, zero covariance implies independence. This property is special to the normal: in general, zero covariance does not imply independence.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. Except for the picture taken from Carroll et al.'s \emph{Measurement error in non-linear models}, it is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\vspace{3mm}

\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s15}{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s15}}

\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Omitted ...
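%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Spare frame, not used in the deck above: a small worked instance of the linear
% transformation property $\mathbf{AX} \sim N(\mathbf{A}\boldsymbol{\mu},\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top)$.
% The notation $\sigma_{i,j}$ for the elements of $\boldsymbol{\Sigma}$ is introduced just for this illustration.
\begin{frame}
\frametitle{Example: Sum of two jointly normal variables}
Let $p=2$, $\mathbf{X} \sim N(\boldsymbol{\mu},\boldsymbol{\Sigma})$ and $\mathbf{A} = (1 \;\; 1)$, so that $\mathbf{AX} = X_1+X_2$. Writing $\sigma_{i,j}$ for the $(i,j)$ element of $\boldsymbol{\Sigma}$, \pause
\begin{eqnarray*}
\mathbf{A}\boldsymbol{\mu} & = & \mu_1 + \mu_2 \\ \pause
\mathbf{A}\boldsymbol{\Sigma}\mathbf{A}^\top & = & \sigma_{1,1} + 2\sigma_{1,2} + \sigma_{2,2},
\end{eqnarray*} \pause
so $X_1+X_2 \sim N(\mu_1+\mu_2, \; \sigma_{1,1} + 2\sigma_{1,2} + \sigma_{2,2})$.
\end{frame}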
\begin{frame}
\frametitle{Systems of equations correspond to path diagrams}
% \framesubtitle{C}
\begin{eqnarray*}
Y_{i,1} & = & \beta_{0,1} + \beta_1 X_i + \epsilon_{i,1} \\
Y_{i,2} & = & \beta_{0,2} + \beta_2 Y_{i,1} + \epsilon_{i,2} \\
Y_{i,3} & = & \beta_{0,3} + \beta_3 X_i + \beta_4 Y_{i,2} + \epsilon_{i,3} \\
Y_{i,4} & = & \beta_{0,4} + \beta_5 Y_{i,2} + \beta_6 Y_{i,3} + \epsilon_{i,4} \\
D_{i,1} & = & \lambda_{0,1} + \lambda_1 Y_{i,1} + e_{i,1} \\
D_{i,2} & = & \lambda_{0,2} + \lambda_2 X_i + e_{i,2} \\
D_{i,3} & = & \lambda_{0,3} + \lambda_3 Y_{i,2} + e_{i,3} \\
D_{i,4} & = & \lambda_{0,4} + \lambda_4 Y_{i,3} + e_{i,4} \\
D_{i,5} & = & \lambda_{0,5} + \lambda_2 X_i + e_{i,5} \\
D_{i,6} & = & \lambda_{0,6} + \lambda_5 Y_{i,4} + e_{i,6}
\end{eqnarray*}
\end{frame}
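%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Spare frame, also omitted: substituting the first equation of the system above into the
% second, just to illustrate how the equations chain together.
\begin{frame}
\frametitle{Substituting one equation into another}
\begin{eqnarray*}
Y_{i,2} & = & \beta_{0,2} + \beta_2 Y_{i,1} + \epsilon_{i,2} \\
        & = & \beta_{0,2} + \beta_2 \left( \beta_{0,1} + \beta_1 X_i + \epsilon_{i,1} \right) + \epsilon_{i,2} \\
        & = & \left( \beta_{0,2} + \beta_2 \beta_{0,1} \right) + \beta_2 \beta_1 X_i + \beta_2 \epsilon_{i,1} + \epsilon_{i,2},
\end{eqnarray*}
so $Y_{i,2}$ is again a linear function of $X_i$ plus a linear combination of error terms.
\end{frame}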