% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode{\setbeamercolor{background canvas}{bg=black!5}}
\title{Large-sample Likelihood Ratio Tests\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2017}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\section{Introduction}

\begin{frame}
\frametitle{Model and null hypothesis}
% \framesubtitle{}
\begin{displaymath}
\begin{array}{l}
D_1, \ldots, D_n \stackrel{i.i.d.}{\sim} P_\theta, \, \theta \in \Theta, \\
\pause
H_0: \theta \in \Theta_0 \mbox{ vs. } H_A: \theta \in \Theta \cap \Theta_0^c, \\
\end{array}
\end{displaymath}
\pause
The data have likelihood function
\pause
\begin{displaymath}
L(\theta) = \prod_{i=1}^n f(d_i;\theta),
\end{displaymath}
\pause
where $f(d_i;\theta)$ is the density or probability mass function evaluated at $d_i$.
\end{frame}

\begin{frame}
\frametitle{Example}
\framesubtitle{$\begin{array}{l}
D_1, \ldots, D_n \stackrel{i.i.d.}{\sim} P_\theta, \, \theta \in \Theta, \\
H_0: \theta \in \Theta_0 \mbox{ vs. } H_A: \theta \in \Theta \cap \Theta_0^c, \\
\end{array}$ }
\pause
\begin{displaymath}
\begin{array}{l}
D_1, \ldots, D_n \stackrel{i.i.d.}{\sim} N(\mu,\sigma^2) \\
\pause
H_0: \mu=\mu_0 \mbox{ vs. } H_A: \mu \neq \mu_0 \\
\pause
\Theta_0 = \{(\mu,\sigma^2): \mu=\mu_0, \sigma^2 > 0 \}
\end{array}
\end{displaymath}
\pause
\begin{center}
\includegraphics[width=1.8in]{ParameterSpace}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Likelihood ratio}
\pause
%\framesubtitle{}
\begin{itemize}
\item Let $\widehat{\theta}$ denote the usual Maximum Likelihood Estimate (MLE). \pause
\item That is, $\widehat{\theta}$ is the parameter value for which the likelihood function is greatest, over all $\theta \in \Theta$. \pause
\item Let $\widehat{\theta}_0$ denote the \emph{restricted} MLE. \pause The restricted MLE is the parameter value for which the likelihood function is greatest, over all $\theta \in \Theta_0$. \pause
\item $\widehat{\theta}_0$ is \emph{restricted} by the null hypothesis $H_0: \theta \in \Theta_0$. \pause
\item $L(\widehat{\theta}_0) \leq L(\widehat{\theta})$, so that \pause
\item The \emph{likelihood ratio} $\lambda = \frac{L(\widehat{\theta}_0)}{L(\widehat{\theta})} \leq 1.$ \pause
\item The likelihood ratio will equal one if and only if the overall MLE $\widehat{\theta}$ is located in $\Theta_0$. In this case, there is no reason to reject the null hypothesis.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The test statistic}
\framesubtitle{It's like comparing a full to a reduced model}
\begin{itemize}
\item We know $\lambda = \frac{L(\widehat{\theta}_0)}{L(\widehat{\theta})} \leq 1.$ \pause
\item If it's a \emph{lot} less than one, then the data are a lot less likely to have been observed under the null hypothesis than under the alternative hypothesis, and the null hypothesis is questionable. \pause
\item If $\lambda$ is small (close to zero), then $\ln(\lambda)$ is a large negative number, and $-2\ln\lambda$ is a large positive number.
\pause
\end{itemize}
\begin{displaymath}
G^2 = -2 \ln \left( \frac{\max_{\theta \in \Theta_0} L(\theta)}
                         {\max_{\theta \in \Theta} L(\theta) } \right)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Difference between two $-2$ log likelihoods}
\pause
%\framesubtitle{}
\begin{eqnarray*}
G^2 & = & -2 \ln \left( \frac{\max_{\theta \in \Theta_0} L(\theta)}
                             {\max_{\theta \in \Theta} L(\theta) } \right) \nonumber \\ \pause
& = & -2 \ln \left( \frac{ L(\widehat{\theta}_0) } {L(\widehat{\theta}) } \right) \nonumber \\ \pause
& = & -2 \ln L(\widehat{\theta}_0) - [-2 \ln L(\widehat{\theta})] \nonumber \\ \pause
& = & -2\ell(\widehat{\theta}_0) - [-2\ell(\widehat{\theta})] . \pause
\end{eqnarray*}
\begin{itemize}
\item Could minimize $-2\ell(\theta)$ twice, first over all $\theta \in \Theta$, and then over all $\theta \in \Theta_0$. \pause
\item The test statistic is the difference between the two minimum values.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Distribution of the test statistic under $H_0$}
\framesubtitle{Approximate large sample distribution (Wilks, 1938)}
\pause
Suppose the null hypothesis is that certain \emph{linear combinations} of parameter values are equal to specified constants. \pause Then if $H_0$ is true,
\pause
\begin{displaymath}
G^2 = -2 \ln \left( \frac{L(\widehat{\theta}_0)} { L(\widehat{\theta}) } \right)
\end{displaymath}
has an approximate chi-squared distribution for large $n$.
\pause
\begin{itemize}
\item Degrees of freedom equals the number of (non-redundant, linearly independent) equalities specified by $H_0$. \pause
\item So count the equals signs. \pause
\item Reject when $G^2$ is large.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example}
%\framesubtitle{}
Suppose $\boldsymbol{\theta} = (\theta_1, \ldots, \theta_7)$, \pause with
\begin{displaymath}
H_0: ~\theta_1=\theta_2, \theta_6=\theta_7,
\frac{1}{3}\left(\theta_1+\theta_2+\theta_3\right) =
\frac{1}{3}\left(\theta_4+\theta_5+\theta_6\right)
\end{displaymath}
\pause
Count the equals signs or write the null hypothesis in matrix form as $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$.
\pause
\begin{displaymath}
\left( \begin{array}{r r r r r r r}
1 & -1 & ~0 & ~0 & ~0 & ~0 & ~0 \\
0 & 0 & 0 & 0 & 0 & 1 & -1 \\
1 & 1 & 1 & -1 & -1 & -1 & 0 \\
\end{array} \right)
\left( \begin{array}{r}
\theta_1 \\ \theta_2 \\ \theta_3 \\ \theta_4 \\ \theta_5 \\ \theta_6 \\ \theta_7
\end{array} \right) =
\left( \begin{array}{r} 0 \\ 0 \\ 0 \end{array} \right)
\end{displaymath}
\pause
Rows are linearly independent, so $df =$ number of rows $= 3$.
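\end{frame}

\begin{frame}
\frametitle{Checking the degrees of freedom with R}
\framesubtitle{A small illustration, not part of the test itself}
\pause
The degrees of freedom is the number of linearly independent (non-redundant)
rows of $\mathbf{L}$, that is, the rank of $\mathbf{L}$. A quick check in base R,
with \texttt{L} the matrix on the previous slide:
\pause
\vspace{3mm}

{\footnotesize \noindent
\texttt{L = rbind( c(1,-1, 0, 0, 0, 0, 0),} \\
\texttt{~~~~~~~~~~~c(0, 0, 0, 0, 0, 1,-1),} \\
\texttt{~~~~~~~~~~~c(1, 1, 1,-1,-1,-1, 0) )} \\
\texttt{qr(L)\$rank~~\# = 3, the number of non-redundant equalities}
} % End size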
\end{frame} \begin{frame} \frametitle{Bernoulli example} \pause %\framesubtitle{} \begin{itemize} \item $Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim} B(1,\theta)$ \pause \item $H_0:\theta=\theta_0$ \pause \item $\Theta=(0,1)$ \pause \item $\Theta_0 = \{\theta_0\}$ \pause \item $L(\theta) = \theta^{\sum_{i=1}^n y_i} (1-\theta)^{n-\sum_{i=1}^n y_i}$ \pause \item $\widehat{\theta} = \overline{y}$ \pause \item $\widehat{\theta}_0 = \theta_0$ \end{itemize} \end{frame} \begin{frame} \frametitle{Likelihood ratio test statistic} \framesubtitle{$L(\theta) = \theta^{\sum_{i=1}^n y_i} (1-\theta)^{n-\sum_{i=1}^n y_i}$} \begin{eqnarray*} G^2 & = & -2\ln\frac{L(\widehat{\theta}_0)} {L(\widehat{\theta})} \\ \pause & = & -2\ln\frac{\theta_0^{n\overline{y}} (1-\theta_0)^{n(1-\overline{y})}} {\overline{y}^{n\overline{y}} (1-\overline{y})^{n(1-\overline{y})}} \\ \pause & = & -2\ln\left(\frac{\theta_0^{\overline{y}} (1-\theta_0)^{(1-\overline{y})}} {\overline{y}^{\overline{y}} (1-\overline{y})^{(1-\overline{y})}} \right)^n \\ \pause & = & 2n\ln\left(\frac{\theta_0^{\overline{y}} (1-\theta_0)^{(1-\overline{y})}} {\overline{y}^{\overline{y}} (1-\overline{y})^{(1-\overline{y})}} \right)^{-1} \\ \pause & = & 2n\ln\frac{\overline{y}^{\overline{y}} (1-\overline{y})^{(1-\overline{y})}} {\theta_0^{\overline{y}} (1-\theta_0)^{(1-\overline{y})}} \\ \end{eqnarray*} \end{frame} \begin{frame} \frametitle{Continued} \begin{eqnarray*} G^2 & = & 2n\ln\frac{\overline{y}^{\overline{y}} (1-\overline{y})^{(1-\overline{y})}} {\theta_0^{\overline{y}} (1-\theta_0)^{(1-\overline{y})}} \\ \pause & = & 2n\left( \ln \left(\frac{\overline{y}}{\theta_0}\right)^{\overline{y}} + \ln \left(\frac{1-\overline{y}}{1-\theta_0}\right)^{(1-\overline{y})}\right) \\ \pause & = & 2n\left( \overline{y}\ln \left(\frac{\overline{y}}{\theta_0}\right) + (1-\overline{y}) \ln \left(\frac{1-\overline{y}}{1-\theta_0}\right)\right) \\ \end{eqnarray*} \end{frame} \begin{frame} \frametitle{Coffee taste test} \framesubtitle{$n=100,~\theta_0=0.50,~\overline{y}=0.60$} \pause \begin{eqnarray*} G^2 & = & 2n\left( \overline{y}\ln \left(\frac{\overline{y}}{\theta_0}\right) + (1-\overline{y}) \ln \left(\frac{1-\overline{y}}{1-\theta_0}\right)\right) \\ \pause & = & 200\left( 0.60\ln \left(\frac{0.60}{0.50}\right) + 0.40 \ln \left(\frac{0.40}{0.50}\right)\right) \\ & = & 4.027 \end{eqnarray*} \pause $df=1$, critical value $1.96^2=3.84$. Conclude (barely) that the new coffee blend is preferred over the old. \end{frame} \begin{frame} \frametitle{Univariate normal example} %\framesubtitle{} \begin{itemize} \item $Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim} N(\mu,\sigma^2)$ \pause \item $H_0:\mu=\mu_0$ \pause \item $\Theta=\{(\mu,\sigma^2): \mu \in \mathbb{R}, \sigma^2>0\}$ \pause \item $\Theta_0 = \{(\mu,\sigma^2): \mu=\mu_0, \sigma^2>0\}$ \pause \item $L(\theta) = (\sigma^2)^{-n/2} (2\pi)^{-n/2} \exp\{-\frac{1}{2\sigma^2}\sum_{i=1}^n(y_i-\mu)^2\}$ \pause % \item $\ell(\theta) = -\frac{n}{2}\ln\sigma^2 - \frac{n}{2}\ln (2\pi) - \frac{1}{2\sigma^2}\sum_{i=1}^n(x_i-\mu)^2$ \item $\widehat{\theta} = \left(\overline{Y}, \widehat{\sigma}^2 \right)$, \pause where \begin{displaymath} \widehat{\sigma}^2 = \frac{1}{n}\sum_{i=1}^n(Y_i-\overline{Y})^2 \end{displaymath} \pause \item $\widehat{\theta}_0 = (\widehat{\mu}_0,\widehat{\sigma}_0^2) = \ldots$ \end{itemize} \end{frame} \begin{frame} \frametitle{Restricted MLE} \framesubtitle{For $H_0: \mu=\mu_0$} Definitely have $ \widehat{\mu}_0 = \mu_0$. 
\pause
\vspace{3mm}
Recall that setting the derivatives to zero yielded
\pause
\begin{displaymath}
\mu = \overline{y} \mbox{ and } \sigma^2 = \frac{1}{n}\sum_{i=1}^n(y_i-\mu)^2, \mbox{ so}
\end{displaymath}
\pause
{\LARGE
\begin{eqnarray*}
\widehat{\mu}_0 & = & \mu_0 \\ \pause
\widehat{\sigma}_0^2 & = & \frac{1}{n}\sum_{i=1}^n(Y_i-\mu_0)^2 \\
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Likelihood ratio test statistic $G^2 = -2\ln\frac{L(\widehat{\theta}_0)} {L(\widehat{\theta})}$}
\pause
% \framesubtitle{}
Have $L(\theta) = (\sigma^2)^{-n/2} (2\pi)^{-n/2} \exp\{-\frac{1}{2\sigma^2}\sum_{i=1}^n(y_i-\mu)^2\}$, so
\pause
\vspace{3mm}
\begin{eqnarray*}
L(\widehat{\theta}) & = & (\widehat{\sigma}^2)^{-n/2} (2\pi)^{-n/2} \exp\{-\frac{1}{2\widehat{\sigma}^2}\sum_{i=1}^n(y_i-\overline{y})^2\} \\ \pause
& = & (\widehat{\sigma}^2)^{-n/2} (2\pi)^{-n/2} \exp\left\{-\frac{\sum_{i=1}^n(y_i-\overline{y})^2}{2\frac{1}{n}\sum_{i=1}^n(y_i-\overline{y})^2}\right\} \\ \pause
& = & (\widehat{\sigma}^2)^{-n/2} (2\pi)^{-n/2} e^{-n/2}
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Likelihood at restricted MLE}
\framesubtitle{$L(\theta) = (\sigma^2)^{-n/2} (2\pi)^{-n/2} \exp\{-\frac{1}{2\sigma^2}\sum_{i=1}^n(y_i-\mu)^2\}$}
\pause
\begin{eqnarray*}
L(\widehat{\theta}_0) & = & (\widehat{\sigma}_0^2)^{-n/2} (2\pi)^{-n/2} \exp\{-\frac{1}{2\widehat{\sigma}_0^2}\sum_{i=1}^n(y_i-\mu_0)^2\} \\ \pause
& = & (\widehat{\sigma}_0^2)^{-n/2} (2\pi)^{-n/2} \exp\left\{-\frac{\sum_{i=1}^n(y_i-\mu_0)^2}{2\frac{1}{n}\sum_{i=1}^n(y_i-\mu_0)^2}\right\} \\ \pause
& = & (\widehat{\sigma}_0^2)^{-n/2} (2\pi)^{-n/2} e^{-n/2}
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Test statistic}
%\framesubtitle{}
{\small
\begin{eqnarray*}
G^2 & = & -2\ln\frac{L(\widehat{\theta}_0)} {L(\widehat{\theta})} \\ \pause
& = & -2\ln\frac{(\widehat{\sigma}_0^2)^{-n/2} (2\pi)^{-n/2} e^{-n/2}} {(\widehat{\sigma}^2)^{-n/2} (2\pi)^{-n/2} e^{-n/2}} \\ \pause
& = & -2\ln\left(\frac{\widehat{\sigma}_0^2}{\widehat{\sigma}^2}\right)^{-n/2} \\ \pause
& = & n\ln \left(\frac{\widehat{\sigma}_0^2}{\widehat{\sigma}^2}\right) \\ \pause
& = & n\ln \left(\frac{\frac{1}{n}\sum_{i=1}^n(Y_i-\mu_0)^2} {\frac{1}{n}\sum_{i=1}^n(Y_i-\overline{Y})^2}\right) \\ \pause
& = & n\ln \left(\frac{\sum_{i=1}^n(Y_i-\mu_0)^2} {\sum_{i=1}^n(Y_i-\overline{Y})^2}\right) \\
\end{eqnarray*}
} % End size
\end{frame}

\section{Multivariate Normal}

\begin{frame}
\frametitle{Multivariate normal likelihood}
\framesubtitle{SAS \texttt{proc calis} default}
\pause
{\footnotesize
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) &=& \prod_{i=1}^n \frac{1}{|\boldsymbol{\Sigma}|^{\frac{1}{2}} (2 \pi)^{\frac{p}{2}}}
   \exp\left\{ -\frac{1}{2} (\mathbf{y}_i-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{y}_i-\boldsymbol{\mu})\right\} \\ \pause
&&\\
&=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2}
   \exp\left\{ -\frac{1}{2} \sum_{i=1}^n (\mathbf{y}_i-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{y}_i-\boldsymbol{\mu})\right\} \\ \pause
&&\\
&=& \cdots \\ \pause
&&\\
&=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1})
   + (\overline{\mathbf{y}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{y}}-\boldsymbol{\mu}) \right\},
\end{eqnarray*}
\pause
where $\boldsymbol{\widehat{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{y}_i-\overline{\mathbf{y}}) (\mathbf{y}_i-\overline{\mathbf{y}})^\top $ is the sample variance-covariance matrix.
} % End size
\end{frame}
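\begin{frame}[fragile]
\frametitle{Checking the `$\cdots$' step numerically}
\framesubtitle{An illustration with simulated data; any $\boldsymbol{\mu}$ and positive definite $\boldsymbol{\Sigma}$ will do}
\pause
{\footnotesize
The omitted algebra says
\begin{displaymath}
\sum_{i=1}^n (\mathbf{y}_i-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}(\mathbf{y}_i-\boldsymbol{\mu})
= n\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1})
+ (\overline{\mathbf{y}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{y}}-\boldsymbol{\mu}) \right\}.
\end{displaymath}
} % End size
\pause
{\scriptsize
\begin{verbatim}
set.seed(9999)   # Simulated data, just to check the algebra
n = 10; p = 3
y = matrix(rnorm(n*p), n, p); ybar = colMeans(y)
SigmaHat = var(y) * (n-1)/n              # MLE version: divide by n
mu = c(1, 0, -1); Sigma = diag(p) + 0.5  # Any mu, any p.d. Sigma
SigInv = solve(Sigma)
lhs = sum( apply(y, 1, function(yi) t(yi-mu) %*% SigInv %*% (yi-mu)) )
rhs = n * ( sum(diag(SigmaHat %*% SigInv)) +
            t(ybar-mu) %*% SigInv %*% (ybar-mu) )
c(lhs, rhs)                              # Same number twice
\end{verbatim}
} % End size
\end{frame}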
\begin{frame}
\frametitle{Sample variance-covariance matrix}
\pause
%\framesubtitle{}
\begin{displaymath}
\mathbf{Y}_i = \left(\begin{array}{c} Y_{i,1} \\ \vdots \\ Y_{i,p} \end{array} \right)
~~~~~~~~~~
\overline{\mathbf{Y}} = \left(\begin{array}{c} \overline{Y}_1 \\ \vdots \\ \overline{Y}_p \end{array} \right)
\end{displaymath}
\pause
\vspace{3mm}
$\boldsymbol{\widehat{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{Y}_i-\overline{\mathbf{Y}}) (\mathbf{Y}_i-\overline{\mathbf{Y}})^\top $ is a $p \times p$ matrix with $(j,k)$ element
\pause
\begin{displaymath}
\frac{1}{n}\sum_{i=1}^n (Y_{i,j}-\overline{Y}_j)(Y_{i,k}-\overline{Y}_k)
\end{displaymath}
\pause
This is a sample variance ($j=k$) or a sample covariance ($j \neq k$).
\end{frame}

\begin{frame}
\frametitle{Multivariate normal likelihood at the MLE}
\framesubtitle{This will be in the denominator of the likelihood ratio test.}
\pause
{\footnotesize
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) &=& |\boldsymbol{\Sigma}|^{-\frac{n}{2}} (2\pi)^{-\frac{np}{2}}
   \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1})
   + (\overline{\mathbf{y}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{y}}-\boldsymbol{\mu}) \right\} \\ \pause
&&\\
L(\widehat{\boldsymbol{\mu}}, \widehat{\boldsymbol{\Sigma}}) &=& |\widehat{\boldsymbol{\Sigma}}|^{-\frac{n}{2}} (2\pi)^{-\frac{np}{2}} e^{-\frac{np}{2}} \\
\end{eqnarray*}
}
\end{frame}

\begin{frame}
\frametitle{Example: Test whether a set of normal random variables are independent}
\framesubtitle{Equivalent to zero covariance}
\pause
\begin{itemize}
\item $\mathbf{Y}_1, \ldots, \mathbf{Y}_n \stackrel{i.i.d.}{\sim} N_p(\boldsymbol{\mu,\Sigma})$ \pause
\item $H_0: \sigma_{ij}=0$ for $i \neq j$. \pause
\item Equivalent to independence for this multivariate normal model. \pause
\item Use $G^2 = -2 \ln \left( \frac{L(\widehat{\theta}_0)} {L(\widehat{\theta}) } \right)$. \pause
\item $df = {p \choose 2}$ \pause
\item Have $L(\widehat{\theta})$.
\item Need $L(\widehat{\theta}_0)$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Getting the restricted MLE}
%\framesubtitle{}
For the multivariate normal, zero covariance is equivalent to independence, so under $H_0$,
\pause
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) & = & \prod_{i=1}^n f(\mathbf{y}_i|\boldsymbol{\mu,\Sigma}) \\ \pause
& = & {\color{red}\prod_{i=1}^n} \left( {\color{blue}\prod_{j=1}^p} f(y_{ij}|\mu_j,\sigma^2_j) \right) \\ \pause
& = & {\color{blue}\prod_{j=1}^p} \left( {\color{red}\prod_{i=1}^n} f(y_{ij}|\mu_j,\sigma^2_j) \right) \\
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Take logs and start differentiating}
%\framesubtitle{}
\begin{eqnarray*}
L(\boldsymbol{\mu}_0,\boldsymbol{\Sigma_0}) & = & \prod_{j=1}^p \left( \prod_{i=1}^n f(y_{ij}|\mu_j,\sigma^2_j) \right) \\ \pause
&&\\
\ell(\boldsymbol{\mu_0,\Sigma_0}) & = & \sum_{j=1}^p \ln\left( \prod_{i=1}^n f(y_{ij}|\mu_j,\sigma^2_j) \right) \\
\end{eqnarray*}
\pause
\vspace{5mm}
It's just $p$ univariate problems, which we have already done.
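\end{frame}

\begin{frame}
\frametitle{The $p$ separate univariate problems}
\framesubtitle{Spelling out the last step on the previous slide}
\pause
Each term of the sum is a univariate normal log likelihood:
\begin{displaymath}
\ell(\boldsymbol{\mu}_0,\boldsymbol{\Sigma}_0) = \sum_{j=1}^p
\left( -\frac{n}{2}\ln\sigma^2_j - \frac{n}{2}\ln(2\pi)
       - \frac{1}{2\sigma^2_j}\sum_{i=1}^n(y_{ij}-\mu_j)^2 \right)
\end{displaymath}
\pause
Term $j$ involves only $(\mu_j,\sigma^2_j)$, so the $p$ terms may be maximized separately.
By the univariate case,
\begin{displaymath}
\widehat{\mu}_j = \overline{y}_j \mbox{ and }
\widehat{\sigma}_j^2 = \frac{1}{n}\sum_{i=1}^n(y_{ij}-\overline{y}_j)^2 .
\end{displaymath}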
\end{frame} \begin{frame} \frametitle{Likelihood at the restricted MLE} %\framesubtitle{} \begin{eqnarray*} L(\widehat{\boldsymbol{\mu}}_0, \widehat{\boldsymbol{\Sigma}}_0) & = & \prod_{j=1}^p \left( (\widehat{\sigma}_j^2)^{-n/2} (2\pi)^{-n/2} \exp\{-\frac{1}{2\widehat{\sigma}_j^2}\sum_{i=1}^n(y_{ij}-\overline{y}_j)^2\} \right) \\ \pause &&\\ & = & \prod_{j=1}^p \left( (\widehat{\sigma}_j^2)^{-n/2} (2\pi)^{-n/2} e^{-n/2} \right) \\ \pause &&\\ & = & \left( \prod_{j=1}^p \widehat{\sigma}_j^2 \right)^{-\frac{n}{2}} (2\pi)^{-\frac{np}{2}} e^{-\frac{np}{2}}, \pause \end{eqnarray*} where $\widehat{\sigma}_j^2$ is a diagonal element of $\widehat{\boldsymbol{\Sigma}}$. \end{frame} \begin{frame} \frametitle{Test statistic} %\framesubtitle{} {\small \begin{eqnarray*} G^2 & = & -2\ln\frac{L(\widehat{\theta}_0)} {L(\widehat{\theta})} \\ \pause & = & -2\ln\frac{\left( \prod_{j=1}^p \widehat{\sigma}_j^2 \right)^{-\frac{n}{2}} (2\pi)^{-\frac{np}{2}} e^{-\frac{np}{2}}} { |\widehat{\boldsymbol{\Sigma}}|^{-\frac{n}{2}} (2\pi)^{-\frac{np}{2}} e^{-\frac{np}{2}} } \\ \pause & = & -2\ln \left(\frac{\prod_{j=1}^p \widehat{\sigma}_j^2} {|\widehat{\boldsymbol{\Sigma}}|} \right)^{-\frac{n}{2}} \\ \pause & = & n\ln \left(\frac{\prod_{j=1}^p \widehat{\sigma}_j^2} {|\widehat{\boldsymbol{\Sigma}}|} \right) \\ \pause & = & n\left(\sum_{j=1}^p \ln\widehat{\sigma}_j^2 - \ln |\widehat{\boldsymbol{\Sigma}}| \right) \end{eqnarray*} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Cars: Weight, length and fuel consumption} \framesubtitle{$G^2 = n\left(\sum_{j=1}^p \ln\widehat{\sigma}_j^2 - \ln |\widehat{\boldsymbol{\Sigma}}| \right)$} \pause {\scriptsize {\color{blue} \begin{verbatim} > kars = read.table("mcars4.data.txt"); attach(kars) > n = length(lper100k); SigmaHat = var(cbind(weight, length, lper100k)) > SigmaHat = SigmaHat * (n-1)/n # Make it the MLE > SigmaHat \end{verbatim} \pause } % End color \begin{verbatim} weight length lper100k weight 129698.9859 186.4174680 984.089620 length 186.4175 0.2993794 1.472152 lper100k 984.0896 1.4721524 10.729116 \end{verbatim} \pause {\color{blue} \begin{verbatim} > Gsq = n * ( sum(log(diag(SigmaHat))) - log(det(SigmaHat)) ) > Gsq # df=3 \end{verbatim} } % End color \begin{verbatim} [1] 347.7159 \end{verbatim} } % End size \end{frame} \section{Numerical MLEs} \begin{frame} \frametitle{Numerical maximum likelihood and testing} \framesubtitle{For the multivariate normal} \pause \begin{itemize} \item Often an explicit formula for $\widehat{\theta}_0$ is out of the question. \pause \item Maximize the log likelihood numerically. \pause \item Equivalently, minimize $-2\ln L(\boldsymbol{\mu},\boldsymbol{\Sigma})$. \pause \item Equivalently, minimize $-2\ln L(\boldsymbol{\mu},\boldsymbol{\Sigma})$ plus a constant. \pause \item Choose the constant well, and minimize \pause \begin{displaymath} -2\ln L(\boldsymbol{\mu},\boldsymbol{\Sigma}) - (-2\ln L(\widehat{\boldsymbol{\mu}},\widehat{\boldsymbol{\Sigma}})) \end{displaymath} over $(\boldsymbol{\mu},\boldsymbol{\Sigma}) \in \Theta_0$. \pause \item The value of this function at the stopping place is the likelihood ratio test statistic. 
\end{itemize} \end{frame} \begin{frame} \frametitle{Simplifying \ldots} {\scriptsize \begin{columns} % Use Beamer's columns to use more of the margins \column{1.2\textwidth} \begin{eqnarray*} -2\ln\frac{L(\boldsymbol{\mu},\boldsymbol{\Sigma})} {L(\widehat{\boldsymbol{\mu}} ,\widehat{\boldsymbol{\Sigma}})} & = & \pause -2\ln\frac{|\boldsymbol{\Sigma}|^{-\frac{n}{2}} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{y}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{y}}-\boldsymbol{\mu}) \right\}} {|\widehat{\boldsymbol{\Sigma}}|^{-\frac{n}{2}} e^{-\frac{np}{2}}} \\ \pause & = & -2\ln\left( |\boldsymbol{\Sigma}|^{-\frac{n}{2}} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{y}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{y}}-\boldsymbol{\mu}) \right\} |\widehat{\boldsymbol{\Sigma}}|^{\frac{n}{2}} e^{\frac{np}{2}} \right) \\ \pause & = & -2\ln\left( |\boldsymbol{\Sigma}| \exp\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{y}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{y}}-\boldsymbol{\mu}) \right\} |\widehat{\boldsymbol{\Sigma}}|^{-1} e^{-p} \right)^{-\frac{n}{2}} \\ \pause & = & n \left( tr\left(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}\right) - p + \ln |\boldsymbol{\Sigma}| - \ln |\widehat{\boldsymbol{\Sigma}}| + (\overline{\mathbf{y}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{y}}-\boldsymbol{\mu}) \right) \pause \end{eqnarray*} \end{columns} \vspace{5mm} \begin{itemize} \item To avoid numerical problems in minimizing the function, drop the $n$. \pause \item The result is the ``discrepancy function" $F_{ML}$ on page 1247 of the Version 9.3 \texttt{proc calis} manual. \pause \item The discrepancy function is also called the ``objective function" in other parts of the manual and in the Results file. \end{itemize} } % End size \end{frame} \begin{frame} \frametitle{Later in the course} \pause \framesubtitle{Recalling $F_{ML} = tr\left(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}\right) - p + \ln |\boldsymbol{\Sigma}| - \ln |\widehat{\boldsymbol{\Sigma}}| + (\overline{\mathbf{y}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{y}}-\boldsymbol{\mu})$} \begin{itemize} \item Model is based on systems of equations with unknown parameters $\boldsymbol{\theta} \in \Theta$. \pause \item $\boldsymbol{\mu}(\boldsymbol{\theta})$ and $\boldsymbol{\Sigma}(\boldsymbol{\theta})$ are the mean and covariance matrix of the \emph{observable} variables. \pause \item We will give up on the parameters that appear only in $\boldsymbol{\mu}$. \pause Estimate $\boldsymbol{\mu}$ with $\overline{\mathbf{y}}$ and it disappears from $F_{ML}$. \pause \item Calculate the covariance matrix $\boldsymbol{\Sigma} = \boldsymbol{\Sigma}(\boldsymbol{\theta})$ from the model equations. \pause \item Minimize the objective function \pause \begin{displaymath} F_{ML}(\boldsymbol{\theta}) = tr\left(\boldsymbol{\widehat{\Sigma}\Sigma}(\boldsymbol{\theta})^{-1}\right) - p + \ln |\boldsymbol{\Sigma}(\boldsymbol{\theta})| - \ln |\widehat{\boldsymbol{\Sigma}}| \end{displaymath} over all $\boldsymbol{\theta} \in \Theta$. \pause \item The result is $\widehat{\boldsymbol{\theta}}$. \pause Can also obtain $\widehat{\boldsymbol{\theta}}_0$ by minimizing over $\Theta_0$. 
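\pause
\item Next slide: a small numerical illustration.
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Numerical minimization: a small illustration in R}
\framesubtitle{Simulated data; a sketch of the idea, not SAS \texttt{proc calis}}
Minimize $F_{ML}$ over the independence model $\Theta_0$ (diagonal $\boldsymbol{\Sigma}$)
and compare $n$ times the minimum to the closed-form $G^2$ from the independence example.
{\scriptsize
\begin{verbatim}
set.seed(4444)
n = 200; p = 3
y = matrix(rnorm(n*p), n, p)
y[,2] = y[,2] + y[,1]; y[,3] = y[,3] + y[,1]  # Correlated variables
SigmaHat = var(y) * (n-1)/n                   # MLE of Sigma
# F_ML with Sigma(theta) = diag(exp(logv)): the independence model
Fml = function(logv) {
  Sigma = diag(exp(logv))
  sum(diag(SigmaHat %*% solve(Sigma))) - p +
    log(det(Sigma)) - log(det(SigmaHat))
}
fit = optim(par = rep(0, p), fn = Fml)  # Numerical search over Theta0
n * fit$value      # n times the value at the stopping place
n * ( sum(log(diag(SigmaHat))) - log(det(SigmaHat)) )  # Closed form
\end{verbatim}
} % End size
\end{frame}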
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s17}
{\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s17}}

\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

kars = read.table("http://www.utstat.utoronto.ca/~brunner/data/legal/mcars4.data.txt"); attach(kars)
n = length(lper100k); SigmaHat = var(cbind(weight, length, lper100k))
SigmaHat = SigmaHat * (n-1)/n # Make it the MLE
SigmaHat
Gsq = n * ( sum(log(diag(SigmaHat))) - log(det(SigmaHat)) )
Gsq # df=3