% \documentclass[mathserif]{beamer} % Get Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usetheme{Berlin} % Displays sections on top
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols at bottom
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Warsaw} % Displays sections on top
% \usetheme{Frankfurt} % Displays sections on top: Fairly thin but swallows some material at bottom of crowded slides
\usepackage[english]{babel}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Likelihood 2: Wald Tests\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2017}
\date{} % To suppress date
% Cut out a lot of detail in 2014: see 2013 version

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Background Reading}
%\framesubtitle{It may be a little bit helpful.}
Davison Chapter 4, especially Sections 4.3 and 4.4
\end{frame}

\begin{frame}{Vector of MLEs is Asymptotically Normal}{That is, Multivariate Normal}
\pause
This yields \pause
\begin{itemize}
\item Confidence intervals
\item $Z$-tests of $H_0: \theta_j=\theta_0$
\item Wald tests
\item Score tests
\item Indirectly, the likelihood ratio tests
\end{itemize}
\end{frame}

\begin{frame}{Under Regularity Conditions}{(Thank you, Mr. Wald)}
\pause
\begin{itemize}
\item $\widehat{\boldsymbol{\theta}}_n \stackrel{a.s.}{\rightarrow} \boldsymbol{\theta}$ \pause
\item $\sqrt{n}(\widehat{\boldsymbol{\theta}}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow}
\mathbf{T} \sim N_k\left(\mathbf{0}, \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right )$ \pause
\item So we say that $\widehat{\boldsymbol{\theta}}_n$ is asymptotically
$N_k\left(\boldsymbol{\theta}, \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right )$. \pause
\item $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$ is the Fisher information in one observation. \pause
\item A $k \times k$ matrix
\begin{displaymath}
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) =
\left[E\left(-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})\right)\right]
\end{displaymath} \pause
\item The Fisher information in the whole sample is $n\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{$\widehat{\boldsymbol{\theta}}_n$ is asymptotically
$N_k\left(\boldsymbol{\theta}, \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right )$}
\pause
%\framesubtitle{}
\begin{itemize}
\item The asymptotic covariance matrix of $\widehat{\boldsymbol{\theta}}_n$ is
$\frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}$, and of course we don't know
$\boldsymbol{\theta}$. \pause
\item For tests and confidence intervals, we need a good \emph{approximate} asymptotic covariance matrix, \pause
\item based on a consistent estimate of the Fisher information matrix. \pause
\item $\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_n)$ would do. \pause
\item But it's inconvenient\pause: we would need to compute the partial derivatives and expected values in \pause
\begin{displaymath}
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) =
\left[E\left(-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})\right)\right]
\end{displaymath} \pause
and then substitute $\widehat{\boldsymbol{\theta}}_n$ for $\boldsymbol{\theta}$.
\end{itemize}
\end{frame}
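\begin{frame}{Small Worked Example of $\boldsymbol{\mathcal{I}}(\theta)$}{One observation from an exponential distribution}
A quick illustration (any smooth one-parameter model would do, so here $k=1$):
let $Y$ have density $f(y;\theta) = \frac{1}{\theta} e^{-y/\theta}$ for $y>0$, so that $E(Y)=\theta$.
\begin{eqnarray*}
\log f(Y;\theta) & = & -\log\theta - Y/\theta \\
\frac{\partial}{\partial\theta} \log f(Y;\theta) & = & -\frac{1}{\theta} + \frac{Y}{\theta^2} \\
-\frac{\partial^2}{\partial\theta^2} \log f(Y;\theta) & = & -\frac{1}{\theta^2} + \frac{2Y}{\theta^3} \\
\mathcal{I}(\theta) \; = \;
E\left(-\frac{1}{\theta^2} + \frac{2Y}{\theta^3}\right)
& = & -\frac{1}{\theta^2} + \frac{2\theta}{\theta^3} \; = \; \frac{1}{\theta^2}
\end{eqnarray*}
So $\widehat{\theta}_n = \overline{Y}_n$ is asymptotically
$N\left(\theta, \frac{1}{n}\mathcal{I}(\theta)^{-1}\right) = N\left(\theta, \frac{\theta^2}{n}\right)$,
consistent with the familiar $Var(\overline{Y}_n) = \theta^2/n$.
\end{frame}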
\begin{frame}
\frametitle{Another approximation of the asymptotic covariance matrix}
\pause
% \framesubtitle{}
Approximate
\begin{displaymath}
\frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1} =
\left[ n \, E\left(-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})\right)\right]^{-1}
\end{displaymath}
\pause
with
\begin{displaymath}
\widehat{\mathbf{V}}_n = \left( \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1}
\end{displaymath}
\vspace{4mm}

Details of why it's a good approximation are omitted.
\end{frame}

\begin{frame}{Compare}{Hessian and (Estimated) Asymptotic Covariance Matrix}
\pause
\begin{itemize}
\item $\widehat{\mathbf{V}}_n = \left( \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1}$ \pause
\item The Hessian at the MLE is $\mathbf{H} = \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n}$ \pause
\item So to estimate the asymptotic covariance matrix of $\widehat{\boldsymbol{\theta}}_n$, just invert the Hessian. \pause
\item The Hessian is usually available as a by-product of a numerical search for the MLE.
\end{itemize}
\end{frame}

\begin{frame}{Connection to Numerical Optimization}
\pause
\begin{itemize}
\item Suppose we are minimizing the minus log likelihood by a numerical search. \pause
\item We have reached a point where the gradient is close to zero. Is this point a minimum? \pause
\item The Hessian is the matrix of second partial derivatives. If all its eigenvalues are positive at a point, the function is concave up there. \pause
\item Partial derivatives are often approximated by the slopes of secant lines -- no need to calculate them symbolically. \pause
\item It's \emph{the} multivariable second derivative test.
\end{itemize}
\end{frame}

\begin{frame}{So to find the estimated asymptotic covariance matrix}
\pause
\begin{itemize}
\item Minimize the minus log likelihood numerically. \pause
\item The Hessian at the place where the search stops is usually available. \pause
\item Invert it to get $\widehat{\mathbf{V}}_n$. \pause
\item This is so handy that sometimes we do it even when a closed-form expression for the MLE is available.
\end{itemize}
\end{frame}

\begin{frame}{Estimated Asymptotic Covariance Matrix $\widehat{\mathbf{V}}_n$ is Useful}
\pause
\begin{itemize}
\item The asymptotic standard error of $\widehat{\theta}_j$ is the square root of the $j$th diagonal element. \pause
\item Denote the asymptotic standard error of $\widehat{\theta}_j$ by $S_{\widehat{\theta}_j}$. \pause
\item Thus
\begin{displaymath}
Z_j = \frac{\widehat{\theta}_j-\theta_j}{S_{\widehat{\theta}_j}}
\end{displaymath} \pause
is approximately standard normal.
\end{itemize}
\end{frame}
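\begin{frame}[fragile]{A Numerical Sketch}{Illustrative Python code, not part of the course materials}
A minimal sketch of the recipe above, under stated assumptions: simulated $N(\mu,\sigma^2)$ data,
parameter $\boldsymbol{\theta} = (\mu, \log\sigma)$, and illustrative names throughout.
Minimize the minus log likelihood, invert a finite-difference Hessian to get $\widehat{\mathbf{V}}_n$,
and form $Z$ for $H_0: \mu = 5$.
{\scriptsize
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

y = np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=200)  # fake data

def minus_loglik(theta):
    # Minus log likelihood for Y ~ N(mu, sigma^2), theta = (mu, log sigma)
    mu, log_sigma = theta
    sigma = np.exp(log_sigma)
    return np.sum(0.5*np.log(2*np.pi) + log_sigma + (y - mu)**2/(2*sigma**2))

fit = minimize(minus_loglik, x0=np.zeros(2), method="BFGS")  # numerical search
theta_hat = fit.x                                            # the MLE

def num_hessian(f, x, eps=1e-4):
    # Central-difference approximation to the Hessian of f at x
    k = len(x); H = np.zeros((k, k)); E = eps*np.eye(k)
    for i in range(k):
        for j in range(k):
            H[i, j] = (f(x+E[i]+E[j]) - f(x+E[i]-E[j])
                       - f(x-E[i]+E[j]) + f(x-E[i]-E[j])) / (4*eps**2)
    return H

V_hat = np.linalg.inv(num_hessian(minus_loglik, theta_hat))  # estimated V_n
se = np.sqrt(np.diag(V_hat))          # asymptotic standard errors
Z = (theta_hat[0] - 5.0) / se[0]      # Z statistic for H0: mu = 5
\end{verbatim}
}
\end{frame}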
\begin{frame}{Confidence Intervals and $Z$-tests}
\pause
Have $ Z_j = \frac{\widehat{\theta}_j-\theta_j}{S_{\widehat{\theta}_j}}$ approximately standard normal, yielding \pause
\begin{itemize}
\item Confidence intervals: $\widehat{\theta}_j \pm S_{\widehat{\theta}_j} z_{\alpha/2}$ \pause
\item Tests of $H_0: \theta_j=\theta_0$ using
\begin{displaymath}
Z = \frac{\widehat{\theta}_j-\theta_0}{S_{\widehat{\theta}_j}}
\end{displaymath}
\end{itemize}
\end{frame}

\begin{frame}{And Wald Tests}
\pause
\begin{displaymath}
W_n = (\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\top
\left(\mathbf{L} \widehat{\mathbf{V}}_n \mathbf{L}^\top\right)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})
\end{displaymath}
\pause
\vspace{5mm}

A very important special case of the earlier general form, with
$\mathbf{T}_n = \widehat{\boldsymbol{\theta}}_n$ and
$\frac{1}{n}\widehat{\boldsymbol{\Sigma}}_n = \widehat{\mathbf{V}}_n$: \pause
\vspace{5mm}

\begin{eqnarray*}
W_n & = & n \left( \mathbf{LT}_n - \mathbf{h} \right)^\top
\left(\mathbf{L}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\top \right)^{-1}
\left(\mathbf{LT}_n - \mathbf{h} \right) \\
& = & \left( \mathbf{LT}_n - \mathbf{h} \right)^\top
\left(\mathbf{L}\frac{1}{n}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\top \right)^{-1}
\left(\mathbf{LT}_n - \mathbf{h} \right)
\end{eqnarray*}
\end{frame}

\begin{frame}{Score Tests}{Thank you, Mr. Rao}
\pause
\begin{itemize}
\item $\widehat{\boldsymbol{\theta}}$ is the MLE of $\boldsymbol{\theta}$, dimension $k \times 1$. \pause
\item $\widehat{\boldsymbol{\theta}}_0$ is the MLE under $H_0$, dimension $k \times 1$. \pause
\item $\mathbf{u}(\boldsymbol{\theta}) = (\frac{\partial \ell}{\partial \theta_1}, \ldots,
\frac{\partial \ell}{\partial \theta_k})^\top$ is the gradient. \pause
\item $\mathbf{u}(\widehat{\boldsymbol{\theta}})=\mathbf{0}$. \pause
\item If $H_0$ is true, $\mathbf{u}(\widehat{\boldsymbol{\theta}}_0)$ should also be close to zero. \pause
\item Under $H_0$ for large $n$, $\mathbf{u}(\widehat{\boldsymbol{\theta}}_0) \sim
N_k(\mathbf{0}, n\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}))$, approximately. \pause
\item And
\end{itemize}
\begin{displaymath}
S = \mathbf{u}(\widehat{\boldsymbol{\theta}}_0)^\top
\frac{1}{n}\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_0)^{-1}
\mathbf{u}(\widehat{\boldsymbol{\theta}}_0) \stackrel{\cdot}{\sim} \chi^2(r),
\end{displaymath}
where $r$ is the number of restrictions imposed by $H_0$. \pause
Or use the inverse of the Hessian (under $H_0$) instead of
$\frac{1}{n}\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_0)^{-1}$.
\end{frame}

\begin{frame}{Three Big Tests}
\begin{itemize}
\item Score tests: Fit just the restricted model
\item Wald tests: Fit just the unrestricted model
\item Likelihood ratio tests: Fit both
\end{itemize}
\end{frame}

\begin{frame}{Comparing Likelihood Ratio and Wald tests}
\pause
\begin{itemize}
\item Asymptotically equivalent under $H_0$, meaning \pause
$(W_n-G^2_n) \stackrel{p}{\rightarrow} 0$ \pause % Score too
\item Under $H_1$, \pause
\begin{itemize}
\item Both have the same approximate distribution (non-central chi-square). \pause
\item Both go to infinity as $n \rightarrow \infty$. \pause
\item But their values are not necessarily close. \pause
\end{itemize}
\item The likelihood ratio test tends to get closer to the right Type I error probability in small samples. \pause
\item Wald can be more convenient when testing lots of hypotheses, because you only need to fit the model once. \pause
\item Wald can be more convenient if it's a lot of work to write the restricted likelihood.
\end{itemize}
\end{frame}
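\begin{frame}[fragile]{A Numerical Sketch of $W_n$}{Illustrative Python code; continues the earlier sketch}
A minimal sketch of the Wald statistic for a single restriction, assuming \texttt{theta\_hat} and
\texttt{V\_hat} come from the earlier code slide. Here $\mathbf{L}$ and $\mathbf{h}$ encode the
illustrative null hypothesis $H_0: \mu = 5$, so $r=1$ and $W_n$ equals the square of the earlier $Z$.
{\scriptsize
\begin{verbatim}
import numpy as np
from scipy.stats import chi2

# theta_hat and V_hat are assumed to come from the previous sketch.
L = np.array([[1.0, 0.0]])          # picks out mu: H0 is mu = 5
h = np.array([5.0])

d = L @ theta_hat - h               # L theta_hat - h
W = float(d @ np.linalg.inv(L @ V_hat @ L.T) @ d)   # Wald statistic
r = L.shape[0]                      # number of restrictions (here 1)
p_value = chi2.sf(W, df=r)          # upper-tail chi-square p-value
\end{verbatim}
}
\end{frame}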
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner},
Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like
and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf17}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf17}}

\end{frame}

\end{document}