\documentclass[mathserif]{beamer} % Get Computer Modern math font.
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
% Uncomment next 2 lines instead of the first for article-style handout:
% \documentclass[12pt]{article}
% \usepackage{beamerarticle}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols at bottom
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Warsaw} % Displays sections on top
% \usetheme{Frankfurt} % Displays sections on top: Fairly thin but swallows some material at bottom of crowded slides
\usepackage[english]{babel}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Likelihood 2: Wald (and Score) Tests\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2013}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Background Reading}
%\framesubtitle{It may be a little bit helpful.}
Davison Chapter 4, especially Sections 4.3 and 4.4
\end{frame}

\begin{frame}{Vector of MLEs is Asymptotically Normal}{That is, Multivariate Normal}
This yields
\begin{itemize}
\item Confidence intervals
\item $Z$-tests of $H_0: \theta_j=\theta_0$
\item Wald tests
\item Score tests
\item Indirectly, the Likelihood Ratio tests
\end{itemize}
\end{frame}

\begin{frame}{Under Regularity Conditions}{(Thank you, Mr. Wald)}
\begin{itemize}
\item $\widehat{\boldsymbol{\theta}}_n \stackrel{a.s.}{\rightarrow} \boldsymbol{\theta}$
\item $\sqrt{n}(\widehat{\boldsymbol{\theta}}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow}
\mathbf{T} \sim N_k\left(\mathbf{0}, \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right )$
\item So we say that $\widehat{\boldsymbol{\theta}}_n$ is asymptotically
$N_k\left(\boldsymbol{\theta}, \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right )$.
\item $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$ is the Fisher Information in one observation.
\item A $k \times k$ matrix
\begin{displaymath}
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) =
\left[E[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})]\right]
\end{displaymath}
\item The Fisher Information in the whole sample is $n\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$
\end{itemize}
\end{frame}
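% Added illustrative frame: the scalar (k = 1) case of the definitions above.
\begin{frame}{A Quick One-Parameter Example}{Fisher Information for a Poisson observation; an added illustration}
For example, suppose $Y \sim$ Poisson$(\lambda)$, so that
$\log f(Y;\lambda) = Y\log\lambda - \lambda - \log Y!$. Then
\begin{displaymath}
-\frac{\partial^2}{\partial\lambda^2} \log f(Y;\lambda) = \frac{Y}{\lambda^2}
\qquad \mbox{and} \qquad
\mathcal{I}(\lambda) = E\left(\frac{Y}{\lambda^2}\right) = \frac{1}{\lambda}.
\end{displaymath}
So $\widehat{\lambda}_n = \overline{Y}_n$ is asymptotically
$N\left(\lambda, \frac{\lambda}{n}\right)$.
\end{frame}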
\begin{frame}{$H_0: \mathbf{L}\boldsymbol{\theta}= \mathbf{h}$}

Suppose $\boldsymbol{\theta} = (\theta_1, \ldots, \theta_7)$, and the null hypothesis is
\begin{itemize}
\item $\theta_1=\theta_2$
\item $\theta_6=\theta_7$
\item $\frac{1}{3}\left(\theta_1+\theta_2+\theta_3\right) =
      \frac{1}{3}\left(\theta_4+\theta_5+\theta_6\right)$
\end{itemize}
We can write the null hypothesis in matrix form as
\begin{displaymath}
\left[ \begin{array}{r r r r r r r}
1 & -1 & ~0 & ~0 & ~0 & ~0 & ~0 \\
0 &  0 &  0 &  0 &  0 &  1 & -1 \\
1 &  1 &  1 & -1 & -1 & -1 &  0 \\
\end{array} \right]
\left[ \begin{array}{r}
\theta_1 \\ \theta_2 \\ \theta_3 \\ \theta_4 \\ \theta_5 \\ \theta_6 \\ \theta_7
\end{array} \right]
=
\left[ \begin{array}{r} 0 \\ 0 \\ 0 \end{array} \right]
\end{displaymath}
\end{frame}
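% Added sketch: setting up L and h from the preceding slide in software.
\begin{frame}[fragile]{Setting Up $\mathbf{L}$ and $\mathbf{h}$ in Software}{An added sketch in Python/NumPy; any matrix language would do}
The example hypothesis from the previous slide, as a NumPy array:
{\footnotesize
\begin{verbatim}
import numpy as np

# One row of L per restriction in the null hypothesis
L = np.array([[1, -1,  0,  0,  0,  0,  0],   # theta1 = theta2
              [0,  0,  0,  0,  0,  1, -1],   # theta6 = theta7
              [1,  1,  1, -1, -1, -1,  0]],  # theta1+theta2+theta3
             dtype=float)                    #   = theta4+theta5+theta6
h = np.zeros(3)

np.linalg.matrix_rank(L)   # 3, so L is r x k = 3 x 7 of full row rank
\end{verbatim}
}
\end{frame}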
\begin{frame}{Suppose $H_0: \mathbf{L}\boldsymbol{\theta}= \mathbf{h}$ is True, and $\widehat{\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})}_n \stackrel{p}{\rightarrow} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$}

By Slutsky 6a (Continuous mapping),
\begin{displaymath}
\sqrt{n}(\mathbf{L}\widehat{\boldsymbol{\theta}}_n - \mathbf{L}\boldsymbol{\theta}) =
\sqrt{n}(\mathbf{L}\widehat{\boldsymbol{\theta}}_n - \mathbf{h}) \stackrel{d}{\rightarrow}
\mathbf{LT} \sim N_r\left(\mathbf{0},
\mathbf{L} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1} \mathbf{L}^\prime \right )
\end{displaymath}
and
\begin{displaymath}
\widehat{\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})}_n^{-1} \stackrel{p}{\rightarrow}
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}.
\end{displaymath}
Then by Slutsky's (6c) Stack Theorem,
\begin{displaymath}
\left( \begin{array}{c}
\sqrt{n}(\mathbf{L}\widehat{\boldsymbol{\theta}}_n - \mathbf{h}) \\
\widehat{\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})}_n^{-1}
\end{array} \right)
\stackrel{d}{\rightarrow}
\left( \begin{array}{c}
\mathbf{LT} \\
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}
\end{array} \right).
\end{displaymath}
Finally, by Slutsky 6a again,
\begin{eqnarray*}
W_n &=& n(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\prime
(\mathbf{L} \widehat{\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})}_n^{-1} \mathbf{L}^\prime)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h}) \\
& \stackrel{d}{\rightarrow} & W = (\mathbf{LT}-\mathbf{0})^\prime
(\mathbf{L} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1} \mathbf{L}^\prime)^{-1}
(\mathbf{LT}-\mathbf{0}) \sim \chi^2(r)
\end{eqnarray*}
\end{frame}

\begin{frame}{The Wald Test Statistic}
{$W_n = n(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\prime
(\mathbf{L} \widehat{\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})}_n^{-1} \mathbf{L}^\prime)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})$}
\begin{itemize}
\item Again, the null hypothesis is $H_0: \mathbf{L}\boldsymbol{\theta}= \mathbf{h}$
\item Matrix $\mathbf{L}$ is $r \times k$, $r \leq k$, rank $r$
\item All we need is a consistent estimator of $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$
\item $\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_n)$ would do
\item But it's inconvenient
\item Need to compute partial derivatives and expected values in
\end{itemize}
\begin{displaymath}
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) =
\left[E[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})]\right]
\end{displaymath}
\end{frame}

\begin{frame}{Observed Fisher Information}
\begin{itemize}
\item To find $\widehat{\boldsymbol{\theta}}_n$, minimize the minus log likelihood.
\item Matrix of mixed partial derivatives of the minus log likelihood is
\begin{displaymath}
\left[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\ell(\boldsymbol{\theta},\mathbf{Y}) \right]
= \left[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\sum_{i=1}^n \log f(Y_i;\boldsymbol{\theta}) \right]
\end{displaymath}
\item So by the Strong Law of Large Numbers,
\begin{eqnarray*}
\boldsymbol{\mathcal{J}}_n(\boldsymbol{\theta}) &=&
\left[\frac{1}{n}\sum_{i=1}^n -\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y_i;\boldsymbol{\theta}) \right] \\
&\stackrel{a.s.}{\rightarrow}&
\left[E\left(-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})\right)\right]
= \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})
\end{eqnarray*}
\end{itemize}
\end{frame}

\begin{frame}{A Consistent Estimator of $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$}
{Just substitute $\widehat{\boldsymbol{\theta}}_n$ for $\boldsymbol{\theta}$}
\begin{eqnarray*}
\boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n) &=&
\left[\frac{1}{n}\sum_{i=1}^n -\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y_i;\boldsymbol{\theta}) \right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \\
&\stackrel{a.s.}{\rightarrow}&
\left[E\left(-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})\right)\right]
= \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})
\end{eqnarray*}
\begin{itemize}
\item Convergence is believable but not trivial to show.
\item Now we have a consistent estimator, more convenient than
$\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_n)$: Use
$\widehat{\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})}_n = \boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n)$
\end{itemize}
\end{frame}
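% Added illustrative frame: the scalar Poisson case of the observed information.
\begin{frame}{Example: Observed Information}{For an i.i.d. Poisson$(\lambda)$ sample; an added illustration}
With $Y_1, \ldots, Y_n$ i.i.d. Poisson$(\lambda)$ and $\widehat{\lambda}_n = \overline{Y}_n$,
we have $-\frac{\partial^2}{\partial\lambda^2} \log f(Y_i;\lambda) = Y_i/\lambda^2$, so
\begin{displaymath}
\mathcal{J}_n(\lambda) = \frac{1}{n}\sum_{i=1}^n \frac{Y_i}{\lambda^2}
= \frac{\overline{Y}_n}{\lambda^2}
\qquad \Rightarrow \qquad
\mathcal{J}_n(\widehat{\lambda}_n) = \frac{\overline{Y}_n}{\overline{Y}_n^{\,2}}
= \frac{1}{\overline{Y}_n}
\stackrel{a.s.}{\rightarrow} \frac{1}{\lambda} = \mathcal{I}(\lambda).
\end{displaymath}
Here $\mathcal{J}_n(\widehat{\lambda}_n)$ happens to equal
$\mathcal{I}(\widehat{\lambda}_n)$; in general the two estimators differ.
\end{frame}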
\begin{frame}{Approximate the Asymptotic Covariance Matrix}
\begin{itemize}
\item Asymptotic covariance matrix of $\widehat{\boldsymbol{\theta}}_n$ is
$\frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}$. \vspace{5mm}
\item Approximate it with
\begin{eqnarray*}
\widehat{\mathbf{V}}_n &=& \frac{1}{n} \boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n)^{-1} \\
&=& \frac{1}{n}\left( \frac{1}{n}\left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1} \\
&=& \left( \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1}
\end{eqnarray*}
\end{itemize}
\end{frame}

\begin{frame}{Compare}{Hessian and (Estimated) Asymptotic Covariance Matrix}
\begin{itemize}
\item $\widehat{\mathbf{V}}_n = \left( \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n} \right)^{-1}$
\item Hessian at the MLE is $\mathbf{H} = \left[-\frac{\partial^2}
{\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n}$
\item So to estimate the asymptotic covariance matrix of $\widehat{\boldsymbol{\theta}}_n$, just invert the Hessian.
\item The Hessian is usually available as a by-product of numerical search for the MLE.
\end{itemize}
\end{frame}

\begin{frame}{Connection to Numerical Optimization}
\begin{itemize}
\item Suppose we are minimizing the minus log likelihood by a direct search.
\item We have reached a point where the gradient is close to zero. Is this point a minimum?
\item The Hessian is a matrix of mixed partial derivatives. If all its eigenvalues are positive at a point, the function is concave up there.
\item It's \emph{the} multivariable second derivative test.
\item The Hessian at the MLE is exactly the observed Fisher information matrix.
\item Partial derivatives are often approximated by the slopes of secant lines -- no need to calculate them symbolically.
\end{itemize}
\end{frame}

\begin{frame}{So to find the estimated asymptotic covariance matrix}
\begin{itemize}
\item Minimize the minus log likelihood numerically.
\item The Hessian at the place where the search stops is exactly the observed Fisher information matrix.
\item Invert it to get $\widehat{\mathbf{V}}_n$.
\item This is so handy that sometimes we do it even when a closed-form expression for the MLE is available.
\end{itemize}
\end{frame}
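% Added sketch: the numerical recipe on the preceding slide, in Python/SciPy.
\begin{frame}[fragile]{Numerical Sketch: From Minus Log Likelihood to $\widehat{\mathbf{V}}_n$}{An added illustration in Python/SciPy; the model, data and settings below are made up}
One way to carry out the recipe on the previous slide, with simulated
Normal$(\mu, \sigma^2)$ data:
{\scriptsize
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

# Simulated data for illustration: Y_1,...,Y_n i.i.d. Normal(mu, sigma^2)
rng = np.random.default_rng(442)
y = rng.normal(loc=10.0, scale=2.0, size=200)

def minus_loglik(theta, y):
    mu, log_sigma = theta            # work with log(sigma) to keep sigma > 0
    sigma = np.exp(log_sigma)
    return np.sum(0.5*np.log(2*np.pi) + np.log(sigma)
                  + (y - mu)**2 / (2*sigma**2))

x0 = np.array([y.mean(), np.log(y.std())])            # starting values
fit = minimize(minus_loglik, x0, args=(y,), method="BFGS")
theta_hat = fit.x                    # MLE of (mu, log_sigma)

# BFGS returns an approximation to the inverse Hessian at the minimum;
# for serious work, compute the exact Hessian and invert it.
V_hat = fit.hess_inv
se = np.sqrt(np.diag(V_hat))         # asymptotic standard errors
\end{verbatim}
}
\end{frame}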
\begin{frame}{Estimated Asymptotic Covariance Matrix $\widehat{\mathbf{V}}_n$ is Useful}
\begin{itemize}
\item Asymptotic standard error of $\widehat{\theta}_j$ is the square root of the $j$th diagonal element.
\item Denote the asymptotic standard error of $\widehat{\theta}_j$ by $S_{\widehat{\theta}_j}$.
\item Thus
\begin{displaymath}
Z_j = \frac{\widehat{\theta}_j-\theta_j}{S_{\widehat{\theta}_j}}
\end{displaymath}
is approximately standard normal.
\end{itemize}
\end{frame}

\begin{frame}{Confidence Intervals and $Z$-tests}

Have $Z_j = \frac{\widehat{\theta}_j-\theta_j}{S_{\widehat{\theta}_j}}$ approximately
standard normal, yielding
\begin{itemize}
\item Confidence intervals: $\widehat{\theta}_j \pm S_{\widehat{\theta}_j} z_{\alpha/2}$
\item Test $H_0: \theta_j=\theta_0$ using
\begin{displaymath}
Z = \frac{\widehat{\theta}_j-\theta_0}{S_{\widehat{\theta}_j}}
\end{displaymath}
\end{itemize}
\end{frame}

\begin{frame}{And Wald Tests}{Recalling $\widehat{\mathbf{V}}_n = \frac{1}{n} \boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n)^{-1}$}
\begin{eqnarray*}
W_n &=& n(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\prime
(\mathbf{L} \widehat{\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})}_n^{-1} \mathbf{L}^\prime)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h}) \\
&=& n(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\prime
(\mathbf{L} \boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_n)^{-1} \mathbf{L}^\prime)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h}) \\
&=& n(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\prime
\left(\mathbf{L} (n\widehat{\mathbf{V}}_n) \mathbf{L}^\prime\right)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h}) \\
&=& n(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\prime
\frac{1}{n}\left(\mathbf{L} \widehat{\mathbf{V}}_n \mathbf{L}^\prime\right)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h}) \\
&=& (\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\prime
\left(\mathbf{L} \widehat{\mathbf{V}}_n \mathbf{L}^\prime\right)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})
\end{eqnarray*}
\end{frame}
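% Added sketch: computing the Wald statistic from a fitted model's output.
\begin{frame}[fragile]{Computing $W_n$ in Software}{An added Python/NumPy sketch, assuming $\widehat{\boldsymbol{\theta}}_n$ and $\widehat{\mathbf{V}}_n$ come from the unrestricted model}
The last expression on the previous slide, as a small function; the $p$-value
uses the chi-squared survival function from \texttt{scipy.stats}:
{\footnotesize
\begin{verbatim}
import numpy as np
from scipy.stats import chi2

def wald_test(L, theta_hat, V_hat, h):
    """W_n = (L theta_hat - h)' (L V_hat L')^{-1} (L theta_hat - h),
    compared to chi-squared with r = rows(L) degrees of freedom."""
    L = np.asarray(L, dtype=float)
    diff = L @ theta_hat - h
    W = float(diff @ np.linalg.solve(L @ V_hat @ L.T, diff))
    r = L.shape[0]
    return W, r, chi2.sf(W, df=r)    # statistic, df, p-value
\end{verbatim}
}
\end{frame}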
\begin{frame}{Score Tests}{Thank you, Mr. Rao}
\begin{itemize}
\item $\widehat{\boldsymbol{\theta}}$ is the MLE of $\boldsymbol{\theta}$, size $k \times 1$
\item $\widehat{\boldsymbol{\theta}}_0$ is the MLE under $H_0$, size $k \times 1$
\item $\mathbf{u}(\boldsymbol{\theta}) = (\frac{\partial \ell}{\partial \theta_1},
\ldots, \frac{\partial \ell}{\partial \theta_k})^\prime$ is the gradient of the log likelihood.
\item $\mathbf{u}(\widehat{\boldsymbol{\theta}})=\mathbf{0}$
\item If $H_0$ is true, $\mathbf{u}(\widehat{\boldsymbol{\theta}}_0)$ should also be close to zero.
\item Under $H_0$ for large $n$, $\mathbf{u}(\widehat{\boldsymbol{\theta}}_0) \sim
N_k(\mathbf{0}, n\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}))$, approximately.
\item And,
\end{itemize}
\begin{displaymath}
S = \mathbf{u}(\widehat{\boldsymbol{\theta}}_0)^\prime
\left(n \boldsymbol{\mathcal{J}}_n(\widehat{\boldsymbol{\theta}}_0)\right)^{-1}
\mathbf{u}(\widehat{\boldsymbol{\theta}}_0) \sim \chi^2(r),
\end{displaymath}
where $r$ is the number of restrictions imposed by $H_0$.
\end{frame}

\begin{frame}{Three Big Tests}
% \framesubtitle{Asymptotically equivalent under $H_0$}
\begin{itemize}
\item Score Tests: Fit just the restricted model
\item Wald Tests: Fit just the unrestricted model
\item Likelihood Ratio Tests: Fit both
\end{itemize}
\end{frame}

\begin{frame}{Comparing Likelihood Ratio and Wald}
\begin{itemize}
\item Asymptotically equivalent under $H_0$, meaning
$(W_n-G_n) \stackrel{p}{\rightarrow} 0$, where $G_n$ is the likelihood ratio test statistic
% Score too
\item Under $H_1$,
\begin{itemize}
\item Both have approximately the same distribution (non-central chi-square)
\item Both go to infinity as $n \rightarrow \infty$
\item But values are not necessarily close
\end{itemize}
\item The likelihood ratio test tends to get closer to the right Type I error rate for small samples.
\item Wald can be more convenient when testing lots of hypotheses, because you only need to fit the model once.
\item Wald can be more convenient if it's a lot of work to write the restricted likelihood.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by
\href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of
Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}.
Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf13}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf13}}

\end{frame}

\end{document}