% \documentclass[mathserif]{beamer} % Get Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usetheme{AnnArbor}  % CambridgeUS Blue and yellow, Shows current section title
% \usetheme{Berlin} % Blue: Displays section titles on top
% \usetheme{Frankfurt}  % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Supress navigation symbols at bottom
% \usetheme{Berlin} % Diplays sections on top
% \usetheme{Warsaw} % Diplays sections on top
% \usetheme{Frankfurt}  % Diplays sections on top: Fairly thin but swallows some material at bottom of crowded slides
\usepackage[english]{babel}
\setbeamertemplate{footline}[frame number] 

\mode<presentation>
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Maximum Likelihood Part Two\footnote{See last slide for copyright information.}}
\subtitle{STA 312 Fall 2023}
\date{} % To suppress date


\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
  \titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Background Reading}
%\framesubtitle{} 

{\large
Maximum likelihood handout (see course home page)
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Two more issues}  
%\framesubtitle{} 
\begin{itemize}
    \item Maximum likelihood estimates are often not available in closed form.  
    \item Multiple parameters.  
\end{itemize} \vspace{4mm}


Most real-world problems have both these features.
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{No Formula for the MLE}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{No formula for the MLE}
\framesubtitle{All we need is one example to see the problem.}  
{\small
Let $X_1, \ldots, X_n$ be independent observations from a distribution with density  
\begin{displaymath}
    f(x|\theta) = \left\{ \begin{array}{ll} % ll means left left
   \frac{1}{\Gamma(\theta)} e^{-x} x^{\theta-1}& \mbox{for $x \geq 0$}  \\
   0     & \mbox{for $x < 0$} 
   \end{array}  \right.  % Need that crazy invisible right period!
\end{displaymath}  
Where the parameter $\theta>0$.   This is a gamma with $\alpha=\theta$ and $\lambda=1$. \pause

\begin{eqnarray*}
    \frac{\partial}{\partial\theta}\ell(\theta) 
    & = & \frac{\partial}{\partial\theta} \log\left(\prod_{i=1}^n 
    \frac{1}{\Gamma(\theta)} e^{-x_i} x_i^{\theta-1} \right) \\ \pause
& = & \frac{\partial}{\partial\theta} \log\left( 
    \Gamma(\theta)^{-n} e^{-\sum_{i=1}^n x_i} \left(\prod_{i=1}^n x_i\right)^{\theta-1} \right) \\ \pause
& = & \frac{\partial}{\partial\theta} \left( 
    -n\log\Gamma(\theta) - \sum_{i=1}^n x_i + (\theta-1)\sum_{i=1}^n \log x_i \right) \\ \pause
& = & -\frac{n\Gamma^\prime(\theta)}{\Gamma(\theta)}  - 0  + \sum_{i=1}^n \log x_i
\pause \stackrel{set}{=} 0
\end{eqnarray*} 
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Numerical MLE}
\framesubtitle{By computer}   \pause
\begin{itemize}
    \item The log likelihood defines a surface sitting over the parameter space.  
    \item It could have hills and valleys and mountains.  
    \item The value of the log likelihood is easy to compute for any given set of parameter values.  
    \item This tells you the height of the surface at that point.  
    \item Take a step uphill (blindfolded).  
    \item Are you at the top? Compute the slopes of some secant lines.  
    \item Take another step uphill.  
    \item How big a step?   Good question.  
    \item Most numerical routines \emph{minimize} a function of several variables.  
    \item So minimize the minus log likelihood.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Multiple Parameters}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Multiple parameters}
\framesubtitle{Most real-world problems have a \emph{vector} of parameters.}\pause

  \begin{itemize}
    \item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$.   \linebreak The parameters $\mu$ and $\sigma^2$ are unknown.
    
 

    \item For $i=1, \ldots, n$, let $y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i$, where  
    \begin{itemize}
        \item[] $\beta_0, \ldots, \beta_{p-1}$ are unknown constants.
        \item[] $x_{i,j}$ are known constants.
        \item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables.
        \item[] $\sigma^2$ is an unknown constant.
        \item[] $y_1, \ldots, y_n$ are observable random variables.   
    \end{itemize} 
The parameters $\beta_0, \ldots, \beta_{p-1}, \sigma^2$ are unknown.
  \end{itemize}
\end{frame}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Multi-parameter MLE}
\framesubtitle{You know most of this.}   
\begin{itemize}
    \item Suppose there are $k$ parameters.  
    \item The plane tangent to the log likelihood should be horizontal at the MLE. \pause
    \item Partially differentiate the log likelihood (or minus log likelihood) with respect to each of the parameters.  
    \item Set the partial derivatives to zero, obtaining $k$ equations in $k$ unknowns.  
    \item Solve for the parameters, if you can.  
    \item Is it really a maximum?  
    \item There is a multivariate second derivative test.
\end{itemize}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{The Hessian matrix}  
%\framesubtitle{} 
\begin{displaymath}
    \mathbf{H} = \left[\frac{\partial^2 (-\ell)} {\partial\theta_i\partial\theta_j}\right] 
\end{displaymath}  
\begin{itemize}
    \item If there are $k$ parameters, the Hessian is a $k \times k$ matrix whose $(i,j)$ element is $\frac{\partial^2} {\partial\theta_i\partial\theta_j} (-\ell(\boldsymbol{\theta}))$.  
    \item If the second derivatives are continuous, $\mathbf{H}$ is symmetric.  \pause
    \item If the gradient is zero at a point and $|\mathbf{H}| \neq 0$, then  
        \begin{itemize}
            \item If all eigenvalues are positive at the point, local minimum.  
            \item If all eigenvalues are negative at the point, local maximum.   
            \item If there are both positive and negative eigenvalues at the point, saddle point.
        \end{itemize}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Large-sample Theory}
\framesubtitle{Earlier results generalize to the multivariate case}   
The vector of MLEs is asymptotically normal.   That is, multivariate normal.  
\begin{center}
\includegraphics[width=2.7in]{bvn}
\end{center} 
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{The Multivariate Normal}  
% \framesubtitle{} 

The multivariate normal distribution has many nice features.   For us, the important ones are:  
\begin{itemize}
    \item It is characterized by a $k \times 1$ vector of expected values   and a $k \times k$ variance-covariance matrix.  
    \item Write $\mathbf{y} \sim N_k(\boldsymbol{\mu},\boldsymbol{\Sigma})$.  
    \item $\boldsymbol{\Sigma} = [\sigma_{i,j}]$ is a symmetric matrix with variances on the main diagonal and covariances on the off-diagonals.  
    \item All the marginals are normal.   $y_j \sim N(\mu_j,\sigma_{j,j})$. 
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{The vector of MLEs is asymptotically multivariate normal. (Thank you, Mr. Wald)}  
{\Large
\begin{displaymath}
    \widehat{\boldsymbol{\theta}}_n \stackrel{.}{\sim} N_k\left(\boldsymbol{\theta}, \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right )
\end{displaymath}  
} % End size

  \begin{itemize}
    \item Compare $\widehat{\theta}_n \stackrel{.}{\sim} N(\theta,\frac{1}{n \, I(\theta)})$. \pause
    \item $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$ is the Fisher information matrix.   
    \item Specifically, the Fisher information in one observation.   
    \item A $k \times k$ matrix
\begin{displaymath}
    \boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) = \left[-E\left(\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})\right)\right]  
\end{displaymath}
    \item The Fisher Information in the whole sample is 
    $n\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})$.
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{$\widehat{\boldsymbol{\theta}}_n$ is asymptotically $N_k\left(\boldsymbol{\theta}, \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right )$} \pause
%\framesubtitle{} 
\begin{itemize}
    \item Asymptotic covariance matrix of $\widehat{\boldsymbol{\theta}}_n $ is $\frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}$, and of course we don't know $\boldsymbol{\theta}$.  
    \item For tests and confidence intervals, we need a good \emph{approximate} asymptotic covariance matrix,  
    \item Based on a good estimate of the Fisher information matrix. \pause
    \item $\boldsymbol{\mathcal{I}}(\widehat{\boldsymbol{\theta}}_n)$ would do.
    \item But it's inconvenient\pause: Need to compute partial derivatives and expected values in 
\begin{displaymath}
    \boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) = \left[E[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})]\right]
\end{displaymath} \pause
and then substitute $\widehat{\boldsymbol{\theta}}_n$ for $\boldsymbol{\theta}$. 
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{The observed Fisher information} \pause
% \framesubtitle{} 
Approximate
\begin{displaymath}
    \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}
  = \left[ n \, E[-\frac{\partial^2}{\partial\theta_i\partial\theta_j}
\log f(Y;\boldsymbol{\theta})]\right]^{-1}
\end{displaymath}  \pause
with 
\begin{displaymath}
    \widehat{\mathbf{V}}_n = \left(
   \left[-\frac{\partial^2} {\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
    \right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n}
   \right)^{-1}
\end{displaymath} \pause

\vspace{4mm}

As in the univariate case, substitute the MLE for the parameter instead of taking the expected value.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Numerical MLEs}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Compare the Hessian and (Estimated) Asymptotic Covariance Matrix} \pause
  \begin{itemize}
    \item $\widehat{\mathbf{V}}_n = \left(
   \left[-\frac{\partial^2} {\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
    \right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n}
   \right)^{-1}$ \pause
    \item Hessian at MLE is $\mathbf{H} = \left[-\frac{\partial^2} {\partial\theta_i\partial\theta_j} \ell(\boldsymbol{\theta},\mathbf{Y})
    \right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}_n}$ \pause
    \item So to estimate the asymptotic covariance matrix of $\boldsymbol{\theta}$, just invert the Hessian. \pause
    \item The Hessian is usually available as a by-product of a numerical search for the MLE. 
    \item Because it's needed for the second derivative test.
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Connection to Numerical Optimization}  
  \begin{itemize}
    \item Suppose we are minimizing the minus log likelihood by a direct search.  
    \item We have reached a point where the gradient is close to zero. Is this point a minimum?  
    \item The Hessian is a matrix of mixed partial derivatives. If all its  eigenvalues are positive at a point, the function is concave up there.  
    \item Partial derivatives are usually approximated by the slopes of secant lines -- no need to calculate them symbolically.  
    \item It's \emph{the} multivariable second derivative test.
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{So to find the estimated asymptotic covariance matrix} \pause
  \begin{itemize}
    \item Minimize the minus log likelihood numerically.  
    \item The Hessian at the place where the search stops is usually available.  
    \item Invert it to get $\widehat{\mathbf{V}}_n$. \pause
    \item This is so handy that sometimes we do it even when a closed-form expression for the MLE is available.
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Estimated Asymptotic Covariance Matrix $\widehat{\mathbf{V}}_n$ is Useful} \pause
  \begin{itemize}
    \item Asymptotic standard error of $\widehat{\theta}_j$ is the square root of the $j$th diagonal element.  
    \item Denote the asymptotic standard error of $\widehat{\theta}_j$ by
          $S_{\widehat{\theta}_j}$.  \pause
    \item Thus 
\begin{displaymath}
    Z_j = \frac{\widehat{\theta}_j-\theta_j}{S_{\widehat{\theta}_j}} 
\end{displaymath} 
is approximately standard normal.
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Hypothesis Tests}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Confidence Intervals and $Z$-tests}  
Have $ Z_j = \frac{\widehat{\theta}_j-\theta_j}{S_{\widehat{\theta}_j}}$
approximately standard normal, yielding  
        \begin{itemize}
            \item Confidence intervals: 
            $\widehat{\theta}_j \pm S_{\widehat{\theta}_j} z_{\alpha/2}$  
            \item Test $H_0: \theta_j=\theta_0$ using   
\begin{displaymath}
    Z = \frac{\widehat{\theta}_j-\theta_0}{S_{\widehat{\theta}_j}} 
\end{displaymath}
        \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Some null hypotheses involve multiple parameters}
\framesubtitle{For example,} \pause
%{\LARGE
\begin{eqnarray*}
    H_0: & & \beta_1 = \beta_2 = \beta_3 = 0 \\ \pause
    &&\\
    H_0: & & \frac{1}{3} (\theta_1+\theta_2+\theta_3) = 
             \frac{1}{3} (\theta_4+\theta_5+\theta_6) = 
             \frac{1}{2} (\theta_7+\theta_8)
\end{eqnarray*} 
%} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Two hypothesis tests for multi-parameter problems}
\framesubtitle{They also apply to single-parameter models}  
\begin{itemize}
    \item Wald tests and likelihood ratio tests. \pause
    \item They both apply to linear null hypotheses of the form $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$ 
    \item Where $\mathbf{L}$ is an $r$ by $k$ matrix with linearly independent rows. 
    \item This kind of null hypothesis is familiar from linear regression (STA302).
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Example}
\framesubtitle{Linear regression with 4 explanatory variables}  

\begin{itemize}
    \item $\boldsymbol{\theta} = (\beta_0,\beta_1,\beta_2,\beta_3,\beta_4,\sigma^2)$ \pause
    \item $H_0: \beta_1=\beta_2=\beta_3=0$ 
    \item $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{0}$ \pause
\end{itemize}
\begin{displaymath}
     \left( \begin{array}{r r r r r r}
       0 & 1 & 0 & 0 & 0 & 0 \\
       0 & 0 & 1 & 0 & 0 & 0 \\
       0 & 0 & 0 & 1 & 0 & 0 \\
    \end{array} \right) 
    \left( \begin{array}{r}
    \beta_0 \\ \beta_1 \\ \beta_2 \\ \beta_3 \\ \beta_4 \\ \sigma^2 
    \end{array} \right) = 
    \left( \begin{array}{r}
    0 \\  0 \\ 0
    \end{array} \right) 
\end{displaymath} 
\end{frame}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Another example of $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$ }
\framesubtitle{A collection of linear constraints on the parameter $\boldsymbol{\theta}$} 
Example with $k=7$ parameters: $H_0$ has three parts 
\begin{itemize}
    \item $\theta_1=\theta_2$ and 
    \item $\theta_6=\theta_7$ and 
    \item $\frac{1}{3}\left(\theta_1+\theta_2+\theta_3\right) = 
\frac{1}{3}\left(\theta_4+\theta_5+\theta_6\right)$
\end{itemize}\pause

\begin{displaymath}
     \left( \begin{array}{r r r r r r r}
       1 & -1 & ~0 & ~0 & ~0 & ~0 & ~0 \\
       0 &  0 &  0 &  0 &  0 &  1 & -1 \\
       1 &  1 &  1 & -1 & -1 & -1 &  0 \\
    \end{array} \right) 
    \left( \begin{array}{r}
    \theta_1 \\  \theta_2 \\  \theta_3 \\  \theta_4 \\  
    \theta_5 \\  \theta_6 \\  \theta_7 
    \end{array} \right) = 
    \left( \begin{array}{r}
    0 \\  0 \\ 0
    \end{array} \right) 
\end{displaymath} \pause
Notice the number of rows in $\mathbf{L}$ is the number of = signs in $H_0$.
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}{Wald Test for $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$}
\framesubtitle{Based on $(\mathbf{x}-\boldsymbol{\mu})^\top
                 \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu}) \sim \chi^2 (p)$} 

{\LARGE
\begin{displaymath}
    W_n = (\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\top 
\left(\mathbf{L} \widehat{\mathbf{V}}_n \mathbf{L}^\top\right)^{-1} 
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})
\end{displaymath}  \pause
} % End size

\vspace{5mm}

\begin{itemize}
    \item Looks like the formula for the general linear $F$-test in regression. 
    \item Asymptotically chi-squared under $H_0$.
    \item Reject for large values of $W_n$. 
    \item $df = $ number of rows in $\mathbf{L}$. 
    \item Number of linear constraints specified by $H_0$.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
\frametitle{The \texttt{Wtest} Function}
\framesubtitle{$ W_n = (\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\top 
\left(\mathbf{L} \widehat{\mathbf{V}}_n \mathbf{L}^\top\right)^{-1} 
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})$} 
% {\footnotesize % or scriptsize
\begin{verbatim}
Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h
# For Wald tests based on numerical MLEs, Tn = theta-hat,
# and Vn is the inverse of the Hessian. 
     {
     value = numeric(3)
     names(value) = c("W","df","p-value")
     r = dim(L)[1]
     W = t(L%*%Tn-h) %*% solve(L%*%Vn%*%t(L)) %*%
          (L%*%Tn-h)
     W = as.numeric(W)
     pval = 1-pchisq(W,r)
     value[1] = W; value[2] = r; value[3] = pval
     return(value)
     }
\end{verbatim} 
% } % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Likelihood ratio tests} 
%\framesubtitle{} 
\begin{itemize}
    \item $X_1, \ldots, X_n  \stackrel{i.i.d.}{\sim} F_\theta, \, \theta \in \Theta$ 
    \item $H_0: \theta \in \Theta_0$ v.s. $H_1: \theta \in \Theta \cap \Theta_0^c$  \pause
\end{itemize}

% herehere

\begin{eqnarray*}
    G^2 &=& -2 \log \left(   
           \frac{\max_{\theta \in \Theta_0} L(\theta)}
                {\max_{\theta \in \Theta}   L(\theta)}
           \right) \pause = 
            -2\log \frac{L(\widehat{\theta}_0)}{L(\widehat{\theta})} \\ \pause
        &=& 2 \left(\ell(\widehat{\theta}) - \ell(\widehat{\theta}_0) \right)
\end{eqnarray*}   \pause

\begin{itemize}
    \item Under $H_0$, $G^2$ has an approximate chi-squared 
distribution for large $n$. 
    \item Degrees of freedom = number of (non-redundant, linear) equalities specified by $H_0$.   
    \item Reject when $G^2$ is large.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Example: Multinomial with 3 categories} \pause
%\framesubtitle{} 
\begin{itemize}
    \item Parameter space is 2-dimensional. \pause
    \item Unrestricted MLE is $(p_1, p_2)$: Sample proportions. \pause
    \item $H_0: \theta_1 = 2\theta_2$
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Parameter space for $H_0: \theta_1 = 2\theta_2$ }
\framesubtitle{Red dot is unrestricted MLE, Black square is restricted MLE} 
\begin{center}
\includegraphics[width=3.1in]{ParameterSpace}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\begin{frame}{Comparing Likelihood Ratio and Wald tests} \pause
  \begin{itemize}
        \item Asymptotically equivalent under $H_0$, meaning 
        $(W_n-G^2_n) \stackrel{p}{\rightarrow} 0$ \pause % Score too
        \item Under $H_1$, 
            \begin{itemize}
                \item Both have the same approximate distribution (non-central chi-square). 
                \item Both go to infinity as $n \rightarrow \infty$. 
                \item But values are not necessarily close for the same data set. \pause
            \end{itemize}
        \item Likelihood ratio test tends to get closer to the right Type I error probability for small samples.
        \item Wald can be more convenient when testing lots of hypotheses, because you only need to fit the model once. 
        \item Wald can be more convenient if it's a lot of work to write the restricted likelihood.
   \end{itemize}
\end{frame}

\section{Nonlinear functions}

\begin{frame}
\frametitle{Non-linear functions of the parameter vector} \pause
%\framesubtitle{} 
\begin{itemize}
    \item Most tests are about linear combinations of the model parameters.
    \item Sometimes we want tests and confidence intervals for \emph{non-linear} functions of $\boldsymbol{\theta} \in \mathbb{R}^k$. \pause
    \item Like $\frac{\alpha}{\lambda^2}$ (variance of a gamma). \pause
    \item Fortunately, smooth functions of an asymptotically multivariate normal random vector are asymptotically normal. 
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Theorem based on the delta method of Cram\'{e}r}
\framesubtitle{The delta method is more general than this.}  \pause
Let $\boldsymbol{\theta} \in \mathbb{R}^k$.   Under the conditions for which $\widehat{\boldsymbol{\theta}}_n$ is asymptotically $N_k\left(\boldsymbol{\theta}, \mathbf{V}_n\right)$ with $\mathbf{V}_n = \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}$,   let the function $g: \mathbb{R}^k \rightarrow \mathbb{R}$ \pause be such that    the elements of  
\.{g}$(\boldsymbol{\theta}) = \left( \frac{\partial g}{\partial\theta_1}, \ldots ,  \frac{\partial g}{\partial\theta_k} \right)$  \pause are continuous in a neighbourhood of the true parameter vector $\boldsymbol{\theta}$.  \pause Then
\begin{displaymath}
    g(\widehat{\boldsymbol{\theta}}) \stackrel{.}{\sim} 
    N\left( g(\boldsymbol{\theta}), \mbox{\.{g}}(\boldsymbol{\theta}) \mathbf{V}_n \,
     \mbox{\.{g}}(\boldsymbol{\theta})^\top \right).
\end{displaymath} \pause
Note that the asymptotic variance $\mbox{\.{g}}(\boldsymbol{\theta}) \mathbf{V}_n \,
     \mbox{\.{g}}(\boldsymbol{\theta})^\top$ is a matrix product:  $(1 \times k)$ times $(k \times k)$ times $(k \times 1)$. \pause
\vspace{4mm}

The standard error of $g(\widehat{\boldsymbol{\theta}})$ is 
$\sqrt{\mbox{\.{g}}(\widehat{\boldsymbol{\theta}}) \widehat{\mathbf{V}}_n  \, \mbox{\.{g}}(\widehat{\boldsymbol{\theta}})^\top}$.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Example of \.{g}$(\boldsymbol{\theta}) = \left( \frac{\partial g}{\partial\theta_1}, \ldots ,  \frac{\partial g}{\partial\theta_k} \right)$} \pause
%\framesubtitle{} 
\begin{itemize}
    \item Variance of gamma is $g(\alpha,\lambda) = \frac{\alpha}{\lambda^2}$.
    \item $\theta_1=\alpha$, $\theta_2= \lambda$, $k = 2$, 
    \item So \.{g}$(\boldsymbol{\theta})$ is $1 \times 2$.  \pause
\end{itemize}
\begin{eqnarray*}
    \mbox{\.{g}} & = & \left( \frac{\partial g}{\partial\alpha}, \frac{\partial g}{\partial\lambda} \right) \\ \pause
     & = & \left( \frac{1}{\lambda^2}, \, \alpha (-2)\lambda^{-3}  \right) \\ \pause
     & = & \left( \frac{1}{\lambda^2}, \, \frac{-2\alpha}{\lambda^3}  \right) \pause
\end{eqnarray*} 
Then, $\mbox{\.{g}}(\widehat{\boldsymbol{\theta}}) \widehat{\mathbf{V}}_n  \, \mbox{\.{g}}(\widehat{\boldsymbol{\theta}})^\top$ is easy if you have $\widehat{\mathbf{V}}_n$.

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by  \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner},
Department of Statistics, University of Toronto. It is licensed under a 
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
     {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/brunner/oldclass/312f23} {\footnotesize \texttt{http://www.utstat.toronto.edu/brunner/oldclass/312f23}}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\end{document}

# Bivariate Normal Plot
y <- numeric(900)
dim(y) <- c(30,30)
i <- 0
for (x1 in seq(-3,3,length=30) )
{
   i <- i+1
   j <- 0
   for (x2 in seq(-3,3,length=30) )
   {
      j <- j+1
      y[i,j] <- 1/(2*pi) * exp( -.5*( x1^2 + x2^2) )
   }
}
y <- y/max(y)
persp(y)

# Plotting jobs parameter space with R
theta1 = seq(from=0,to=1,by=0.05); theta2=theta1
plot(theta1,theta2,pch=' ', frame.plot=F,
     xlab=expression(theta[1]), ylab=expression(theta[2])) 
# Draw boundaries of parameter space
xloc1 = c(0,0); yloc1 = c(0,1); lines(xloc1,yloc1,lty=1)
xloc2 = c(0,1); yloc2 = c(0,0); lines(xloc2,yloc2,lty=1)
xloc3 = c(0,1); yloc3 = c(1,0); lines(xloc3,yloc3,lty=1)
# Restricted parameter space is a line segment
xloc4 = c(0,2/3); yloc4 = c(0,1/3); lines(xloc4,yloc4,lty=2)
points(0.53,0.37, pch=19, col = "red1") # Unrestricted MLE
points(0.60,0.30, pch=23, bg="black") # Restricted MLE


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Specializing the delta method to the case of a single parameter}  \pause
\framesubtitle{Yielding the univariate delta method}
Let $\boldsymbol{\theta} \in \mathbb{R}$. \pause Under the conditions for which $\widehat{\theta}_n$ is asymptotically $N\left(\theta, v_n\right)$ with $v_n = \frac{1}{n} \, I(\theta)$, \pause let the function $g(x)$ have a continuous derivative in a neighbourhood of the true parameter $\theta$. \pause Then
\begin{displaymath}
    g(\widehat{\theta}) \stackrel{.}{\sim} 
    N\left( g(\theta), g^\prime(\theta)^2 \, v_n \right).
\end{displaymath} \pause

\vspace{4mm}

The standard error of $g(\widehat{\theta})$ is 
$\sqrt{ g^\prime(\widehat{\theta})^2 \, \widehat{v}_n}$\pause, or
$\left|g^\prime(\widehat{\theta}) \right|\sqrt{\widehat{v}_n}$
\end{frame}