% \documentclass[mathserif]{beamer} % Get Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usetheme{Berlin} % Displays sections on top
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols at bottom
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Warsaw} % Displays sections on top
% \usetheme{Frankfurt}  % Displays sections on top: Fairly thin but swallows some material at bottom of crowded slides
\usepackage[english]{babel}
\usepackage{alltt}
\setbeamertemplate{footline}[frame number] 

\mode<presentation>
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Testing Null Hypotheses\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2023}
\date{} % To suppress date

% Cut out a lot of detail in 2014: see 2013 version

\begin{document}

\begin{frame}
  \titlepage
\end{frame}


\begin{frame}{Vector of MLEs is Asymptotically Normal}{That is, Multivariate Normal} 
\begin{equation*}
    \sqrt{n}(\widehat{\boldsymbol{\theta}}_n-\boldsymbol{\theta})
\stackrel{d}{\rightarrow} \mathbf{t} \sim N_p\left(\mathbf{0}, \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}\right)
\end{equation*}

\vspace{5mm}

Approximating the asymptotic covariance matrix $\frac{1}{n}\boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}$ with $\widehat{\mathbf{V}}_n = \mathbf{H}^{-1}$ yields confidence intervals for the parameters, and 
  \begin{itemize}
    \item $Z$-tests
    \item Wald tests.
    \item Indirectly, Likelihood Ratio tests.
  \end{itemize}
\end{frame}


\begin{frame}{$Z$-tests} 
Have $ Z_j = \frac{\widehat{\theta}_j-\theta_j}{se_{\widehat{\theta}_j}}$
approximately standard normal, where $se_{\widehat{\theta}_j}$ is the square root of the $j$th diagonal element of $\widehat{\mathbf{V}}_n$.  \vspace{4mm} \pause

Test $H_0: \theta_j=\theta_0$ using

\begin{displaymath}
    Z = \frac{\widehat{\theta}_j-\theta_0}{se_{\widehat{\theta}_j}} 
\end{displaymath}
\end{frame}

\begin{frame}{And Wald Tests for $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$}
\framesubtitle{Based on $(\mathbf{x}-\boldsymbol{\mu})^\top
                 \boldsymbol{\Sigma}^{-1}(\mathbf{x}-\boldsymbol{\mu}) \sim \chi^2 (p)$} 

{\LARGE
\begin{displaymath}
    W_n = (\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\top 
\left(\mathbf{L} \widehat{\mathbf{V}}_n \mathbf{L}^\top\right)^{-1} 
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})
\end{displaymath}  \pause
} % End size

\vspace{5mm}

$\widehat{\boldsymbol{\theta}}_n \stackrel{\cdot}{\sim} N_p(\boldsymbol{\theta},\mathbf{V}_n)$ 
so if $H_0$ is true,  $\mathbf{L}\widehat{\boldsymbol{\theta}}_n \stackrel{\cdot}{\sim}
N_r(\mathbf{h},\mathbf{L} \mathbf{V}_n \mathbf{L}^\top)$. \pause

Thus $(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h})^\top 
\left(\mathbf{L} \mathbf{V}_n \mathbf{L}^\top\right)^{-1} 
(\mathbf{L}\widehat{\boldsymbol{\theta}}_n-\mathbf{h}) \stackrel{\cdot}{\sim} \chi^2(r)$. \pause

Substitute $\widehat{\mathbf{V}}_n$ for $\mathbf{V}_n$.

\end{frame}

\begin{frame}[fragile]
\frametitle{The Wtest function}
\framesubtitle{\texttt{source("https://www.utstat.toronto.edu/brunner/openSEM/fun/Wtest.txt")}} 
{\footnotesize % or scriptsize
\begin{verbatim}
Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h
# Tn is estimated theta, usually a vector.
# Vn is the estimated asymptotic covariance matrix of Tn.  
# For Wald tests based on numerical MLEs, Tn = theta-hat,
# and Vn is the inverse of the Hessian of the minus log 
# likelihood. 
     {
     Wtest = numeric(3)
     names(Wtest) = c("W","df","p-value")
     r = dim(L)[1]
     W = t(L%*%Tn-h) %*% solve(L%*%Vn%*%t(L)) %*%
          (L%*%Tn-h)
     W = as.numeric(W)
     pval = 1-pchisq(W,r)
     Wtest[1] = W; Wtest[2] = r; Wtest[3] = pval
     Wtest
     } # End function Wtest
\end{verbatim} 
} % End size
\end{frame}

\begin{frame}
\frametitle{Likelihood Ratio Tests}
%\framesubtitle{} 
\begin{displaymath}
\begin{array}{l}
     x_1, \ldots, x_n  \stackrel{i.i.d.}{\sim} F_\theta, \, \theta \in \Theta,  \\
     H_0: \theta \in \Theta_0 \mbox{ vs.\ }
     H_A: \theta \in \Theta \cap \Theta_0^c, \\
\end{array} 
\end{displaymath} 
%\vspace{3mm}

\begin{eqnarray*}
    G^2 &=& -2 \ln \left(   
           \frac{\max_{\theta \in \Theta_0} L(\theta)}
                {\max_{\theta \in \Theta}   L(\theta)}
           \right) \\
        &=& -2\ln\left(\frac{L(\widehat{\theta}_0)}{L(\widehat{\theta})} \right)
\end{eqnarray*} \pause
%\vspace{3mm}

Under $H_0$, $G^2$ has an approximate chi-square 
distribution for large $n$. Degrees of freedom = number of (non-redundant, linear) equalities specified by $H_0$.  Reject when $G^2$ is large.
\end{frame}


\begin{frame}
\frametitle{Example: Testing $H_0:\alpha=\beta$ for data from a Gamma distribution} 
\framesubtitle{$H_0: \theta \in \Theta_0$ vs.~$H_A: \theta \in \Theta \cap \Theta_0^c$} 
\begin{columns}  
\column{0.5\textwidth}
{\small
\begin{itemize}
    \item  $\Theta = \{(\alpha,\beta): \alpha>0, \beta>0 \}$
    \item  $\Theta_0 = \{(\alpha,\beta): \alpha=\beta>0 \}$
\end{itemize}
} % End size 
\pause
\column{0.6\textwidth}
\includegraphics[width=2.5in]{MLEplot}
\end{columns}
\end{frame}

\begin{frame}[fragile]
\frametitle{Functions}
\framesubtitle{$-\ell(\alpha,\beta) = n\alpha\ln\beta + n\ln\Gamma(\alpha) 
                        +  \frac{1}{\beta}\sum_{i=1}^n x_i
                        - (\alpha - 1) \sum_{i=1}^n \ln x_i$} 
{\footnotesize
\begin{columns} % Use Beamer's columns to use more of the margins
\column{1.1\textwidth}
\begin{verbatim}
gmll = function(theta,datta) 
     {
       aa =  theta[1]; bb =  theta[2]
       nn = length(datta); sumd = sum(datta)
       sumlogd = sum(log(datta))
       value = nn*aa*log(bb) + nn*lgamma(aa) + sumd/bb - (aa-1)*sumlogd
       return(value)
     } # End function gmll


# gmll0 is the minus gamma log likelihood with alpha=beta
gmll0 = function(alpha,datta) gmll(c(alpha,alpha),datta)
\end{verbatim}
\end{columns}

} % End size
\end{frame}

\begin{frame}[fragile]      % alltt is usually better, but not this time
\frametitle{}
%\framesubtitle{} 
{\scriptsize
{\color{blue}
\begin{verbatim}
> # Unrestricted MLE
> gsearch = optim(par=c(momalpha,mombeta), fn = gmll, 
+                 method = "L-BFGS-B", lower = c(0,0), hessian=TRUE, datta=d)
> gsearch
\end{verbatim}
} % End color
\begin{verbatim}
$par
[1] 1.805930 3.808674

$value
[1] 142.0316

$counts
function gradient 
       9        9 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

$hessian
         [,1]      [,2]
[1,] 36.69402 13.127928
[2,] 13.12793  6.224773
\end{verbatim} 
} % End size
\end{frame}


\begin{frame}[fragile]
\frametitle{Restricted MLE}
\framesubtitle{Restricted by $H_0: \alpha=\beta$} 
{\scriptsize
% The alltt environment requires  \usepackage{alltt} 
\begin{alltt}
{\color{blue}> gsearch0 = optim(par=mean(thetahat), fn = gmll0, 
+                 method = "L-BFGS-B", lower = 0,  datta=d)
> gsearch0 }
$par
[1] 2.562371

$value
[1] 144.1704

$counts
function gradient 
       6        6 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
\end{alltt}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Likelihood Ratio Test}
\framesubtitle{$G^2 = -2\ln\left(\frac{L(\widehat{\theta}_0)}{L(\widehat{\theta})} \right) = 
2\left( -\ln L(\widehat{\theta}_0) - (-\ln L(\widehat{\theta})) \right)$} \pause
%{\scriptsize
% The alltt environment requires  \usepackage{alltt} 
\begin{alltt}
{\color{blue}> Gsq = 2*(gsearch0$value-gsearch$value); Gsq }
[1] 4.277603
{\color{blue}> pval = 1-pchisq(Gsq,df=1); pval }
[1] 0.03861777
\end{alltt} \pause
\vspace{3mm}

\begin{alltt}
{\color{blue}> thetahat }
alpha-hat  beta-hat 
 1.805930  3.808674 
\end{alltt}

%} % End size
\end{frame}


\begin{frame}[fragile]
\frametitle{Wald test for comparison}
\framesubtitle{Likelihood ratio test yielded $G^2 = 4.278$, $p = 0.0386$} 
{\footnotesize % or scriptsize
\begin{columns} % Use Beamer's columns to use more of the margins
\column{1.1\textwidth}
% The alltt environment requires  \usepackage{alltt} 
\begin{alltt}
{\color{blue}> source("https://www.utstat.toronto.edu/brunner/openSEM/fun/Wtest.txt")
> LL = rbind(c(1,-1))
> Wtest(LL,Tn=thetahat,Vn=Vhat_n) }
         W         df    p-value 
3.25110020 1.00000000 0.07137553 
\end{alltt}
\end{columns}

} % End size
\end{frame}

\begin{frame}{Comparing Likelihood Ratio and Wald Tests in General} 
  \begin{itemize}
        \item Asymptotically equivalent under $H_0$, meaning  
        $(W_n-G^2_n) \stackrel{p}{\rightarrow} 0$ \pause % Score too
        \item Under the alternative hypothesis, 
            \begin{itemize}
                \item Both have the same approximate distribution (non-central chi-square). 
                \item Both go to infinity as $n \rightarrow \infty$.
                \item But values are not necessarily close. 
            \end{itemize} \pause
        \item Likelihood ratio test tends to get closer to the right Type I error probability for small samples. 
        \item Wald can be more convenient when testing lots of hypotheses, because you only need to fit the model once. 
        \item Wald can be more convenient if it's a lot of work to write the restricted likelihood.
   \end{itemize}
\end{frame}


\begin{frame}[fragile]
\frametitle{$Z$-test of $H_0: \beta = 3$}
%\framesubtitle{} 
{\footnotesize % or scriptsize
\begin{columns} % Use Beamer's columns to use more of the margins
\column{1.1\textwidth}
% The alltt environment requires  \usepackage{alltt} 
\begin{alltt}
{\color{blue}> se = sqrt(Vhat_n[2,2])
> # Assigning names because otherwise everything is labelled "beta-hat"
> z = (thetahat[2]-3)/se; names(z) = "Z statistic"; z }
Z statistic 
  0.9996297 
{\color{blue}> pval = 2*(1-pnorm(abs(z))); names(pval) = "p-value"; pval }
  p-value 
0.3174897 
\end{alltt}
\end{columns}
} % End size
\end{frame}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by  \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner},
Department of Statistical Sciences, University of Toronto. It is licensed under a 
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
     {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/brunner/oldclass/431s23} {\small\texttt{http://www.utstat.toronto.edu/brunner/oldclass/431s23}}

\end{frame}

\end{document}