\documentclass[serif]{beamer} % Get Computer Modern math font.
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin, but still swallows some material at the bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{SAS \texttt{proc calis}: The basics\footnote{See last slide for copyright information.}}
\subtitle{STA431 Winter/Spring 2013}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{The program}

\begin{frame}
\frametitle{What it is and what it does}
%\framesubtitle{}
\begin{itemize}
\item SAS \texttt{proc calis} is model-fitting software.
\item It fits classical structural equation models to data, using numerical maximum likelihood (or optionally, other methods).
\item Most of the output is about the details of the numerical search and how well the model fits.
\item This is a narrow focus compared to most other SAS procedures.
\item Still, SAS tells you more than you need or want to know --- as usual.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Three programs}
%\framesubtitle{}
\begin{itemize}
\item \texttt{proc calis} incorporates three programs that originated outside of SAS.
\item They all use different, unrelated syntax for specifying the model.
\item We will use the \texttt{lineqs}\footnote{Bentler and Weeks, \emph{British Journal of Mathematical and Statistical Psychology}, 1980.} syntax, which is the most convenient.
\item[]
\item First, read and label the data as usual in a SAS \emph{data step}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Specifying the model}
\framesubtitle{Using \texttt{lineqs} syntax}
Input includes:
\begin{itemize}
\item Names of the observable variables.
\item Model equations, pretty much as you would write them by hand.
\begin{itemize}
\item Including the regression coefficients and the error terms -- you name them.
\item No intercepts: the model is given in centered form, and SAS bases everything on the sample covariance matrix.
\item Naming rules: names of latent variables (including error terms) must begin with the letter F, D or E.
\end{itemize}
\item Names must also be given to the variances and covariances of the explanatory variables and error terms. Anything unspecified is assumed to be zero.
\item In the end, you give names to \emph{all} the non-zero parameters in your model. A small sketch follows on the next slide.
\end{itemize}
\end{frame}
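
\begin{frame}[fragile]
\frametitle{A sketch of a \texttt{lineqs} specification}
\framesubtitle{Purely illustrative: the data set, file name, variables and parameter names below are all made up}
A hypothetical one-factor model, just to show the shape of the syntax:
{\footnotesize
\begin{verbatim}
data toy;                 /* Hypothetical data step          */
   infile 'toy.data';     /* Made-up file name               */
   input w1 w2 y;
proc calis data=toy;
   lineqs                 /* Model equations                 */
      w1 = F1 + e1,       /* No coefficient: fixed at one    */
      w2 = F1 + e2,       /* Latent names begin with F,      */
      y  = beta F1 + e3;  /*   error names with E or D       */
   std                    /* Name the non-zero variances     */
      F1 = phi,
      e1 = v1, e2 = v2, e3 = v3;
run;
\end{verbatim}
} % End size
Covariances would be named in a \texttt{cov} statement; anything not named is assumed to be zero.
\end{frame}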

\begin{frame}
\frametitle{What happened to the intercepts?}
%\framesubtitle{}
{\footnotesize
\begin{displaymath}
L(\boldsymbol{\mu,\Sigma}) = |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2}
   \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1})
   + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}
   (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\}
\end{displaymath}
} % End size
\begin{itemize}
\item Remember, $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ are both functions of $\boldsymbol{\theta}$.
\item For regression without measurement error, expected values and intercepts are identifiable, but when there are latent variables, that's rare.
\item Re-parameterize, absorbing expected values and intercepts into $\boldsymbol{\mu}$.
\item Estimate $\boldsymbol{\mu}$ with $\overline{\mathbf{x}}$, and it's gone.
\item This is just a technical trick to allow the likelihood to have a unique maximum.
\item But it does no harm, because \emph{relationships} between variables are represented by the covariances.
\end{itemize}
\end{frame}

\section{Maximum likelihood}

\begin{frame}
\frametitle{Maximum likelihood}
%\framesubtitle{}
\begin{eqnarray*}
L(\boldsymbol{\Sigma}) & = & |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2}
   \exp -\frac{n}{2}\left\{ tr\left(\widehat{\boldsymbol{\Sigma}} \boldsymbol{\Sigma}^{-1}\right) \right\} \\
 && \\
L_2(\boldsymbol{\theta}) & = & |\boldsymbol{\Sigma}(\boldsymbol{\theta})|^{-n/2} (2\pi)^{-np/2}
   \exp -\frac{n}{2}\left\{ tr\left(\widehat{\boldsymbol{\Sigma}} \boldsymbol{\Sigma}(\boldsymbol{\theta})^{-1}\right) \right\}
\end{eqnarray*}
\begin{itemize}
\item Can maximize $L(\boldsymbol{\Sigma})$ over all $\boldsymbol{\Sigma} \in \mathcal{M}$, or maximize $L_2(\boldsymbol{\theta})$ over all $\boldsymbol{\theta} \in \Theta$.
\item If the function connecting $\boldsymbol{\Sigma}$ and $\boldsymbol{\theta}$ is one-to-one, so that the number of $\boldsymbol{\theta}$ values equals the number of unique $\boldsymbol{\Sigma}$ values, call the parameter $\boldsymbol{\theta}$ \emph{just identifiable}.
\item In this case it's the same problem, and
\item The invariance principle can be used to go back and forth between $\widehat{\boldsymbol{\Sigma}}$ and $\widehat{\boldsymbol{\theta}}$.
\item Otherwise \ldots
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Maximize $L_2(\boldsymbol{\theta})$ over all $\boldsymbol{\theta} \in \Theta$}
%\framesubtitle{}
\begin{displaymath}
L_2(\boldsymbol{\theta}) = |\boldsymbol{\Sigma}(\boldsymbol{\theta})|^{-n/2} (2\pi)^{-np/2}
   \exp -\frac{n}{2}\left\{ tr\left(\widehat{\boldsymbol{\Sigma}} \boldsymbol{\Sigma}(\boldsymbol{\theta})^{-1}\right) \right\}
\end{displaymath}
\begin{itemize}
\item Actually, maximize the \emph{log} likelihood.
\item Well, actually, minimize the minus 2 log likelihood.
\item Well, actually, minimize the minus 2 log likelihood plus a carefully chosen constant.
\item[]
\item The constant is based on the likelihood ratio test for goodness of model fit.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Likelihood ratio tests}
\framesubtitle{In general}
Setup:
\begin{displaymath}
\begin{array}{l}
Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim} P_\theta, \, \theta \in \Theta, \\
H_0: \theta \in \Theta_0 \subset \Theta \mbox{ vs. } H_1: \theta \in \Theta_1 = \Theta \cap \Theta_0^c \\
\end{array}
\end{displaymath}
\vspace{5mm}
Test statistic:
\begin{displaymath}
G^2 = -2 \ln \left( \frac{\max_{\theta \in \Theta_0} L(\theta)}
                         {\max_{\theta \in \Theta} L(\theta)} \right)
\end{displaymath}
\end{frame}
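
\begin{frame}
\frametitle{A toy number, to fix ideas}
\framesubtitle{The likelihood ratio below is invented purely for illustration; it comes from no real data}
Suppose the restricted maximum of the likelihood were $5\%$ of the unrestricted maximum. Then
\begin{displaymath}
G^2 = -2 \ln (0.05) \approx 5.99 .
\end{displaymath}
\begin{itemize}
\item The ratio $0.05$ is fairly close to zero.
\item Its log, $\ln(0.05) \approx -3.0$, is a big negative number.
\item $-2$ times the log is a big positive number, $5.99$.
\item If $H_0$ specified a single equality, this would exceed the $\alpha = 0.05$ critical value of chi-squared with one degree of freedom, which is $3.84$, so $H_0$ would be rejected.
\end{itemize}
\end{frame}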

\begin{frame}
\frametitle{What to do}
\framesubtitle{And how to think about it}
\begin{displaymath}
G^2 = -2 \ln \left( \frac{\max_{\theta \in \Theta_0} L(\theta)}
                         {\max_{\theta \in \Theta} L(\theta)} \right)
\end{displaymath}
\begin{itemize}
\item Maximize the likelihood over the whole parameter space. You already did this to calculate the MLE. Evaluate the likelihood there. That's the denominator.
\item Maximize the likelihood over just the parameter values where $H_0$ is true -- that is, over $\Theta_0$. This yields a restricted MLE. Evaluate the likelihood there. That's the numerator.
\item The numerator cannot be larger, because $\Theta_0 \subset \Theta$.
\item If the numerator is a \emph{lot} less than the denominator, the null hypothesis is unbelievable:
\begin{itemize}
\item The ratio is close to zero.
\item The log of the ratio is a big negative number.
\item $-2$ times the log is a big positive number.
\item Reject $H_0$ when $G^2$ is large enough.
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Distribution of $G^2$ when $H_0$ is true}
Given some technical conditions,
\begin{itemize}
\item $G^2$ has an approximate chi-squared distribution under $H_0$ for large $n$.
\item The degrees of freedom equal the number of (non-redundant) equalities specified by $H_0$.
\item Reject $H_0$ when $G^2$ is larger than the chi-squared critical value.
\end{itemize}
\end{frame}

\section{Goodness of fit test}

\begin{frame}
\frametitle{Goodness of fit test for a covariance structure model}
\framesubtitle{Multivariate normal data}
Call it a ``covariance structure'' model because $\boldsymbol{\Sigma} = \boldsymbol{\Sigma}(\boldsymbol{\theta})$.
\begin{itemize}
\item Compare the fit of the model to the fit of the \emph{best possible} model.
\item The best possible model is the unrestricted multivariate normal:
\begin{itemize}
\item Estimate $\boldsymbol{\mu}$ with $\overline{\mathbf{x}}$.
\item Estimate $\boldsymbol{\Sigma}$ with $\widehat{\boldsymbol{\Sigma}}$.
\end{itemize}
\item The covariance structure model is re-parameterized to get rid of intercepts, so again $\boldsymbol{\mu}$ is estimated with $\overline{\mathbf{x}}$.
\item Compare
\end{itemize}
\begin{displaymath}
\ln L\left(\widehat{\boldsymbol{\Sigma}}\right) \mbox{ to }
\ln L\left(\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})\right)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Likelihood ratio test}
\framesubtitle{For goodness of model fit}
Difference in fit (times two):
\begin{eqnarray*}
G^2 & = & 2 \left( \ln L\left(\widehat{\boldsymbol{\Sigma}}\right)
        - \ln L\left(\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})\right) \right) \\
    & = & -2 \ln \left( \frac{L\left(\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})\right)}
                             {L\left(\widehat{\boldsymbol{\Sigma}}\right)} \right)
\end{eqnarray*}
It looks like a likelihood ratio test statistic.
\end{frame}

\begin{frame}
\frametitle{More details}
%\framesubtitle{}
\begin{displaymath}
G^2 = -2 \ln \left( \frac{L\left(\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})\right)}
                         {L\left(\widehat{\boldsymbol{\Sigma}}\right)} \right)
\end{displaymath}
If the covariance structure model is correct and
\begin{itemize}
\item The parameter vector is identifiable, and
\item There are more unique variances and covariances in $\boldsymbol{\Sigma}$ than there are model parameters in $\boldsymbol{\theta}$, and
\item Some other technical conditions hold,
\end{itemize}
then for large samples $G^2$ has an approximate chi-squared distribution, with degrees of freedom equal to the number of unique variances and covariances \emph{minus} the number of model parameters.
\end{frame}
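
\begin{frame}
\frametitle{Counting degrees of freedom: a made-up example}
\framesubtitle{The numbers are invented purely to illustrate the rule on the last slide}
Suppose there were $p = 4$ observed variables and a model with $7$ parameters in $\boldsymbol{\theta}$:
\begin{itemize}
\item $\boldsymbol{\Sigma}$ has $p(p+1)/2 = 4 \cdot 5/2 = 10$ unique variances and covariances.
\item Degrees of freedom $= 10 - 7 = 3$.
\item Compare $G^2$ to the critical value of chi-squared with $3$ degrees of freedom.
\end{itemize}
\end{frame}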

\begin{frame}
\frametitle{Simplify $G^2 = 2 \left( \ln L\left(\widehat{\boldsymbol{\Sigma}}\right) - \ln L\left(\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})\right) \right)$}
%\framesubtitle{}
Recalling
$L(\boldsymbol{\Sigma}) = |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2}
\exp -\frac{n}{2}\left\{ tr\left(\widehat{\boldsymbol{\Sigma}} \boldsymbol{\Sigma}^{-1}\right) \right\}$,
\begin{eqnarray*}
G^2 & = & -2 \ln L(\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})) - [-2 \ln L(\widehat{\boldsymbol{\Sigma}})] \\ \\
    & = & n\left( tr(\widehat{\boldsymbol{\Sigma}} \boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})^{-1})
          + \ln |\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})| - \ln |\widehat{\boldsymbol{\Sigma}}| - p \right)
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{A cute way to maximize the likelihood over $\boldsymbol{\theta} \in \Theta$}
%\framesubtitle{}
\begin{itemize}
\item Minimize $G^2(\boldsymbol{\theta})$: just the minus 2 log likelihood plus a constant.
\begin{eqnarray*}
G^2(\boldsymbol{\theta}) & = & -2 \ln L(\boldsymbol{\Sigma}(\boldsymbol{\theta})) - [-2 \ln L(\widehat{\boldsymbol{\Sigma}})] \\ \\
    & = & n\left( tr(\widehat{\boldsymbol{\Sigma}} \boldsymbol{\Sigma}(\boldsymbol{\theta})^{-1})
          + \ln |\boldsymbol{\Sigma}(\boldsymbol{\theta})| - \ln |\widehat{\boldsymbol{\Sigma}}| - p \right)
\end{eqnarray*}
\item Actually, minimize the ``Objective Function''
\begin{displaymath}
tr(\widehat{\boldsymbol{\Sigma}} \boldsymbol{\Sigma}(\boldsymbol{\theta})^{-1})
+ \ln |\boldsymbol{\Sigma}(\boldsymbol{\theta})| - \ln |\widehat{\boldsymbol{\Sigma}}| - p
\end{displaymath}
\item Multiply by $n$ (or $n-1$) to get the $G^2$ statistic.
\item This is what SAS \texttt{proc calis} does.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Saturated models}
\framesubtitle{All the degrees of freedom in the data are ``soaked up'' by the model.}
\begin{itemize}
\item If there are the same number of moment structure equations and unknown parameters, and the parameter is identifiable, there is a one-to-one function between $\widehat{\boldsymbol{\Sigma}}$ and $\widehat{\boldsymbol{\theta}}$.
\item In this case the parameter is called \emph{just identifiable}.
\item $L\left(\widehat{\boldsymbol{\Sigma}}\right) = L\left(\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})\right)$
\item $G^2=0$, $df=0$, and the standard test for goodness of fit does not apply.
\item The model may still be testable some other way.
\end{itemize}
\end{frame}

\section{What we get}

\begin{frame}
\frametitle{What does \texttt{proc calis} give us?}
\begin{itemize}
\item An indication of whether the numerical search went okay.
\item MLEs of all the parameters, with standard errors and $Z$ tests of $H_0: \theta_j=0$.
\item The -2 log likelihood at the MLE, plus a constant.
\item A likelihood ratio test for goodness of fit.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{With the -2 log likelihood at the MLE (plus a constant) we can}
%\framesubtitle{}
\begin{itemize}
\item Fit a full and a reduced model.
\item Test the null hypothesis that the reduced model holds, using a likelihood ratio test.
\item $G^2$ is a difference between two -2 log likelihoods.
\item The constant ($-2\ln L(\widehat{\boldsymbol{\Sigma}})$) cancels.
\item[]
\item This is all we really need; a sketch of the mechanics follows on the next slide.
\end{itemize}
\end{frame}
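
\begin{frame}[fragile]
\frametitle{A sketch of a full versus reduced model test}
\framesubtitle{Hypothetical code, continuing the made-up example: it tests $H_0: \beta = 0$}
Fit the model twice, once with \texttt{beta} free and once with the coefficient fixed at zero:
{\footnotesize
\begin{verbatim}
proc calis data=toy;       /* Full model                    */
   lineqs
      w1 = F1 + e1,
      w2 = F1 + e2,
      y  = beta F1 + e3;   /* beta is a free parameter      */
   std F1 = phi, e1 = v1, e2 = v2, e3 = v3;
proc calis data=toy;       /* Reduced model                 */
   lineqs
      w1 = F1 + e1,
      w2 = F1 + e2,
      y  = 0 F1 + e3;      /* Coefficient fixed at zero     */
   std F1 = phi, e1 = v1, e2 = v2, e3 = v3;
run;
\end{verbatim}
} % End size
$G^2$ is the reduced model's goodness-of-fit chi-squared minus the full model's; the constant cancels. Degrees of freedom is the difference between the two model $df$ values (here, one), and the $p$-value could be computed by hand or with the SAS function \texttt{probchi}.
\end{frame}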

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s13}{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s13}}

\end{frame}

\end{document}