% \documentclass[serif]{beamer} % Serif for Computer Modern math font. \documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements \hypersetup{colorlinks,linkcolor=,urlcolor=red} \usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice! \setbeamertemplate{navigation symbols}{} % Suppress navigation symbols % \usetheme{Berlin} % Displays sections on top % \usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides %\usetheme{Berkeley} \usetheme{AnnArbor} % CambridgeUS % I'm using this one (yellow) just to be different from Dehan. \usepackage[english]{babel} \usepackage{amsmath} % for binom \usepackage{graphpap} % \usepackage{graphicx} % To include pdf files! % \definecolor{links}{HTML}{2A1B81} % \definecolor{links}{red} \setbeamertemplate{footline}[frame number] \mode<presentation> \title{Including Measurement Error in the Regression Model: A First Try\footnote{See last slide for copyright information.}} \subtitle{STA431 Winter/Spring 2023} \date{} % To suppress date \begin{document} \begin{frame} \titlepage \end{frame} \begin{frame} \frametitle{Overview} \tableofcontents \end{frame} \section{Moment Structure Equations} \begin{frame} \frametitle{Moments and Moment Structure Equations} Model: $d \sim P_\theta$ \vspace{10mm} \begin{itemize} \item \emph{Moments} of a distribution are quantities such as $E(X)$, $E(Y^2)$, $Var(X)$, $E(X^2Y^2)$, $Cov(X,Y)$, and so on. \pause \item \emph{Moment structure equations} are a set of equations expressing moments of the distribution of the observable data in terms of the model parameters. \pause ~~~~~ $m = g(\theta)$ \pause \item If the moments involved are limited to variances and covariances, the moment structure equations are called \emph{covariance structure equations}. \end{itemize} \end{frame} \begin{frame} \frametitle{Important process} \begin{itemize} \item Calculate the moments of the distribution (usually means, variances and covariances) in terms of the model parameters, obtaining a system of moment structure equations. ~~~~~ $m = g(\theta)$. \pause \item Solve the moment structure equations for the parameters, expressing the parameters in terms of the moments. ~~~~~ $\theta = g^{-1}(m)$. \pause \item Method of Moments: $\widehat{\theta} = g^{-1}(\widehat{m})$ \pause \item By the Law of Large Numbers and the Continuous Mapping Theorem, $\widehat{\theta}\stackrel{p}{\rightarrow} \theta$ \pause \item So even if we're not going to use the Method of Moments, solving $\theta = g^{-1}(m)$ shows that consistent estimation is possible. \end{itemize} \end{frame} % In 2015 there was a big section here on multivariate regression. \section{A first try} \begin{frame} \frametitle{A first try at including measurement error in the explanatory variable} %\framesubtitle{} \begin{center} % Path diagram: Had to fiddle with this! \begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin \put(197,000){$X$} \put(202,4){\circle{20}} \put(210,30){{\footnotesize $\beta_1$}} % Label the arrow X -> Y \put(157,50){\framebox{$W$}} \put(232,50){\framebox{$Y$}} \put(197,15){\vector(-1,1){25}} % X -> W \put(209,15){\vector(1,1){25}} % X -> Y \put(161,95){$e$} \put(165,90){\vector(0,-1){25}} % e -> W \put(236,95){$\epsilon$} \put(240,90){\vector(0,-1){25}} % epsilon -> Y \end{picture} \end{center} \begin{eqnarray*} Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\ W_i &=& X_i + e_i, \end{eqnarray*} Observable data are the pairs $(W_i,Y_i)$ for $i=1, \ldots, n$.
\pause \linebreak Try to fit the true model, meaning estimate the parameters. \end{frame} \begin{frame} \frametitle{Details} \framesubtitle{Make everything normal for simplicity} Independently for $i=1, \ldots, n$, let \begin{eqnarray*} Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\ W_i &=& \nu + X_i + e_i, \end{eqnarray*} where \begin{itemize} \item $X_i$ is normally distributed with mean $\mu_x$ and variance $\phi>0$ \item $\epsilon_i$ is normally distributed with mean zero and variance $\psi>0$ \item $e_i$ is normally distributed with mean zero and variance $\omega>0$ \item $X_i, e_i, \epsilon_i$ are all independent. \end{itemize} Observable data are the pairs $(W_i,Y_i)$ for $i=1, \ldots, n$. \end{frame} \begin{frame} \frametitle{Model implies that the $(W_i,Y_i)$ are independent bivariate normal} \framesubtitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i $ \linebreak $W_i = \nu + X_i + e_i$ } \pause with \begin{displaymath} E\left( \begin{array}{c} W_i \\ Y_i \end{array} \right) = \boldsymbol{\mu} = \left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) = \left( \begin{array}{c} \nu + \mu_x \\ \beta_0 + \beta_1\mu_x \end{array} \right), \end{displaymath} \pause and variance-covariance matrix \begin{displaymath} cov\left( \begin{array}{c} W_i \\ Y_i \end{array} \right) = \boldsymbol{\Sigma} = \left( \begin{array}{c c} \sigma_{11} & \sigma_{12} \\ \sigma_{12} & \sigma_{22} \end{array} \right) = \left( \begin{array}{c c} \phi+\omega & \beta_1 \phi \\ \beta_1 \phi & \beta_1^2 \phi + \psi \end{array} \right). \end{displaymath} \pause Could we know the parameters if we knew $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$? \end{frame} \begin{frame} \frametitle{Try to solve the moment structure equations} \pause \framesubtitle{$m=g(\theta)$. Solve to obtain $\theta = g^{-1}(m)$} $\boldsymbol{\theta} = (\beta_0, \beta_1, \mu_x, \phi, \psi, \nu, \omega)$ \begin{eqnarray*} \mu_1 & = & \mu_x + \nu \\ \mu_2 & = & \beta_0 + \beta_1\mu_x \\ \sigma_{1,1} & = & \phi+\omega \\ \sigma_{1,2} & = & \beta_1 \phi \\ \sigma_{2,2} & = & \beta_1^2 \phi + \psi \end{eqnarray*} \pause It is impossible to solve these five equations uniquely for the seven model parameters. There are infinitely many solutions. \end{frame} \begin{frame} \frametitle{A numerical example} {\small \begin{eqnarray*} \left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) & = & \left( \begin{array}{c} \mu_x+\nu \\ \beta_0 + \beta_1\mu_x \end{array} \right) \\ \left( \begin{array}{c c} \sigma_{11} & \sigma_{12} \\ \cdot & \sigma_{22} \end{array} \right) & = & \left( \begin{array}{c c} \phi+\omega & \beta_1 \phi \\ \cdot & \beta_1^2 \phi + \psi \end{array} \right) \end{eqnarray*} \pause \begin{center} \begin{tabular}{|c|c|c|c|c|c|c|c|} \hline & $\mu_x$ & $\beta_0$ & $\nu$ & $\beta_1$ & $\phi$ & $\omega$ & $\psi$ \\ \hline $\boldsymbol{\theta}_1$ & 0 & 0 & 0 & 1 & 2 & 2 & 3 \\ \hline $\boldsymbol{\theta}_2$ & 0 & 0 & 0 & 2 & 1 & 3 & 1 \\ \hline \end{tabular} \end{center} \pause Both $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$ imply a bivariate normal distribution with mean zero and covariance matrix \begin{displaymath} \boldsymbol{\Sigma} = \left[ \begin{array}{r r} 4 & 2 \\ 2 & 5 \end{array} \right], \end{displaymath} \pause and thus the same distribution of the sample data. 
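\pause \vspace{2mm} As a quick check for $\boldsymbol{\theta}_2$ (the check for $\boldsymbol{\theta}_1$ is similar): $\phi+\omega = 1+3 = 4$, ~ $\beta_1\phi = 2 \cdot 1 = 2$, ~ $\beta_1^2\phi+\psi = 4 \cdot 1 + 1 = 5$, ~ and $\mu_x+\nu = \beta_0+\beta_1\mu_x = 0$.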
} % End size \end{frame} \section{Identifiability} \begin{frame} \frametitle{Parameter Identifiability} %\framesubtitle{} \begin{itemize} \item No matter how large the sample size, it will be impossible to decide between $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$, because they imply exactly the same probability distribution of the observable data. \pause \item The problem here is that the parameters of the regression are not identifiable. %\pause % \item The model parameters cannot be recovered from the distribution of the sample data. \pause % \item And all you can ever learn from sample data is the distribution from which it comes. \pause % \item So there will be problems using the sample data for estimation and inference. \pause % \item This is true even when \emph{the model is completely correct.} \pause % \item In this case the problem is with the data. \end{itemize} \end{frame} \begin{frame} \frametitle{Definitions} %\framesubtitle{Think of $d_i \sim$ Poisson$(\lambda)$ and $d_i \sim $Poisson$(\lambda_1+\lambda_2)$.} \pause If the probability distribution of the observable data is a one-to-one function of the parameter (vector), the parameter (vector) is said to be identifiable. \pause \begin{itemize} \item The probability distribution of the data is always a function of the parameter. \item If the parameter is also a function of the probability distribution, the function is one-to-one and the parameter is identifiable. \pause \item That is, if the parameter can somehow be recovered from the distribution of the data, it is identifiable. \pause \item If two different parameter values yield the same distribution of the data, the parameter is not identifiable. The inverse function cannot exist because functions yield only one value. % (Later) \end{itemize} \end{frame} \begin{frame} \frametitle{Regression model with no measurement error} \framesubtitle{Example of proving identifiability} $\mathbf{y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{x}_i + \boldsymbol{\epsilon}_i$ \pause \vspace{2mm} \begin{itemize} \item The mean and covariance matrix of $\mathbf{d}_i = \left( \begin{array}{c} \mathbf{x}_i \\ \hline \mathbf{y}_i \end{array} \right)$ are functions of the probability distribution (calculate expected values and covariances). \pause \item To get Method of Moments estimates, we solved for the parameters from the mean and covariance matrix of $\mathbf{d}_i$. \pause \item Therefore the parameters are a function of the probability distribution. \item So they are identifiable. \pause \item This is the way it goes in general. \end{itemize} \end{frame} \begin{frame} \frametitle{Identification of parameters from the moments} %\framesubtitle{In general}\pause \begin{center} \begin{picture}(100,100)(40,40) % Size of picture (does not matter), origin % Play with the origin to position the picture %\begin{picture}(150,150)(0,0) % Initial settings %\graphpaper(0,0)(160,160) % Need \usepackage{graphpap} Size should match picture initially \put(80,130){$\theta$} \put(160,50){$P_\theta$} \put(0,50){$m$} \put(86,131){\vector(1,-1){75}} % theta -> p \put(157,54){\vector(-1,0){145}} % p -> m \put(5,57){\vector(1,1){73}} % m -> theta \put(120,100){$P_\theta = h(\theta)$} \put(65,40){$m = g(\theta)$} \put(-10,100){$\theta = g^{-1}(m)$} \end{picture} \end{center}\pause \begin{itemize} \item $m = g(\theta)$ are the moment structure equations. \item $\theta = g^{-1}(m)$ is the solution of the moment structure equations.
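\pause \item[] For example, in scalar regression with one explanatory variable (writing $\sigma^2_x = Var(x_i)$, $\sigma_{xy} = Cov(x_i,y_i)$ and $\mu_y = E(y_i)$, with $x_i$ independent of $\epsilon_i$), the moment structure equations include $\sigma_{xy} = \beta_1\sigma^2_x$ and $\mu_y = \beta_0 + \beta_1\mu_x$, and the solution is $\beta_1 = \sigma_{xy}/\sigma^2_x$ and $\beta_0 = \mu_y - \beta_1\mu_x$.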
\pause \item In this course, parameters will be identified from $\mathbf{m} = (\boldsymbol{\mu}, \boldsymbol{\Sigma})$ (usually just $\boldsymbol{\Sigma}$), or not at all. \end{itemize} \end{frame} \begin{frame} \frametitle{Identification from the moments $\boldsymbol{\mu}(\boldsymbol{\theta})$ and $\boldsymbol{\Sigma}(\boldsymbol{\theta})$ or not at all} \pause %\framesubtitle{} \begin{itemize} \item If the distributions are normal, $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ are all there is. \pause \item If the distributions are unknown, we still have $(\overline{\mathbf{d}}_n, \widehat{\boldsymbol{\Sigma}}_n) \stackrel{p}{\rightarrow} (\boldsymbol{\mu}, \boldsymbol{\Sigma})$. \pause \item If the parameters can be recovered from $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, they can be estimated based on $\overline{\mathbf{d}}_n$ and $\widehat{\boldsymbol{\Sigma}}_n$. \pause \item If the parameters cannot be recovered from $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, we are out of luck. \pause \item So in practice, identifiability means identifiability from the moments. \item Usually just $\boldsymbol{\Sigma}$. \end{itemize} \end{frame} \begin{frame} \frametitle{Non-identifiability} \framesubtitle{Parameter is identifiable if the probability distribution of the observable data is a one-to-one function of the parameter.} If two different parameter values yield the same distribution of the data, the parameter is not identifiable. \begin{center} \includegraphics[width=3in]{2to1} \end{center} \end{frame} % \begin{frame} % \frametitle{Proving that a parameter is \emph{not} identifiable} \pause % \framesubtitle{Based on the moments} % \begin{itemize} % \item You can carefully describe the set of points in the parameter space that yield the same distribution of the observable data. \pause It's a lot of work, even for small models. \pause % \item You can produce a numerical example of two different points that yield the same distribution of the observable data. That settles it. \pause % \item You can use theorems. % \end{itemize} % \end{frame} \begin{frame} \frametitle{Identifiability is a big concept} %\framesubtitle{} \begin{itemize} \item It means \emph{knowability} of the parameters from the distribution of the data. \pause \item We will do simple proofs that show whether certain information can be known. \pause \item Call it the \textbf{algebra of the knowable}. \end{itemize} \end{frame} \begin{frame} \frametitle{Theorem} If the parameter vector is not identifiable, consistent estimation is impossible. \pause \begin{itemize} \item Let $\theta_1 \neq \theta_2$, but $P_{\theta_1}(d_1, \ldots, d_n) = P_{\theta_2}(d_1, \ldots, d_n)$ for all $n$. \pause \item So the distribution of $t_n = t_n(d_1, \ldots, d_n)$ is identical for $\theta_1$ and $\theta_2$. \pause \item Suppose $t_n$ is a consistent estimator of $\theta$. \pause \item Then $t_n \stackrel{p}{\rightarrow} \theta_1$ and $t_n \stackrel{p}{\rightarrow} \theta_2$. \pause \begin{center} \begin{picture}(100,100)(0,0) % Size of picture, origin \put(0,50){\circle{50}} \put(0,50){\circle*{2}} \put(2,52){$\theta_1$} \put(100,50){\circle{50}} \put(100,50){\circle*{2}} \put(102,52){$\theta_2$} \end{picture} \end{center} \pause \item Impossible. \end{itemize} \end{frame} \begin{frame} \frametitle{Identifiability of \emph{functions} of the parameter vector} \begin{itemize} \item If a function $g(\boldsymbol{\theta})$ can be recovered from the distribution of the observable data, that function of the parameter vector is said to be identifiable. 
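\pause \item[] For example, in the measurement error model of this lecture, $\beta_1\phi = \sigma_{1,2}$ and $\phi+\omega = \sigma_{1,1}$ can be recovered from the covariance matrix, so these \emph{functions} of $\boldsymbol{\theta}$ are identifiable, even though $\beta_1$, $\phi$ and $\omega$ are not identifiable individually.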
\pause \item This applies to individual parameters and subsets of the parameters. \pause \item Frequently, not everything can be known, but informative \emph{functions} of the parameter are knowable. \end{itemize} \end{frame} \begin{frame} \frametitle{How maximum likelihood can fail} \framesubtitle{When the parameters are not identifiable} \begin{itemize} \item Two examples are based on the following result. \item For a random sample from a normal distribution with expected value zero, the MLE of the variance is \end{itemize} \begin{displaymath} \widehat{\sigma}^2 = \frac{1}{n} \sum_{i=1}^n X_i^2 \end{displaymath} \end{frame} \begin{frame} \frametitle{Disaster One} %\framesubtitle{} {\Large \begin{itemize} \item Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} N(0,\sigma^2_1 + \sigma^2_2)$. \item[] \pause \item $\Theta = \{(\sigma^2_1, \sigma^2_2): \sigma^2_1 > 0, \sigma^2_2 > 0 \}$. \item[] \pause \item Solve and get $\sigma^2_1+\sigma^2_2 = \frac{1}{n} \sum_{i=1}^n X_i^2$. \end{itemize} } % End size \end{frame} \begin{frame} \frametitle{Disaster Two} %\framesubtitle{} {\Large \begin{itemize} \item Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} N(0,\sigma^2)$. \item[] \pause \item $\Theta = \{\sigma: \sigma \neq 0 \}$. \item[] \pause \item Solve and get $|\sigma| = \sqrt{\frac{1}{n} \sum_{i=1}^n X_i^2}$. \end{itemize} \pause } % End size \vspace{3mm} Which one is worse? \end{frame} \begin{frame} \frametitle{Some sample questions will be based on this model:} Independently for $i = 1, \ldots, n$, let $W_i = X_i + e_i$, where \pause \begin{itemize} \item $X_i\sim N(\mu_x,\phi)$ \item $e_i \sim N(0,\omega)$ \item $X_i$ and $e_i$ are independent. \item Only $W_i$ is observable ($X_i$ is a latent variable). \pause \end{itemize} \vspace{5mm} How does this fit the definition of a \emph{model}? \end{frame} \begin{frame} \frametitle{Sample questions} {\scriptsize Let $W_i = X_i + e_i$, where \begin{itemize} \item $X_i\sim N(\mu_x,\phi)$ \item $e_i \sim N(0,\omega)$ \item $X_i$ and $e_i$ are independent. \item Only $W_i$ is observable ($X_i$ is a latent variable, and $e_i$ is an error term). \pause \end{itemize} } % End size \vspace{3mm} In the following questions, you may use the fact that the normal distribution corresponds uniquely to the pair $(\mu,\sigma^2)$. \pause \begin{enumerate} \item What is the parameter vector $\boldsymbol{\theta}$? \pause \item What is the parameter space $\Theta$? \pause \item What is the probability distribution of the observable data? \pause \item Give the moment structure equations. \pause \item Either prove that the parameter is identifiable, or show by an example that it is not. A simple numerical example is best. \pause \item Give two \emph{functions} of the parameter vector that are identifiable. \end{enumerate} \end{frame} \begin{frame} \frametitle{A Useful Equivalent Definition of Identifiability} \pause \framesubtitle{Equivalent to $P_\theta$ is a one-to-one function of $\theta$} % on $\Theta$ \begin{itemize} \item Suppose a statistical model implies $\mathbf{d} \sim P_{\boldsymbol{\theta}}, \boldsymbol{\theta} \in \Theta$. If no two points in $\Theta$ yield the same probability distribution, then the parameter $\boldsymbol{\theta}$ is said to be identifiable. \pause \item That is, identifiability means that $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$.
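\pause \item[] Equivalently, by the contrapositive, $P_{\boldsymbol{\theta}_1} = P_{\boldsymbol{\theta}_2}$ implies $\boldsymbol{\theta}_1 = \boldsymbol{\theta}_2$: the parameter can be recovered from the distribution of the data, which is the earlier definition.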
\pause \end{itemize} \begin{center} \includegraphics[width=3in]{1to1} \end{center} \end{frame} \begin{frame} \frametitle{Pointwise identifiability} \framesubtitle{As opposed to global identifiability} \begin{itemize} \item Frequently, parameters will be identifiable in some parts of the parameter space but not others. \pause \item The parameter is said to be identifiable at a point $\boldsymbol{\theta}_0$ if no other point in $\Theta$ yields the same probability distribution as $\boldsymbol{\theta}_0$. \item That is, $\boldsymbol{\theta} \neq \boldsymbol{\theta}_0$ implies $P_{\boldsymbol{\theta}} \neq P_{\boldsymbol{\theta}_0}$ for all $\boldsymbol{\theta} \in \Theta$. \pause % \item Let $g(\boldsymbol{\theta})$ be a function of the parameter vector. If $g(\boldsymbol{\theta}_0) \neq g(\boldsymbol{\theta})$ implies $P_{\boldsymbol{\theta}_0} \neq P_{\boldsymbol{\theta}}$ %for all $\boldsymbol{\theta} \in \Theta$, then the function $g(\boldsymbol{\theta})$ is said to be identifiable at the point $\boldsymbol{\theta}_0$. \pause % \item This just means that $g(\boldsymbol{\theta}_0)$ can be recovered from the distribution of the data \pause (through the moments). \pause \end{itemize} If the parameter % (or function of the parameter) is identifiable at every point in $\Theta$, it is identifiable according to the earlier definitions. \end{frame} % Maybe skip this slide but leave it in the text. % \begin{frame} % \frametitle{Local identifiability} %\framesubtitle{} % \begin{itemize} % \item[] The parameter is said to be \emph{locally identifiable} at a point $\boldsymbol{\theta}_0$ if there is a neighbourhood of points surrounding $\boldsymbol{\theta}_0$, none of which yields the same probability distribution as $\boldsymbol{\theta}_0$. \pause % \item[] If there is a neighborhood of $\theta_0$ with $P_\theta \neq P_{\theta_0}$ for all $\theta \neq \theta_0$ in the neighborhood, the parameter is said to be \emph{locally identifiable} at $\theta_0$. % \item[] If the parameter is identifiable at a point, it is locally identifiable there, but local identifiability does not imply pointwise identifiability. % \end{itemize} % \end{frame} \begin{frame} \frametitle{Determining identifiability in practice} %\framesubtitle{A strictly mathematical task} \begin{itemize} \item In practice, identifiability means that the moment structure equations can be solved uniquely for the parameters. Uniquely means there is only one solution. \item This is a strictly mathematical issue, though it has huge implications for statistical estimation and inference. \end{itemize} \end{frame} \begin{frame} \frametitle{Proving identifiability} %\framesubtitle{} \begin{itemize} \item You can explicitly solve the moment structure equations. \item You can use theorems. \pause \item We will develop a collection of identifiability rules. \item These are really simple theorems about the existence of unique real solutions to equations. %, expressed in terms of identifiability. \pause \item They are not well-known to mathematicians because they are too specific to be interesting. \pause \item We will be able to look at a path diagram and verify that the parameters are identifiable. Usually. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} % This is a deliberate repeat.
\frametitle{Proving that a parameter is \emph{not} identifiable} \pause % \framesubtitle{Based on the moments} \begin{itemize} \item You can carefully describe the set of points in the parameter space that yield the same mean and covariance matrix. It's a lot of work, even for small models. \pause \item You can produce a numerical example of two different points that yield the same mean and covariance matrix. \pause That settles it, but it can still be a lot of work for models with a lot of parameters. \pause % In the text, maybe in Ch. 1, describe the following trick, which depends on the fact that if the parameter vector is not identifiable at the point where the likelihood reaches its maximum, there will be multiple maxima, all at the same height. The trick is to make up some sensible parameter values, crank out the covariance matrix, and give that covariance matrix to your software as a sample covariance matrix, along with an arbitrary sample size. ``Estimate" the parameters, using several sets of starting values. If you always come to the same parameter estimates and these match your input parameter values, you may become convinced that the parameter is identifiable, at least at that point, though really you have not proved anything. Perhaps if you try enough sets of reasonable parameter values and enough different starting values for each one, you will become completely convinced. % If, on the other hand, you find more than one stopping place that is different from your input set, and if the value of the likelihood there is the same as at your starting values (in proc calis, the objective function will be zero), then you may have evidence of non-identifiability. To check, calculate the covariance matrix at the stopping place; our software may do this for you, perhaps calling it the ``reproduced covariance matrix." If this matches your input covariance matrix, you have proved non-identifiability, because you have found two different points in the parameter space that yield the same covariance matrix, and hence the same distribution of the observable data (provided the data are normal). \item You can use a big theorem. \end{itemize} \end{frame} \section{Parameter Count Rule} \begin{frame} \frametitle{The Parameter Count Rule} \framesubtitle{For establishing \textbf{non}-identifiability} Suppose identifiability is to be decided based on a set of moment structure equations. If there are more parameters than equations, the set of points where the parameter vector is identifiable occupies a set of volume zero in the parameter space. \pause \begin{itemize} \item Note that the empty set has volume zero. \pause \item The parameter count rule is really a theorem about the existence of unique real solutions to systems of equations. \item The functions in the moment structure equations need to have derivatives and mixed partial derivatives of all orders, but they usually do. \end{itemize} \end{frame} \begin{frame} \frametitle{Back to the example} \framesubtitle{Trying to include measurement error in the model} \begin{center} % Could make equations and path diagram side by side if I had time. See 2017 Quiz 4.
\begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin \put(197,000){$X$} \put(202,4){\circle{20}} \put(210,30){{\footnotesize $\beta_1$}} % Label the arrow X -> Y \put(157,50){\framebox{$W$}} \put(232,50){\framebox{$Y$}} \put(197,15){\vector(-1,1){25}} % X -> W \put(209,15){\vector(1,1){25}} % X -> Y \put(161,95){$e$} \put(165,90){\vector(0,-1){25}} % e -> W \put(236,95){$\epsilon$} \put(240,90){\vector(0,-1){25}} % epsilon -> Y \end{picture} \end{center} \pause \begin{itemize} \item Recall the first attempt to include measurement error in the model. \item There were five moment structure equations in seven unknown parameters. \pause \item The model failed the parameter count rule. \item Game over. \end{itemize} \end{frame} \begin{frame} \frametitle{Again: The Parameter Count Rule} %\framesubtitle{For establishing non-identifiability} \pause Suppose identifiability is to be decided based on a set of moment structure equations. If there are more parameters than equations, the set of points where the parameter vector is identifiable occupies a set of volume zero in the parameter space. {\footnotesize \begin{itemize} \item So a \emph{necessary} condition for parameter identifiability is that there be at least as many moment structure equations as parameters. \pause \item There may be points in the parameter space where the parameter is identifiable, but if so, that set of points has volume zero. \pause \item There can be more equations than unknown parameters, and still no unique solution. \pause \item Failure of the parameter count rule means that it's impossible to identify the whole parameter vector. \pause \item Useful functions of the parameters may be identifiable, maybe including what you really want to know. \pause \item Maximum likelihood estimation depends on identifiability of the entire parameter vector (usually). \end{itemize} } % End size \end{frame} \begin{frame} \frametitle{Example} \framesubtitle{To illustrate the parameter count rule.} There are two latent explanatory variables and two observable response variables. % Put model equations and path diagram side by side. \begin{tabular}{cc} \raisebox{.5in}{\parbox{1.5in} { \begin{eqnarray*} %\label{countingex} Y_1 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_1 \\ Y_2 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_2 \end{eqnarray*} }} % End parbox and then raisebox & \includegraphics[width=2.75in]{CirclePath} \end{tabular} \pause where \begin{itemize} \item $X_1$, $X_2$, $\epsilon_1$ and $\epsilon_2$ are independent normal random variables with expected value zero, and \item $Var(X_1)=Var(X_2)=1$, $Var(\epsilon_1)=\psi_1$ and $Var(\epsilon_2)=\psi_2$. \item Only $Y_1$ and $Y_2$ are observable. \end{itemize} \pause The parameter vector is $\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$. 
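\pause {\footnotesize For example, since $X_1$, $X_2$, $\epsilon_1$ and $\epsilon_2$ are independent with $Var(X_1)=Var(X_2)=1$, \begin{displaymath} Cov(Y_1,Y_2) = Cov(\beta_1 X_1 + \beta_2 X_2 + \epsilon_1, \; \beta_1 X_1 + \beta_2 X_2 + \epsilon_2) = \beta_1^2 + \beta_2^2, \end{displaymath} the covariance entry that appears on the next slide. } % End size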
\end{frame} \begin{frame} \frametitle{Calculate the covariance matrix of $(Y_1,Y_2)^\top$} \framesubtitle{Expected value is (zero, zero)} \begin{eqnarray*} %\label{countingex} Y_1 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_1 \\ Y_2 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_2, \nonumber \end{eqnarray*} {\LARGE \begin{eqnarray*} \boldsymbol{\Sigma} & = & \left( \begin{array}{c c} \sigma_{1,1} & \sigma_{1,2} \\ \sigma_{1,2} & \sigma_{2,2} \end{array} \right) \\ && \\ & = & \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\ \beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right) \end{eqnarray*} } % End size \end{frame} \begin{frame} \frametitle{Covariance structure equations} \framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$} \begin{eqnarray*} % \label{identeq} \sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\ \sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\ \sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber \end{eqnarray*} \begin{itemize} \item Three equations in four unknowns, so the model fails the Parameter Count Rule. \pause \item The rule does \emph{not} say that a solution is impossible. \pause \item It says that \emph{the set of points in the parameter space where there is a unique solution (so the parameters are all identifiable) occupies a set of volume zero}. \item Are there any such points at all? \end{itemize} \end{frame} \begin{frame} \frametitle{Try to solve for the parameters} \framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$} % Why is this important? Covariance structure equations: % \vspace{5mm} %{\small \begin{eqnarray*} % \label{identeq} \sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\ \sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\ \sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber \end{eqnarray*} \pause %} % End size \begin{itemize} \item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$ \item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$ \pause \item So those \emph{functions} of the parameter vector are identifiable. \item What about $\beta_1$ and $\beta_2$? \end{itemize} \end{frame} \begin{frame} \frametitle{Can we solve for $\beta_1$ and $\beta_2$?} \framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$} %{\small \begin{eqnarray*} % \label{identeq} \sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\ \sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\ \sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber \end{eqnarray*} \pause %} % End size %\vspace{5mm} \begin{itemize} \item $\sigma_{1,2} = 0$ ~ if and only if ~ both $\beta_1=0$ and $\beta_2=0$. \pause \item The set of points where all four parameters can be recovered from the covariance matrix is \emph{exactly} the set of points where the parameter vector is identifiable. \item It is \begin{displaymath} \{(\beta_1, \beta_2, \psi_1, \psi_2): \beta_1=0, \beta_2=0, \psi_1>0, \psi_2>0 \} \end{displaymath} \pause \item A set of infinitely many points in $\mathbb{R}^4$. \item A set of volume zero, as the theorem says.
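\pause \item[] (The set $\{(0, 0, \psi_1, \psi_2): \psi_1>0, \psi_2>0 \}$ is only two-dimensional, so its volume in $\mathbb{R}^4$ is zero even though it contains infinitely many points.)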
\end{itemize} \end{frame} \begin{frame} \frametitle{Suppose $\beta_1^2 + \beta_2^2 \neq 0$} \framesubtitle{This is the case ``almost everywhere" in the parameter space.} \pause The set of infinitely many points $\{(\beta_1, \beta_2, \psi_1, \psi_2)\}$ such that \begin{itemize} \item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$ \item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$ \item $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$ \end{itemize} \pause Substitute back into \begin{displaymath} cov\left( \begin{array}{c} Y_1 \\ Y_2 \end{array} \right) = \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\ \beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right) \end{displaymath} And see they all produce the covariance matrix \begin{displaymath} \boldsymbol{\Sigma} = \left( \begin{array}{c c} \sigma_{1,1} & \sigma_{1,2} \\ \sigma_{1,2} & \sigma_{2,2} \end{array} \right) \end{displaymath} \pause And hence the same bivariate normal distribution of $(Y_1,Y_2)^\top$. \end{frame} \begin{frame} \frametitle{Why are there infinitely many points in this set?} $\{(\beta_1, \beta_2, \psi_1, \psi_2)\}$ such that \begin{itemize} \item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$ \item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$ \item $\beta_1^2 + \beta_2^2 = \sigma_{1,2} \neq 0$ \end{itemize} \pause \vspace{15mm} Because $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$ is the equation of a circle with radius $\sqrt{\sigma_{1,2}}$. \end{frame} \begin{frame} \frametitle{Maximum likelihood estimation} \framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$} { \scriptsize \begin{eqnarray*} L(\boldsymbol{\mu,\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\} \\ \pause L(\boldsymbol{\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-n} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + \overline{\mathbf{x}}^\top \boldsymbol{\Sigma}^{-1} \overline{\mathbf{x}} \right\} \end{eqnarray*} \pause } % End size \vspace{15mm} Can write likelihood as either $L(\boldsymbol{\Sigma})$ or $L(\boldsymbol{\Sigma}(\boldsymbol{\theta})) = L_2(\boldsymbol{\theta})$. \pause \begin{displaymath} \boldsymbol{\Sigma}(\boldsymbol{\theta}) = \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\ \beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right) \end{displaymath} \end{frame} \begin{frame} \frametitle{Likelihood $L_2(\boldsymbol{\theta})$ has non-unique maximum} %\framesubtitle{} \begin{itemize} \item $L(\boldsymbol{\Sigma})$ has a unique maximum at $\boldsymbol{\Sigma} = \widehat{\boldsymbol{\Sigma}}$. \pause \item For \emph{every} positive definite $\boldsymbol{\Sigma}$ with $\sigma_{1,2} > 0$, there are infinitely many $\boldsymbol{\theta} \in \Theta$ which produce that $\boldsymbol{\Sigma}$, and have the same height of the likelihood. \pause \item This includes $\widehat{\boldsymbol{\Sigma}}$, assuming $\widehat{\sigma}_{12}>0$. \pause \item So there are infinitely many points $\boldsymbol{\theta}$ in $\Theta$ with $L_2(\boldsymbol{\theta}) = L(\widehat{\boldsymbol{\Sigma}})$. \pause \item A circle in $\mathbb{R}^4$. 
\end{itemize} \end{frame} \begin{frame} \frametitle{A circle in $\mathbb{R}^4$ where the likelihood is maximal} %\framesubtitle{} {\LARGE $\{(\beta_1, \beta_2, \psi_1, \psi_2)\} \subset \mathbb{R}^4$ such that \begin{itemize} \item $\psi_1 = \widehat{\sigma}_{1,1}-\widehat{\sigma}_{1,2}$ \item $\psi_2 = \widehat{\sigma}_{2,2} - \widehat{\sigma}_{1,2}$ \item $\beta_1^2 + \beta_2^2 = \widehat{\sigma}_{1,2}$ \end{itemize} } % End size \end{frame} \begin{frame} \frametitle{Some Questions} \framesubtitle{About model correctness} Remembering that if the model is true, \begin{itemize} \item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$ \item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$ \item $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$ \end{itemize} \pause What would happen in the numerical search for $\widehat{\boldsymbol{\theta}}$ if \dots \begin{itemize} \item $\widehat{\sigma}_{1,2} > \widehat{\sigma}_{1,1}$? \pause \item $\widehat{\sigma}_{1,2} > \widehat{\sigma}_{2,2}$? \pause \item $\widehat{\sigma}_{1,2} < 0$? \pause \end{itemize} These could not \emph{all} happen, but one of them could. When numerical maximum likelihood search leaves the parameter space, it may indicate that the model is incorrect. \pause Or it might be just a bad starting value. \pause \vspace{1mm} Could the maximum of the likelihood function be outside the parameter space? \end{frame} \begin{frame} \frametitle{Testing hypotheses about $\boldsymbol{\theta}$} \framesubtitle{For a model like this one, with non-identifiable parameters} Some hypotheses are testable if the model is correct, but direct likelihood ratio tests are out. All the theory depends on a unique maximum. \pause \vspace{5mm} Remember, \begin{displaymath} cov\left( \begin{array}{c} Y_1 \\ Y_2 \end{array} \right) = \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\ \beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right) \end{displaymath} \pause % \vspace{5mm} \begin{itemize} \item How would you test $H_0:\beta_1=\beta_2=0$? \pause \item If you did a large-sample likelihood ratio test, what would the degrees of freedom be? \end{itemize} \end{frame} \begin{frame} \frametitle{Lessons from this example} %\framesubtitle{} { \footnotesize \begin{itemize} \item A parameter may be identifiable at some points but not others. \pause \item Identifiability at infinitely many points is possible even if there are more unknowns than equations. But this can only happen on a set of volume zero. \pause \item Some parameters and functions of the parameters may be identifiable even when the whole parameter vector is not. \pause \item Lack of identifiability can produce multiple maxima of the likelihood function -- even infinitely many. \pause \item A model whose parameter vector is not identifiable may still be falsified by empirical data. \pause \item Numerical maximum likelihood search may leave the parameter space. This may be a sign that the model is false. It can happen when the parameter is identifiable, too. \pause \item Some hypotheses may be testable when the parameter is not identifiable, \pause but these will be hypotheses about functions of the parameter that are identifiable in the part of the parameter space where the null hypothesis is true. 
\pause $H_0:\beta_1=\beta_2=0$ \end{itemize} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Copyright Information} This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/~brunner/oldclass/431s23} {\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s23}} \end{frame} \end{document}