% This aims to be much briefer than the 431a17 version
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
\usepackage{graphpap}
\usepackage{comment}
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
\title{Including Measurement Error in the Regression Model: A First Try\footnote{See last slide for copyright information.}}
\subtitle{STA2053 Fall 2022}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Moment Structure Equations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Moments and Moment Structure Equations}
\begin{equation*}
\mbox{Model: }d \sim P_\theta
\end{equation*}
\pause
\begin{itemize}
\item \emph{Moments} of a distribution are quantities such as $E(X)$, $E(Y^2)$, $Var(X)$, $E(X^2Y^2)$, $Cov(X,Y)$, and so on.
\item \emph{Moment structure equations} are a set of equations expressing moments of the distribution of the observable data in terms of the model parameters: $m = g(\theta)$. \pause
\item If there are just variances and covariances, the moment structure equations are called \emph{covariance structure equations}.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Important process}
\begin{itemize}
\item Calculate the moments of the distribution: $m = g(\theta)$. \pause
\item Solve the moment structure equations for the parameters: $\theta = g^{-1}(m)$. \pause
\item Method of Moments: $\widehat{\theta} = g^{-1}(\widehat{m})$. \pause
\item By the Law of Large Numbers and continuous mapping, $\widehat{\theta}\stackrel{p}{\rightarrow} \theta$.
\item This shows that consistent estimation is possible.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Multivariate multiple regression}
\framesubtitle{With just observed variables}
\begin{displaymath}
\mathbf{y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{x}_i + \boldsymbol{\epsilon}_i
\end{displaymath}
{\scriptsize
where
\begin{itemize}
\item[] $\mathbf{y}_i$ is a $q \times 1$ random vector of observable response variables, so the regression can be multivariate; there are $q$ response variables.
\item[] $\boldsymbol{\beta}_0$ is a $q \times 1$ vector of unknown constants, the intercepts for the $q$ regression equations. There is one for each response variable.
\item[] $\mathbf{x}_i$ is a $p \times 1$ observable random vector; there are $p$ explanatory variables. $\mathbf{x}_i$ has expected value $\boldsymbol{\mu}_x$ and variance-covariance matrix $\boldsymbol{\Phi}$, a $p \times p$ symmetric and positive definite matrix of unknown constants.
\item[] $\boldsymbol{\beta}_1$ is a $q \times p$ matrix of unknown constants. These are the regression coefficients, with one row for each response variable and one column for each explanatory variable.
\item[] $\boldsymbol{\epsilon}_i$ is the error term of the regression. It is a $q \times 1$ random vector with expected value zero and variance-covariance matrix $\boldsymbol{\Psi}$, a $q \times q$ symmetric and positive definite matrix of unknown constants. $\boldsymbol{\epsilon}_i$ is independent of $\mathbf{x}_i$.
\end{itemize}
} % End size
%\vspace{3mm}
\pause
\begin{equation*}
\boldsymbol{\theta} = (\boldsymbol{\beta}_0, \boldsymbol{\mu}_x, \boldsymbol{\Phi}, \boldsymbol{\beta}_1, \boldsymbol{\Psi})
\end{equation*}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{$\mathbf{d}_i = \left( \begin{array}{c} \mathbf{x}_i \\ \hline \mathbf{y}_i \end{array} \right)$: Write $E(\mathbf{d}_i)$ and $cov(\mathbf{d}_i)$ as partitioned matrices}
% \framesubtitle{Write $E(\mathbf{d}_i)$ and $cov(\mathbf{d}_i)$ as partitioned matrices}
\pause
\begin{displaymath}
\boldsymbol{\mu} = \left( \begin{array}{c} E(\mathbf{x}_i) \\ \hline E(\mathbf{y}_i) \end{array} \right)
= \left( \begin{array}{c} \boldsymbol{\mu}_1 \\ \hline \boldsymbol{\mu}_2 \end{array} \right)
\end{displaymath}
and
\renewcommand{\arraystretch}{1.5}
\begin{displaymath}
\boldsymbol{\Sigma} = cov\left( \begin{array}{c} \mathbf{x}_i \\ \hline \mathbf{y}_i \end{array} \right)
= \left( \begin{array}{c|c} cov(\mathbf{x}_i) & cov(\mathbf{x}_i,\mathbf{y}_i) \\ \hline cov(\mathbf{x}_i,\mathbf{y}_i)^\top & cov(\mathbf{y}_i) \end{array} \right)
= \left( \begin{array}{c|c} \boldsymbol{\Sigma}_{11} & \boldsymbol{\Sigma}_{12} \\ \hline \boldsymbol{\Sigma}_{12}^\top & \boldsymbol{\Sigma}_{22} \end{array} \right)
\end{displaymath}
\pause
\renewcommand{\arraystretch}{1.0}
\begin{equation*}
\mathbf{m} = \left( \boldsymbol{\mu}_1, \boldsymbol{\mu}_2, \boldsymbol{\Sigma}_{11}, \boldsymbol{\Sigma}_{12}, \boldsymbol{\Sigma}_{22}\right)
\end{equation*}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Moment structure equations}
\framesubtitle{Based on $\mathbf{y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{x}_i + \boldsymbol{\epsilon}_i$}
\begin{itemize}
\item[] $\boldsymbol{\theta} = (\boldsymbol{\beta}_0, \boldsymbol{\mu}_x, \boldsymbol{\Phi}, \boldsymbol{\beta}_1, \boldsymbol{\Psi})$
\item[] $\mathbf{m} = \left( \boldsymbol{\mu}_1, \boldsymbol{\mu}_2, \boldsymbol{\Sigma}_{11}, \boldsymbol{\Sigma}_{12}, \boldsymbol{\Sigma}_{22}\right)$
\end{itemize}
\pause
\begin{eqnarray*}
\boldsymbol{\mu}_1 & = & \boldsymbol{\mu}_x \\
\boldsymbol{\mu}_2 & = & \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \boldsymbol{\mu}_x \nonumber \\
\boldsymbol{\Sigma}_{11} & = & \boldsymbol{\Phi} \nonumber \\
\boldsymbol{\Sigma}_{12} & = & \boldsymbol{\Phi\beta}_1^\top \nonumber \\
\boldsymbol{\Sigma}_{22} & = & \boldsymbol{\beta}_1 \boldsymbol{\Phi\beta}_1^\top + \boldsymbol{\Psi}.
\nonumber \end{eqnarray*} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Solve moment structure equations for the parameters} \framesubtitle{$\theta = g^{-1}(m)$} \begin{eqnarray*} \boldsymbol{\beta}_0 & = & \boldsymbol{\mu}_2 - \boldsymbol{\Sigma}_{12}^\top \boldsymbol{\Sigma}_{11}^{-1} \ \boldsymbol{\mu}_1 \\ \boldsymbol{\mu}_x & = & \boldsymbol{\mu}_1 \nonumber \\ \boldsymbol{\Phi}_{~} & = & \boldsymbol{\Sigma}_{11} \nonumber \\ \boldsymbol{\beta}_1 & = & \boldsymbol{\Sigma}_{12}^\top \boldsymbol{\Sigma}_{11}^{-1} \nonumber \\ \boldsymbol{\Psi}_{~} & = & \boldsymbol{\Sigma}_{22} - \boldsymbol{\Sigma}_{12}^\top \boldsymbol{\Sigma}_{11}^{-1}\boldsymbol{\Sigma}_{12} \nonumber \end{eqnarray*} \pause \vspace{10mm} \begin{itemize} \item Just put hats on everything to get MOM estimates. \item Same as the MLEs in this case by invariance. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{But let's admit it} {\LARGE In most applications, the explanatory variables are measured with error. } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{A first try} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{A first try at including measurement error in the explanatory variable} %\framesubtitle{} \begin{center} % Path diagram: Had to fiddle with this! \begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin \put(197,000){$X$} \put(202,4){\circle{20}} \put(210,30){{\footnotesize $\beta_1$}} % Label the arrow X -> Y \put(157,50){\framebox{$W$}} \put(232,50){\framebox{$Y$}} \put(197,15){\vector(-1,1){25}} % X -> W \put(209,15){\vector(1,1){25}} % X -> Y \put(161,95){$e$} \put(165,90){\vector(0,-1){25}} % e -> W \put(236,95){$\epsilon$} \put(240,90){\vector(0,-1){25}} % epsilon -> Y \end{picture} \end{center} \begin{eqnarray*} Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\ W_i &=& X_i + e_i, \nonumber \end{eqnarray*} Observable data are the pairs $(W_i,Y_i)$ for $i=1, \ldots, n$. \linebreak Try to fit the true model. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Details} \framesubtitle{Make everything normal for simplicity} Independently for $i=1, \ldots, n$, let \begin{eqnarray*} Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\ W_i &=& \nu + X_i + e_i, \end{eqnarray*} where \begin{itemize} \item $X_i$ is normally distributed with mean $\mu_x$ and variance $\phi>0$ \item $\epsilon_i$ is normally distributed with mean zero and variance $\psi>0$ \item $e_i$ is normally distributed with mean zero and variance $\omega>0$ \item $X_i, e_i, \epsilon_i$ are all independent. \end{itemize} Observable data are the pairs $(W_i,Y_i)$ for $i=1, \ldots, n$. 
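\pause
For example, using the independence of $X_i$, $e_i$ and $\epsilon_i$, one entry of the covariance matrix on the next slide is
\begin{displaymath}
Cov(W_i,Y_i) = Cov(\nu + X_i + e_i, \, \beta_0 + \beta_1 X_i + \epsilon_i)
             = \beta_1 Var(X_i) = \beta_1 \phi.
\end{displaymath}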
\end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Model implies that the $(W_i,Y_i)$ are independent bivariate normal} \framesubtitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i $ \linebreak $W_i = \nu + X_i + e_i$ } \pause with \begin{displaymath} E\left( \begin{array}{c} W_i \\ Y_i \end{array} \right) = \boldsymbol{\mu} = \left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) = \left( \begin{array}{c} \nu + \mu_x \\ \beta_0 + \beta_1\mu_x \end{array} \right), \end{displaymath} \pause and variance-covariance matrix \begin{displaymath} cov\left( \begin{array}{c} W_i \\ Y_i \end{array} \right) = \boldsymbol{\Sigma} = [\sigma_{i,j}] = \left( \begin{array}{c c} \phi+\omega & \beta_1 \phi \\ \beta_1 \phi & \beta_1^2 \phi + \psi \end{array} \right). \end{displaymath} \pause \vspace{2mm} Fit with maximum likelihood? {\small \begin{equation*} L(\boldsymbol{\mu,\Sigma}) = |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\} \end{equation*} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Big problem revealed by the moment structure equations} \framesubtitle{$m=g(\theta)$. Solve to obtain $\theta = g^{-1}(m)$} $\boldsymbol{\theta} = (\beta_0, \beta_1, \mu_x, \phi, \psi, \nu, \omega)$ \pause \begin{eqnarray*} \mu_1 & = & \mu_x + \nu \\ \mu_2 & = & \beta_0 + \beta_1\mu_x \\ \sigma_{1,1} & = & \phi+\omega \\ \sigma_{1,2} & = & \beta_1 \phi \\ \sigma_{2,2} & = & \beta_1^2 \phi + \psi \end{eqnarray*} \pause It is impossible to solve these five equations uniquely for the seven model parameters. % \pause There are infinitely many solutions. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % This slide is too redundant. Talk it! % \begin{frame} % \frametitle{Impossible to solve the moment structure equations for the parameters} % \begin{itemize} % \item Even with perfect knowledge of the probability distribution of the data (and for the multivariate normal, that means knowing $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, period), \pause % it would be impossible to know the model parameters. \pause % \item All data can ever tell you is the approximate distribution from which they come. \pause % \item So how could we expect to successfully \emph{estimate} $\boldsymbol{\theta}$ based on sample data? 
% \end{itemize} % \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{A numerical example} {\small \begin{eqnarray*} \left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) & = & \left( \begin{array}{c} \mu_x+\nu \\ \beta_0 + \beta_1\mu_x \end{array} \right) \\ \left( \begin{array}{c c} \sigma_{11} & \sigma_{12} \\ & \sigma_{22} \end{array} \right) & = & \left( \begin{array}{c c} \phi+\omega & \beta_1 \phi \\ & \beta_1^2 \phi + \psi \end{array} \right) \end{eqnarray*} \pause \begin{center} \begin{tabular}{|c|c|c|c|c|c|c|c|} \hline & $\mu_x$ & $\beta_0$ & $\nu$ & $\beta_1$ & $\phi$ & $\omega$ & $\psi$ \\ \hline $\boldsymbol{\theta}_1$ & 0 & 0 & 0 & 1 & 2 & 2 & 3 \\ \hline $\boldsymbol{\theta}_2$ & 0 & 0 & 0 & 2 & 1 & 3 & 1 \\ \hline \end{tabular} \end{center} \pause Both $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$ imply a bivariate normal distribution with mean zero and covariance matrix \begin{displaymath} \boldsymbol{\Sigma} = \left[ \begin{array}{r r} 4 & 2 \\ 2 & 5 \end{array} \right], \end{displaymath} and thus the same distribution of the sample data. } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Identifiability} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Parameter Identifiability} %\framesubtitle{} \begin{itemize} \item No matter how large the sample size, it will be impossible to decide between $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$, because they imply exactly the same probability distribution of the observable data. \pause \item The problem here is that the parameters of the regression are not identifiable. %\pause % \item The model parameters cannot be recovered from the distribution of the sample data. \pause % \item And all you can ever learn from sample data is the distribution from which it comes. \pause % \item So there will be problems using the sample data for estimation and inference. \pause % \item This is true even when \emph{the model is completely correct.} \pause % \item In this case the problem is with the data. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Definition of Identifiability} \framesubtitle{One of many} \pause If the probability distribution of the observable data is a one-to-one function of the parameter (vector), the parameter (vector) is said to be identifiable. \pause \begin{itemize} \item The probability distribution of the data is always a function of the parameter. \item If the parameter is also a function of the probability distribution, the function is one-to-one and the parameter is identifiable. \pause \item If the parameter can somehow be recovered from the distribution of the data, it is identifiable. % \pause \item If two different parameter values yield the same distribution of the data, the parameter is not identifiable. \pause % The inverse function cannot exist because functions yield only one value. \item If the parameter is not knowable from the distribution of the data, there will be trouble with estimation. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Theorem} If the parameter vector is not identifiable, consistent estimation is impossible. 
\pause \begin{itemize} \item Let $\theta_1 \neq \theta_2$ but $P_{\theta_1}(d_n) = P_{\theta_2}(d_n)$ for all $n$. \pause \item So the distribution of $T_n = T_n(D_1, \ldots, D_n)$ is identical for $\theta_1$ and $\theta_2$. \pause \item Suppose $T_n$ is a consistent estimator of $\theta$. \item Then $T_n \stackrel{p}{\rightarrow} \theta_1$ and $T_n \stackrel{p}{\rightarrow} \theta_2$. \pause \begin{center} \begin{picture}(100,100)(0,0) % Size of picture, origin \put(0,50){\circle{50}} \put(0,50){\circle*{2}} \put(2,52){$\theta_1$} \put(100,50){\circle{50}} \put(100,50){\circle*{2}} \put(102,52){$\theta_2$} \end{picture} \end{center} \pause \item Impossible. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{~} %\framesubtitle{} \begin{center}{\LARGE The zipper example} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Identification of parameters from the moments} %\framesubtitle{In general}\pause \begin{center} \begin{picture}(100,100)(40,40) % Size of picture (does not matter), origin % Play with the origin to position the picture %\begin{picture}(150,150)(0,0) % Initial settings %\graphpaper(0,0)(160,160) % Need \usepackage{graphpap} Size should match picture initially \put(80,130){$\theta$} \put(160,50){$P_\theta$} \put(0,50){$m$} \put(86,131){\vector(1,-1){75}} % theta -> p \put(157,54){\vector(-1,0){145}} % p -> m \put(5,57){\vector(1,1){73}} % m -> theta \put(120,100){$P_\theta = h(\theta)$} \put(65,40){$m = g(\theta)$} \put(-10,100){$\theta = g^{-1}(m)$} \end{picture} \end{center} \begin{itemize} \item $m = g(\theta)$ are the moment structure equations. \item $\theta = g^{-1}(m)$ is the solution of the moment structure equations. \pause \item In this course, parameters will be identified from $\mathbf{m} = (\boldsymbol{\mu}, \boldsymbol{\Sigma})$ (usually just $\boldsymbol{\Sigma}$), or not at all. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{comment} \begin{frame} \frametitle{Identification from the moments $\boldsymbol{\mu}(\boldsymbol{\theta})$ and $\boldsymbol{\Sigma}(\boldsymbol{\theta})$ or not at all} \pause %\framesubtitle{} \begin{itemize} \item If the distributions are normal, $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ are all there is. \pause \item If the distributions are unknown, we still have $(\overline{\mathbf{D}}_n, \widehat{\boldsymbol{\Sigma}}_n) \stackrel{p}{\rightarrow} (\boldsymbol{\mu}, \boldsymbol{\Sigma})$. \pause \item If the parameters can be recovered from $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, they can be estimated based on $\overline{\mathbf{D}}_n$ and $\widehat{\boldsymbol{\Sigma}}_n$. \pause \item If the parameters cannot be recovered from $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, we are out of luck. \pause \item So in practice, identifiability means identifiability from the moments. \pause \item Usually just $\boldsymbol{\Sigma}$. \end{itemize} \end{frame} \end{comment} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Non-identifiability} \framesubtitle{Parameter is identifiable if the probability distribution of the observable data is a one-to-one function of the parameter.} If two different parameter values yield the same distribution of the data, the parameter is not identifiable. 
\begin{center} \includegraphics[width=3in]{2to1} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % \begin{frame} % \frametitle{Proving that a parameter is \emph{not} identifiable} \pause % \framesubtitle{Based on the moments} % \begin{itemize} % \item You can carefully describe the set of points in the parameter space that yield the same distribution of the observable data. \pause It's a lot of work, even for small models. \pause % \item You can produce a numerical example of two different points that yield the same distribution of the observable data. That settles it. \pause % \item You can use theorems. % \end{itemize} % \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{comment} \begin{frame} \frametitle{Identifiability is a big concept} \pause %\framesubtitle{} \begin{itemize} \item It means \emph{knowability} of the parameters from the distribution of the data. \pause \item We will do simple proofs that show whether certain information can be known. \pause \item Call it the \textbf{algebra of the knowable}. \end{itemize} \end{frame} \end{comment} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Identifiability of \emph{functions} of the parameter vector} \pause \begin{itemize} \item If a function $g(\boldsymbol{\theta})$ can be recovered from the distribution of the observable data, that function of the parameter vector is said to be identifiable. \pause \item This applies to individual parameters and subsets of the parameters. \pause \item Frequently, not everything can be known, but informative \emph{functions} of the parameter are knowable. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Some sample questions will be based on this model:} % How does this fit the definition of a \emph{model}? {\scriptsize Let $W_i = X_i + e_i$, where \begin{itemize} \item $X_i\sim N(\mu_x,\phi)$ \item $e_i \sim N(0,\omega)$ \item $X_i$ and $e_i$ are independent. \item Only $W_i$ is observable ($X_i$ is a latent variable). \end{itemize} \pause } % End size \vspace{3mm} In the following questions, you may use the fact that the normal distribution corresponds uniquely to the pair $(\mu,\sigma^2)$. \pause \begin{enumerate} \item What is the parameter vector $\boldsymbol{\theta}$? \item What is the parameter space $\Theta$? \item What is the probability distribution of the observable data? \item Give the moment structure equations. \item Either prove that the parameter is identifiable, or show by an example that it is not. A simple numerical example is best. \item Give two \emph{functions} of the parameter vector that are identifiable. \end{enumerate} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{A Useful Equivalent Definition of Identifiability} \pause \framesubtitle{Equivalent to $P_\theta$ is a one-to-one function of $\theta$} % on $\Theta$ \begin{itemize} \item Suppose a statistical model implies $\mathbf{D} \sim P_{\boldsymbol{\theta}}, \boldsymbol{\theta} \in \Theta$. If no two points in $\Theta$ yield the same probability distribution, then the parameter $\boldsymbol{\theta}$ is said to be identifiable. \item That is, identifiability means that $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$. 
\pause
\end{itemize}
\begin{center}
\includegraphics[width=3in]{1to1}
\end{center}
\end{frame}
% herehere slide 25
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Pointwise identifiability}
\framesubtitle{As opposed to global identifiability}
\begin{itemize}
% \item Frequently, parameters will be identifiable in some parts of the parameter space but not others.
\item The parameter is said to be identifiable at a point $\boldsymbol{\theta}_0$ if no other point in $\Theta$ yields the same probability distribution as $\boldsymbol{\theta}_0$.
\item That is, $\boldsymbol{\theta} \neq \boldsymbol{\theta}_0$ implies $P_{\boldsymbol{\theta}} \neq P_{\boldsymbol{\theta}_0}$ for all $\boldsymbol{\theta} \in \Theta$.
% \item Let $g(\boldsymbol{\theta})$ be a function of the parameter vector. If $g(\boldsymbol{\theta}_0) \neq g(\boldsymbol{\theta})$ implies $P_{\boldsymbol{\theta}_0} \neq P_{\boldsymbol{\theta}}$ %for all $\boldsymbol{\theta} \in \Theta$, then the function $g(\boldsymbol{\theta})$ is said to be identifiable at the point $\boldsymbol{\theta}_0$. \pause
% \item This just means that $g(\boldsymbol{\theta}_0)$ can be recovered from the distribution of the data \pause (through the moments). \pause
\end{itemize}
If the parameter % (or function of the parameter)
is identifiable at every point in $\Theta$, it is identifiable according to the earlier definitions.
\end{frame}

% Maybe skip this slide but leave it in the text.
% \begin{frame}
% \frametitle{Local identifiability} %\framesubtitle{}
% \begin{itemize}
% \item[] The parameter is said to be \emph{locally identifiable} at a point $\boldsymbol{\theta}_0$ if there is a neighbourhood of points surrounding $\boldsymbol{\theta}_0$, none of which yields the same probability distribution as $\boldsymbol{\theta}_0$. \pause
% \item[] If there is a neighborhood of $\theta_0$ with $P_\theta \neq P_{\theta_0}$ for all $\theta \neq \theta_0$ in the neighborhood, the parameter is said to be \emph{locally identifiable} at $\theta_0$.
% \item[] If the parameter is identifiable at a point, it is locally identifiable there, but local identifiability does not imply pointwise identifiability.
% \end{itemize}
% \end{frame}
% herehere

\begin{frame}
\frametitle{Determining identifiability in practice}
%\framesubtitle{A strictly mathematical task}
\begin{itemize}
\item In practice, identifiability means that the moment structure equations can be solved uniquely for the parameters.
% Repeat earlier picture.
\begin{center}
\begin{picture}(100,100)(40,40) % Size of picture (does not matter), origin
% Play with the origin to position the picture
%\begin{picture}(150,150)(0,0) % Initial settings
%\graphpaper(0,0)(160,160) % Need \usepackage{graphpap} Size should match picture initially
\put(80,130){$\theta$}
\put(160,50){$P_\theta$}
\put(0,50){$m$}
\put(86,131){\vector(1,-1){75}} % theta -> p
\put(157,54){\vector(-1,0){145}} % p -> m
\put(5,57){\vector(1,1){73}} % m -> theta
\put(120,100){$P_\theta = h(\theta)$}
\put(65,40){$m = g(\theta)$}
\put(-10,100){$\theta = g^{-1}(m)$}
\end{picture}
\end{center}
% \item This is a strictly mathematical issue, though it has huge implications for statistical estimation and inference.
\end{itemize}
\pause
\end{frame}

\begin{frame}
\frametitle{Proving identifiability} \pause
%\framesubtitle{}
\begin{itemize}
\item You can explicitly solve the moment structure equations. \pause
\item You can use theorems. \pause
\item We will develop a collection of identifiability rules.
\item These are really simple theorems about the existence of unique real solutions to equations. %, expressed in terms of identifiability.
\pause
\item They are not well-known to mathematicians because they are too specific to be interesting. \pause
\item We will be able to look at a path diagram and verify that the parameters are identifiable. \pause Usually.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame} % This is a deliberate repeat.
\frametitle{Proving that a parameter is \emph{not} identifiable} \pause
% \framesubtitle{Based on the moments}
\begin{itemize}
\item You can carefully describe the set of points in the parameter space that yield the same mean and covariance matrix. It's a lot of work, even for small models. \pause
\item You can produce a numerical example of two different points that yield the same mean and covariance matrix. \pause That settles it, but it can still be a lot of work for big models. \pause
% In the text, maybe in Ch. 1, describe the following trick, which depends on the fact that if the parameter vector is not identifiable at the point where the likelihood reaches its maximum, there will be multiple maxima, all at the same height. The trick is to make up some sensible parameter values, crank out the covariance matrix, and give that covariance matrix to your software as a sample covariance matrix, along with an arbitrary sample size. ``Estimate" the parameters, using several sets of starting values. If you always come to the same parameter estimates and these match your input parameter values, you may become convinced that the parameter is identifiable, at least at that point, though really you have not proved anything. Perhaps if you try enough sets of reasonable parameter values and enough different starting values for each one, you will become completely convinced.
% If, on the other hand, you find more than one stopping place that is different from your input set, and if the value of the likelihood there is the same as at your starting values (in proc calis, the objective function will be zero), then you may have evidence of non-identifiability. To check, calculate the covariance matrix at the stopping place; your software may do this for you, perhaps calling it the ``reproduced covariance matrix." If this matches your input covariance matrix, you have proved non-identifiability, because you have found two different points in the parameter space that yield the same covariance matrix, and hence the same distribution of the observable data (provided the data are normal).
\item You can use a big theorem.
\end{itemize}
\end{frame}

\section{Parameter Count Rule}

\begin{frame}
\frametitle{Theorem}
\framesubtitle{For us, the $x$ variables are parameters and the $y$ variables are moments.}
Let
\begin{eqnarray}
y_1 & = & f_1(x_1, \ldots, x_p) \nonumber \\
y_2 & = & f_2(x_1, \ldots, x_p) \nonumber \\
\vdots & & ~~~~~~~\vdots \nonumber \\
y_q & = & f_q(x_1, \ldots, x_p). \nonumber
\end{eqnarray}
\vspace{3mm}
If the functions $f_1, \ldots, f_q$ are analytic (possessing a Taylor expansion) and $p>q$, the set of points $(x_1, \ldots, x_p)$ where the system of equations has a unique solution occupies at most a set of volume zero in $\mathbb{R}^p$.
\end{frame}

\begin{frame}
\frametitle{The Parameter Count Rule}
\framesubtitle{For establishing non-identifiability} \pause
Suppose identifiability is to be decided based on a set of moment structure equations.
If there are more parameters than equations, the set of points where the parameter vector is identifiable occupies a set of volume zero in the parameter space. \pause \begin{itemize} \item Note that the empty set has volume zero. \item The parameter count rule is really a theorem about the existence of unique real solutions to systems of equations. \item The moment structure equations need to have derivatives and mixed partial derivatives of all orders, but they usually do. \end{itemize} \end{frame} \begin{frame} \frametitle{Back to the example} \framesubtitle{Trying to include measurement error in the model} \begin{itemize} \item Recall the first attempt to include measurement error in the model. \begin{center} % Could make equations and path diagram side by side if I had time. See 2017 Quiz 4. \begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin \put(197,000){$X$} \put(202,4){\circle{20}} \put(210,30){{\footnotesize $\beta_1$}} % Label the arrow X -> Y \put(157,50){\framebox{$W$}} \put(232,50){\framebox{$Y$}} \put(197,15){\vector(-1,1){25}} % X -> W \put(209,15){\vector(1,1){25}} % X -> Y \put(161,95){$e$} \put(165,90){\vector(0,-1){25}} % e -> W \put(236,95){$\epsilon$} \put(240,90){\vector(0,-1){25}} % epsilon -> Y \end{picture} \end{center} \pause \item There were five moment structure equations in seven unknown parameters. \item The model failed the parameter count rule. \item Game over. \end{itemize} \end{frame} \begin{frame} \frametitle{Again: The Parameter Count Rule} %\framesubtitle{For establishing non-identifiability} \pause Suppose identifiability is to be decided based on a set of moment structure equations. If there are more parameters than equations, the set of points where the parameter vector is identifiable occupies a set of volume zero in the parameter space. \pause {\footnotesize \begin{itemize} \item So a necessary condition for parameter identifiability is that there be at least as many moment structure equations as parameters. \item There can be more equations than unknown parameters, and still no unique solution. \pause \item There may be points in the parameter space where the parameter is identifiable, but if so, that set of points has volume zero. \pause \item Failure of the parameter count rule means that it's impossible to identify the whole parameter vector. \pause \item Useful functions of the parameters may be identifiable, maybe including what you really want to know. \pause \item Maximum likelihood estimation depends on identifiability of the entire parameter vector (usually). \end{itemize} } % End size \end{frame} \begin{frame} \frametitle{Example} \framesubtitle{To illustrate the parameter count rule.} \pause There are two latent explanatory variables and two observable response variables. % Put model equations and path diagram side by side. \begin{tabular}{cc} \raisebox{.5in}{\parbox{1.5in} { \begin{eqnarray*} %\label{countingex} Y_1 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_1 \\ Y_2 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_2 \end{eqnarray*} }} % End parbox and then raisebox & \includegraphics[width=2.75in]{CirclePath} \end{tabular} \pause where \begin{itemize} \item $X_1$, $X_2$, $\epsilon_1$ and $\epsilon_2$ are independent normal random variables with expected value zero, and \item $Var(X_1)=Var(X_2)=1$, $Var(\epsilon_1)=\psi_1$ and $Var(\epsilon_2)=\psi_2$. \item Only $Y_1$ and $Y_2$ are observable. \end{itemize} \pause The parameter vector is $\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$. 
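\pause
For example, because $X_1$ and $X_2$ are independent with variance one and independent of the error terms,
\begin{displaymath}
Cov(Y_1,Y_2) = Cov(\beta_1 X_1 + \beta_2 X_2 + \epsilon_1, \; \beta_1 X_1 + \beta_2 X_2 + \epsilon_2)
             = \beta_1^2 + \beta_2^2,
\end{displaymath}
anticipating the covariance matrix calculated on the next slide.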
\end{frame}

\begin{frame}
\frametitle{Calculate the covariance matrix of $(Y_1,Y_2)^\top$}
\framesubtitle{Expected value is (zero, zero)}
\begin{eqnarray*} %\label{countingex}
Y_1 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_1 \\
Y_2 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_2, \nonumber
\end{eqnarray*}
\pause
{\LARGE
\begin{eqnarray*}
\boldsymbol{\Sigma} & = & \left( \begin{array}{c c} \sigma_{1,1} & \sigma_{1,2} \\ \sigma_{1,2} & \sigma_{2,2} \end{array} \right) \\
&& \\
& = & \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\
\beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right)
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Covariance structure equations}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
\begin{eqnarray*} % \label{identeq}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber
\end{eqnarray*}
\pause
\begin{itemize}
\item Three equations in four unknowns, so the model fails the parameter count rule. \pause
\item The parameter count rule does \emph{not} say that a solution is impossible. \pause
\item It says that \emph{the set of points in the parameter space where there is a unique solution (so the parameters are all identifiable) occupies a set of volume zero}. \pause
\item Are there any such points at all?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Try to solve for the parameters}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
% Why is this important?
Covariance structure equations:
% \vspace{5mm}
%{\small
\begin{eqnarray*} % \label{identeq}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber
\end{eqnarray*}
\pause
%} % End size
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$ \pause
\item So those \emph{functions} of the parameter vector are identifiable. \pause
\item What about $\beta_1$ and $\beta_2$?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Can we solve for $\beta_1$ and $\beta_2$?}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
%{\small
\begin{eqnarray*} % \label{identeq}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber
\end{eqnarray*}
\pause
%} % End size
%\vspace{5mm}
\begin{itemize}
\item $\sigma_{1,2} = 0$ ~ if and only if ~ both $\beta_1=0$ and $\beta_2=0$. \pause
\item The set of points where all four parameters can be recovered from the covariance matrix is \emph{exactly} the set of points where the parameter vector is identifiable. \pause
\item It is
\begin{displaymath}
\{(\beta_1, \beta_2, \psi_1, \psi_2): \beta_1=0, \beta_2=0, \psi_1>0, \psi_2>0 \}
\end{displaymath}
\pause
\item A set of infinitely many points in $\mathbb{R}^4$. \pause
\item A set of volume zero, as the theorem says. % file p. 107
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Suppose $\beta_1^2 + \beta_2^2 \neq 0$}
\framesubtitle{This is the case ``almost everywhere" in the parameter space.}
\pause
Consider the set of infinitely many points $\{(\beta_1, \beta_2, \psi_1, \psi_2)\}$ such that
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$
\end{itemize}
\pause
Substitute back into
\begin{displaymath}
cov\left( \begin{array}{c} Y_1 \\ Y_2 \end{array} \right) =
\left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\
\beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right)
\end{displaymath}
\pause
and see that they all produce the covariance matrix
\begin{displaymath}
\boldsymbol{\Sigma} = \left( \begin{array}{c c} \sigma_{1,1} & \sigma_{1,2} \\ \sigma_{1,2} & \sigma_{2,2} \end{array} \right)
\end{displaymath}
and hence the same bivariate normal distribution of $(Y_1,Y_2)^\top$.
\end{frame}

\begin{frame}
\frametitle{Why are there infinitely many points in this set?}
$\{(\beta_1, \beta_2, \psi_1, \psi_2)\}$ such that
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2} \neq 0$
\end{itemize}
\pause
\vspace{15mm}
Because $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$ is the equation of a circle in the $(\beta_1,\beta_2)$ plane with radius $\sqrt{\sigma_{1,2}}$.
\end{frame}

\begin{frame}
\frametitle{Maximum likelihood estimation}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
{ \scriptsize
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\} \\ \pause
L(\boldsymbol{\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-n} \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) + \overline{\mathbf{x}}^\top \boldsymbol{\Sigma}^{-1} \overline{\mathbf{x}} \right\}
\end{eqnarray*}
\pause
} % End size
\vspace{15mm}
Can write the likelihood as either $L(\boldsymbol{\Sigma})$ or $L(\boldsymbol{\Sigma}(\boldsymbol{\theta})) = L_2(\boldsymbol{\theta})$.
\pause
\begin{displaymath}
\boldsymbol{\Sigma}(\boldsymbol{\theta}) = \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\
\beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Likelihood $L_2(\boldsymbol{\theta})$ has non-unique maximum}
%\framesubtitle{}
\begin{itemize}
\item $L(\boldsymbol{\Sigma})$ has a unique maximum at $\boldsymbol{\Sigma} = \widehat{\boldsymbol{\Sigma}}$. \pause
\item For every positive definite $\boldsymbol{\Sigma}$ with $\sigma_{1,2} \neq 0$, there are infinitely many $\boldsymbol{\theta} \in \Theta$ which produce that $\boldsymbol{\Sigma}$, and have the same height of the likelihood. \pause
\item This includes $\widehat{\boldsymbol{\Sigma}}$. \pause
\item So there are infinitely many points $\boldsymbol{\theta}$ in $\Theta$ with $L_2(\boldsymbol{\theta}) = L(\widehat{\boldsymbol{\Sigma}})$. \pause
\item A circle in $\mathbb{R}^4$.
\end{itemize} \end{frame} \begin{frame} \frametitle{A circle in $\mathbb{R}^4$ where the likelihood is maximal} %\framesubtitle{} {\LARGE $\{(\beta_1, \beta_2, \psi_1, \psi_2)\} \subset \mathbb{R}^4$ such that \begin{itemize} \item $\psi_1 = \widehat{\sigma}_{1,1}-\widehat{\sigma}_{1,2}$ \item $\psi_2 = \widehat{\sigma}_{2,2} - \widehat{\sigma}_{1,2}$ \item $\beta_1^2 + \beta_2^2 = \widehat{\sigma}_{1,2}$ \end{itemize} } % End size \end{frame} \begin{frame} \frametitle{Some Questions} % \framesubtitle{About model correctness} Remembering that if the model is true, \begin{itemize} \item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$ \item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$ \item $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$ \end{itemize} \pause What would happen in the numerical search for $\widehat{\boldsymbol{\theta}}$ if \dots \begin{itemize} \item $\widehat{\sigma}_{1,2} > \widehat{\sigma}_{1,1}$? \pause \item $\widehat{\sigma}_{1,2} > \widehat{\sigma}_{2,2}$? \pause \item $\widehat{\sigma}_{1,2} < 0$? \pause \end{itemize} These could not \emph{all} happen, but one of them could. \pause When numerical maximum likelihood search leaves the parameter space, it may indicate that the model is incorrect. \pause Or it might be just a bad starting value. % Could the maximum of the likelihood function be outside the parameter space? \end{frame} \begin{frame} \frametitle{Testing hypotheses about $\boldsymbol{\theta}$} %\framesubtitle{} Some hypotheses are testable if the model is true, but direct likelihood ratio tests are out. All the theory depends on a unique maximum. \pause \vspace{5mm} Remember, \begin{displaymath} cov\left( \begin{array}{c} Y_1 \\ Y_2 \end{array} \right) = \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\ \beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right) \end{displaymath} \pause % \vspace{5mm} \begin{itemize} \item How would you test $H_0:\beta_1=\beta_2=0$? \pause \item If you did a large-sample likelihood ratio test, what would the degrees of freedom be? \end{itemize} \end{frame} \begin{frame} \frametitle{Lessons from this example} %\framesubtitle{} { \footnotesize \begin{itemize} \item A parameter may be identifiable at some points but not others. \pause \item Identifiability at infinitely many points is possible even if there are more unknowns than equations. But this can only happen on a set of volume zero. \pause \item Some parameters and functions of the parameters may be identifiable even when the whole parameter vector is not. \pause \item Lack of identifiability can produce multiple maxima of the likelihood function -- even infinitely many. \pause \item A model whose parameter vector is not identifiable may still be falsified by empirical data. \pause \item Numerical maximum likelihood search may leave the parameter space. This may be a sign that the model is false. It can happen when the parameter is identifiable, too. \pause \item Some hypotheses may be testable when the parameter is not identifiable, \pause but these will be hypotheses about functions of the parameter that are identifiable in the part of the parameter space where the null hypothesis is true. \pause $H_0:\beta_1=\beta_2=0$ \end{itemize} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Copyright Information} This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. 
It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}.
Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/brunner/oldclass/2053f22}
{\small\texttt{http://www.utstat.toronto.edu/brunner/oldclass/2053f22}}
\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}
%\framesubtitle{}
\begin{itemize}
\item
\item
\item
\end{itemize}
\end{frame}

{\LARGE
\begin{displaymath}

\end{displaymath}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Identifiability of \emph{functions} of the parameter vector}
If $g(\boldsymbol{\theta}_1) \neq g(\boldsymbol{\theta}_2)$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$ for all $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ in $\Theta$, then the function $g(\boldsymbol{\theta})$ is said to be identifiable.
\end{frame}

\begin{frame}
\frametitle{Two different parameter values yielding the same distribution of the data}
\framesubtitle{Inverse function does not exist}
\begin{center}
\includegraphics[width=3in]{2to1}
\end{center}
\end{frame}