\documentclass[serif]{beamer} % Get Computer Modern math font.
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Including Measurement Error in the Regression Model: A First Try\footnote{See last slide for copyright information.}}
\subtitle{STA431 Winter/Spring 2013}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Models with no measurement error}

\begin{frame}
\frametitle{Unconditional regression without measurement error}
%\framesubtitle{}
Independently for $i=1, \ldots, n$, let
\begin{equation*}
Y_i = \beta_0 + \beta_1 X_i + \epsilon_i
\end{equation*}
where
\begin{itemize}
\item $X_i$ is normally distributed with mean $\mu_x$ and variance $\phi>0$
\item $\epsilon_i$ is normally distributed with mean zero and variance $\psi>0$
\item $X_i$ and $\epsilon_i$ are independent.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$}
%\framesubtitle{Using the Centering Rule}
Pairs $(X_i,Y_i)$ are bivariate normal, with
\begin{displaymath}
E\left( \begin{array}{c} X_i \\ Y_i \end{array} \right) = \boldsymbol{\mu} =
\left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) =
\left( \begin{array}{c} \mu_x \\ \beta_0 + \beta_1\mu_x \end{array} \right),
\end{displaymath}
and variance-covariance matrix
\begin{displaymath}
V\left( \begin{array}{c} X_i \\ Y_i \end{array} \right) = \boldsymbol{\Sigma} = [\sigma_{i,j}] =
\left[ \begin{array}{c c} \phi & \beta_1 \phi \\
                          \beta_1 \phi & \beta_1^2 \phi + \psi \end{array} \right].
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Moments and Moment Structure Equations}
\begin{itemize}
\item \emph{Moments} of a distribution are quantities such as $E(X)$, $E(Y^2)$, $Var(X)$, $E(X^2Y^2)$, $Cov(X,Y)$, and so on.
\item \emph{Moment structure equations} are a set of equations expressing moments of the distribution of the data in terms of the model parameters.
\item If the moments involved are limited to variances and covariances, the moment structure equations are called \emph{covariance structure equations}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Moment structure equations for the regression model}
\framesubtitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$}
{\small
\begin{eqnarray*}
\mu_1 & = & \mu_x \\
\mu_2 & = & \beta_0 + \beta_1\mu_x \\
\sigma_{1,1} & = & \phi \\
\sigma_{1,2} & = & \beta_1 \phi \\
\sigma_{2,2} & = & \beta_1^2 \phi + \psi
\end{eqnarray*}
Solve 5 equations in 5 unknowns to get
\begin{eqnarray*}
\mu_x & = & \mu_1 \\
\beta_0 & = & \mu_2 - \frac{\sigma_{1,2}}{\sigma_{1,1}}\mu_1 \\
\beta_1 & = & \frac{\sigma_{1,2}}{\sigma_{1,1}} \\
\phi & = & \sigma_{1,1} \\
\psi & = & \sigma_{2,2} - \frac{\sigma_{1,2}^2}{\sigma_{1,1}}.
\end{eqnarray*}
} % End size
\end{frame}
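% Added illustration (not in the original deck): a brief sketch of the calculation
% behind the covariance structure equations above, using only the model equation
% and the independence of $X_i$ and $\epsilon_i$.
\begin{frame}
\frametitle{Where the covariance structure equations come from}
\framesubtitle{A sketch of the calculation}
Using $Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$ with $X_i$ and $\epsilon_i$ independent,
\begin{eqnarray*}
Cov(X_i,Y_i) & = & Cov(X_i, \, \beta_0 + \beta_1 X_i + \epsilon_i) \\
 & = & \beta_1 Var(X_i) + Cov(X_i,\epsilon_i) \;\; = \;\; \beta_1 \phi \\
Var(Y_i) & = & \beta_1^2 Var(X_i) + Var(\epsilon_i) + 2\beta_1 Cov(X_i,\epsilon_i) \\
 & = & \beta_1^2 \phi + \psi,
\end{eqnarray*}
since $Cov(X_i,\epsilon_i)=0$.
\end{frame}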
\begin{frame}
\frametitle{Nice one-to-one relationship}
%\framesubtitle{}
\begin{itemize}
\item The parameters of the normal regression model stand in a one-to-one relationship with the mean and covariance matrix of the bivariate normal distribution of the observable data.
\item There are exactly as many moments (means, variances and covariances) as there are parameters in the regression model.
\item In fact, the two sets of parameter values are 100\% equivalent; they are just different ways of expressing the same thing.
\item By the \emph{Invariance Principle}, the MLEs have the same relationship.
\item Just put hats on everything.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Invariance Principle: MLE of a function is that function of the MLE}
\framesubtitle{No need for numerical maximum likelihood in this case}
\begin{eqnarray*}
\widehat{\mu}_x & = & \widehat{\mu}_1 = \overline{x} \\
\widehat{\beta}_0 & = & \overline{y} - \frac{\widehat{\sigma}_{1,2}}{\widehat{\sigma}_{1,1}}\overline{x} \\
\widehat{\beta}_1 & = & \frac{\widehat{\sigma}_{1,2}}{\widehat{\sigma}_{1,1}} \\
\widehat{\phi} & = & \widehat{\sigma}_{1,1} \\
\widehat{\psi} & = & \widehat{\sigma}_{2,2} - \frac{\widehat{\sigma}_{1,2}^2}{\widehat{\sigma}_{1,1}}.
\end{eqnarray*}
\end{frame}
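% Added illustration (not in the original deck): a small numerical example of the
% invariance principle. The sample moments below are made up purely for illustration.
\begin{frame}
\frametitle{A small numerical illustration}
\framesubtitle{With made-up sample moments}
Suppose $\overline{x}=5$, $\overline{y}=12$, $\widehat{\sigma}_{1,1}=4$,
$\widehat{\sigma}_{1,2}=6$ and $\widehat{\sigma}_{2,2}=13$. Then
\begin{eqnarray*}
\widehat{\mu}_x & = & 5 \\
\widehat{\beta}_1 & = & 6/4 = 1.5 \\
\widehat{\beta}_0 & = & 12 - 1.5 \times 5 = 4.5 \\
\widehat{\phi} & = & 4 \\
\widehat{\psi} & = & 13 - \frac{6^2}{4} = 4.
\end{eqnarray*}
No numerical search is required.
\end{frame}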
\begin{frame}
\frametitle{Important process}
\begin{itemize}
\item Calculate the moments of the distribution (usually means, variances and covariances) in terms of the model parameters, obtaining a system of moment structure equations.
\item Solve the moment structure equations for the parameters, expressing the parameters in terms of the moments.
\end{itemize}
\vspace{10mm}
The solutions can be used to estimate the parameters. Later, we will use this process to check whether successful estimation is even possible at all.
\end{frame}

\begin{frame}
\frametitle{Multivariate multiple regression}
\begin{displaymath}
\mathbf{Y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{X}_i + \boldsymbol{\epsilon}_i
\end{displaymath}
{\footnotesize
where
\begin{itemize}
\item[] $\mathbf{Y}_i$ is a $q \times 1$ random vector of observable response variables, so the regression can be multivariate; there are $q$ response variables.
\item[] $\boldsymbol{\beta}_0$ is a $q \times 1$ vector of unknown constants, the intercepts for the $q$ regression equations. There is one for each response variable.
\item[] $\mathbf{X}_i$ is a $p \times 1$ observable random vector; there are $p$ explanatory variables. $\mathbf{X}_i$ has expected value $\boldsymbol{\mu}_x$ and variance-covariance matrix $\boldsymbol{\Phi}$, a $p \times p$ symmetric and positive definite matrix of unknown constants.
\item[] $\boldsymbol{\beta}_1$ is a $q \times p$ matrix of unknown constants. These are the regression coefficients, with one row for each response variable and one column for each explanatory variable.
\item[] $\boldsymbol{\epsilon}_i$ is the error term of the regression. It is a $q \times 1$ multivariate normal random vector with expected value zero and variance-covariance matrix $\boldsymbol{\Psi}$, a $q \times q$ symmetric and positive definite matrix of unknown constants. $\boldsymbol{\epsilon}_i$ is independent of $\mathbf{X}_i$.
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{Data vectors are multivariate normal}
\begin{displaymath}
\mathbf{D}_i = \left( \begin{array}{c} \mathbf{X}_i \\ \hline \mathbf{Y}_i \end{array} \right)
\end{displaymath}
\begin{itemize}
\item $\mathbf{D}_i \sim N(\boldsymbol{\mu}, \boldsymbol{\Sigma})$
\item Write $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ as partitioned matrices (matrices of matrices).
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Write $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ as partitioned matrices}
\begin{displaymath}
\boldsymbol{\mu} = \left( \begin{array}{c} E(\mathbf{X}_i) \\ \hline E(\mathbf{Y}_i) \end{array} \right)
 = \left( \begin{array}{c} \boldsymbol{\mu}_1 \\ \hline \boldsymbol{\mu}_2 \end{array} \right)
\end{displaymath}
and
\begin{displaymath}
\boldsymbol{\Sigma} = V\left( \begin{array}{c} \mathbf{X}_i \\ \hline \mathbf{Y}_i \end{array} \right)
 = \left( \begin{array}{c|c} V(\mathbf{X}_i) & C(\mathbf{X}_i,\mathbf{Y}_i) \\ \hline
          C(\mathbf{X}_i,\mathbf{Y}_i)^\prime & V(\mathbf{Y}_i) \end{array} \right)
 = \left( \begin{array}{c|c} \boldsymbol{\Sigma}_{11} & \boldsymbol{\Sigma}_{12} \\ \hline
          \boldsymbol{\Sigma}_{12}^\prime & \boldsymbol{\Sigma}_{22} \end{array} \right)
\end{displaymath}
\vspace{5mm}
\begin{itemize}
\item[] Calculate $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ in terms of model parameters to get moment structure equations.
\item[] $\boldsymbol{\theta} = (\boldsymbol{\beta}_0, \boldsymbol{\mu}_x, \boldsymbol{\Phi}, \boldsymbol{\beta}_1, \boldsymbol{\Psi})$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Moment structure equations}
Based on
\begin{itemize}
\item[] $\mathbf{Y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{X}_i + \boldsymbol{\epsilon}_i$
\item[] $\boldsymbol{\theta} = (\boldsymbol{\beta}_0, \boldsymbol{\mu}_x, \boldsymbol{\Phi}, \boldsymbol{\beta}_1, \boldsymbol{\Psi})$
\end{itemize}
\begin{eqnarray*}
\boldsymbol{\mu}_1 & = & \boldsymbol{\mu}_x \\
\boldsymbol{\mu}_2 & = & \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \boldsymbol{\mu}_x \\
\boldsymbol{\Sigma}_{11} & = & \boldsymbol{\Phi} \\
\boldsymbol{\Sigma}_{12} & = & \boldsymbol{\Phi\beta}_1^\prime \\
\boldsymbol{\Sigma}_{22} & = & \boldsymbol{\beta}_1 \boldsymbol{\Phi\beta}_1^\prime + \boldsymbol{\Psi}.
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Solve moment structure equations for the parameters}
\framesubtitle{$\boldsymbol{\theta} = (\boldsymbol{\beta}_0, \boldsymbol{\mu}_x, \boldsymbol{\Phi}, \boldsymbol{\beta}_1, \boldsymbol{\Psi})$}
\begin{eqnarray*}
\boldsymbol{\beta}_0 & = & \boldsymbol{\mu}_2 - \boldsymbol{\Sigma}_{12}^\prime \boldsymbol{\Sigma}_{11}^{-1} \boldsymbol{\mu}_1 \\
\boldsymbol{\mu}_x & = & \boldsymbol{\mu}_1 \\
\boldsymbol{\Phi}_{~} & = & \boldsymbol{\Sigma}_{11} \\
\boldsymbol{\beta}_1 & = & \boldsymbol{\Sigma}_{12}^\prime \boldsymbol{\Sigma}_{11}^{-1} \\
\boldsymbol{\Psi}_{~} & = & \boldsymbol{\Sigma}_{22} - \boldsymbol{\Sigma}_{12}^\prime \boldsymbol{\Sigma}_{11}^{-1}\boldsymbol{\Sigma}_{12}
\end{eqnarray*}
\vspace{10mm}

Just put hats on everything to get MLEs.
\end{frame}
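% Added note (not in the original deck): a sketch of how the matrix solution
% specializes to the earlier scalar formulas when p = q = 1.
\begin{frame}
\frametitle{Special case: one explanatory and one response variable}
\framesubtitle{A sketch connecting back to the scalar formulas}
With $p=q=1$, the matrices reduce to scalars and the solution becomes
\begin{eqnarray*}
\beta_0 & = & \mu_2 - \frac{\sigma_{1,2}}{\sigma_{1,1}}\mu_1 \\
\beta_1 & = & \frac{\sigma_{1,2}}{\sigma_{1,1}} \\
\phi & = & \sigma_{1,1} \\
\psi & = & \sigma_{2,2} - \frac{\sigma_{1,2}^2}{\sigma_{1,1}},
\end{eqnarray*}
exactly the expressions obtained for the simple regression model.
\end{frame}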
\begin{frame}
\frametitle{But let's admit it}
{\LARGE
In most applications, the explanatory variables are measured with error.
} % End size
\end{frame}

\section{A first try}

\begin{frame}
\frametitle{A first try at including measurement error}
%\framesubtitle{In the explanatory variable}
Independently for $i=1, \ldots, n$, let
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\
W_i &=& \nu + X_i + e_i,
\end{eqnarray*}
where
\begin{itemize}
\item $X_i$ is normally distributed with mean $\mu_x$ and variance $\phi>0$
\item $\epsilon_i$ is normally distributed with mean zero and variance $\psi>0$
\item $e_i$ is normally distributed with mean zero and variance $\omega>0$
\item $X_i, e_i, \epsilon_i$ are all independent.
\end{itemize}
The data are just the pairs $(W_i,Y_i)$ for $i=1, \ldots, n$.
\end{frame}

\begin{frame}
\frametitle{Model implies that the $(W_i,Y_i)$ are independent bivariate normal}
\begin{displaymath}
E\left( \begin{array}{c} W_i \\ Y_i \end{array} \right) = \boldsymbol{\mu} =
\left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) =
\left( \begin{array}{c} \mu_x+\nu \\ \beta_0 + \beta_1\mu_x \end{array} \right),
\end{displaymath}
and variance-covariance matrix
\begin{displaymath}
V\left( \begin{array}{c} W_i \\ Y_i \end{array} \right) = \boldsymbol{\Sigma} = [\sigma_{i,j}] =
\left( \begin{array}{c c} \phi+\omega & \beta_1 \phi \\
                          \beta_1 \phi & \beta_1^2 \phi + \psi \end{array} \right).
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Big problem}
\framesubtitle{Revealed by the moment structure equations}
\begin{eqnarray*}
\mu_1 & = & \mu_x + \nu \\
\mu_2 & = & \beta_0 + \beta_1\mu_x \\
\sigma_{1,1} & = & \phi+\omega \\
\sigma_{1,2} & = & \beta_1 \phi \\
\sigma_{2,2} & = & \beta_1^2 \phi + \psi
\end{eqnarray*}
\vspace{10mm}

It is impossible to solve these five equations for the seven model parameters.
\end{frame}

\begin{frame}
\frametitle{Impossible to solve the moment structure equations for the parameters}
Even with perfect knowledge of the probability distribution of the data (for the multivariate normal, that means knowing $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, period), it would be impossible to know the model parameters.
\end{frame}

\begin{frame}
\frametitle{A numerical example}
{\small
\begin{eqnarray*}
\left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) & = &
\left( \begin{array}{c} \mu_x+\nu \\ \beta_0 + \beta_1\mu_x \end{array} \right) \\
\left( \begin{array}{c c} \sigma_{11} & \sigma_{12} \\ & \sigma_{22} \end{array} \right) & = &
\left( \begin{array}{c c} \phi+\omega & \beta_1 \phi \\ & \beta_1^2 \phi + \psi \end{array} \right)
\end{eqnarray*}
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|c|}
\hline
 & $\mu_x$ & $\beta_0$ & $\nu$ & $\beta_1$ & $\phi$ & $\omega$ & $\psi$ \\ \hline
$\boldsymbol{\theta}_1$ & 0 & 0 & 0 & 1 & 2 & 2 & 3 \\ \hline
$\boldsymbol{\theta}_2$ & 0 & 0 & 0 & 2 & 1 & 3 & 1 \\ \hline
\end{tabular}
\end{center}
Both $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$ imply a bivariate normal distribution with mean zero and covariance matrix
\begin{displaymath}
\boldsymbol{\Sigma} = \left[ \begin{array}{r r} 4 & 2 \\ 2 & 5 \end{array} \right],
\end{displaymath}
and thus the same distribution of the sample data.
} % End size
\end{frame}
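% Added illustration (not in the original deck): checking the numerical example
% by substituting both parameter vectors into the moment structure equations.
\begin{frame}
\frametitle{Checking the numerical example}
\framesubtitle{Substitute into the moment structure equations}
For $\boldsymbol{\theta}_1$: \quad $\phi+\omega = 2+2 = 4$, \quad $\beta_1\phi = 1 \times 2 = 2$, \quad $\beta_1^2\phi+\psi = 1 \times 2 + 3 = 5$.

\vspace{5mm}
For $\boldsymbol{\theta}_2$: \quad $\phi+\omega = 1+3 = 4$, \quad $\beta_1\phi = 2 \times 1 = 2$, \quad $\beta_1^2\phi+\psi = 4 \times 1 + 1 = 5$.

\vspace{5mm}
Both parameter vectors yield $\mu_1 = \mu_2 = 0$ and the same $\boldsymbol{\Sigma}$, so the data cannot distinguish between them.
\end{frame}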
\section{Identifiability}

\begin{frame}
\frametitle{Parameter Identifiability}
%\framesubtitle{}
\begin{itemize}
\item No matter how large the sample size, it will be impossible to decide between $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$, because they imply exactly the same probability distribution of the observable data.
\item The problem here is that the parameters of the regression are not \emph{identifiable}.
\item The model parameters cannot be recovered from the distribution of the sample data.
\item And all you can ever learn from sample data is the distribution from which they come.
\item So there will be problems using the sample data for estimation and inference.
\item This is true even when \emph{the model is completely correct.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Definitions}
\framesubtitle{Connected to parameter identifiability}
\begin{itemize}
\item A \emph{Statistical Model} is a set of assertions that partly specify the probability distribution of a set of observable data.
\item Suppose a statistical model implies $\mathbf{D} \sim P_{\boldsymbol{\theta}}, \boldsymbol{\theta} \in \Theta$. If no two points in $\Theta$ yield the same probability distribution, then the parameter $\boldsymbol{\theta}$ is said to be \emph{identifiable}.
\item That is, identifiability means that $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$.
\item On the other hand, if there exist distinct $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$ in $\Theta$ with $P_{\boldsymbol{\theta}_1} = P_{\boldsymbol{\theta}_2}$, the parameter $\boldsymbol{\theta}$ is \emph{not identifiable.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An equivalent definition}
\framesubtitle{Proof of equivalence deferred for now}
\begin{itemize}
\item If the parameter vector is a function of the probability distribution of the observable data, it is identifiable.
\item[]
\item That is, if the parameter vector can somehow be recovered from the distribution of the data, it is identifiable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Regression models with no measurement error}
%\framesubtitle{}
\begin{itemize}
\item The mean and covariance matrix are functions of the probability distribution (calculate expected values).
\item We solved for all the parameters from the mean and covariance matrix.
\item Therefore the parameters are a function of the probability distribution.
\item Thus they are identifiable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Identifiability is a big concept}
%\framesubtitle{}
\begin{itemize}
\item It means \emph{knowability} of the parameters from the distribution of the data.
\item We will do mathematical proofs that show whether certain information can be known.
\item Call it the \textbf{algebra of the knowable}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Theorem}
If the parameter vector is not identifiable, consistent estimation for all points in the parameter space is impossible.
\vspace{5mm}
\begin{center}
\includegraphics[width=3in]{consistent}
\end{center}
\vspace{5mm}
\begin{itemize}
\item Suppose $\theta_1 \neq \theta_2$ but $P_{\theta_1} = P_{\theta_2}$.
\item $T_n = T_n(D_1, \ldots, D_n)$ is a consistent estimator of $\theta$ for all $\theta \in \Theta$.
\item The distribution of $T_n$ is identical for $\theta_1$ and $\theta_2$.
\item So $T_n$ cannot converge in probability to both $\theta_1$ and $\theta_2$: contradiction.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Identifiability of \emph{functions} of the parameter vector}
If $g(\boldsymbol{\theta}_1) \neq g(\boldsymbol{\theta}_2)$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$ for all $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ in $\Theta$, the function $g(\boldsymbol{\theta})$ is said to be identifiable.
\end{frame}
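% Added illustration (not in the original deck): identifiable functions of the
% parameter vector in the measurement error regression model introduced earlier.
\begin{frame}
\frametitle{Example: identifiable functions in the first try at measurement error}
\framesubtitle{Even though $\boldsymbol{\theta}$ itself is not identifiable}
In the model with $Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$ and $W_i = \nu + X_i + e_i$,
\begin{itemize}
\item $\mu_x + \nu = \mu_1$, \quad $\phi+\omega = \sigma_{1,1}$ \quad and \quad $\beta_1\phi = \sigma_{1,2}$ are functions of the distribution of the observable data, so they are identifiable.
\item Because $\phi>0$, the \emph{sign} of $\beta_1$ equals the sign of $\sigma_{1,2}$, so it is identifiable too.
\item But $\beta_1$ itself is not, as the numerical example showed.
\end{itemize}
\end{frame}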
\begin{frame}
\frametitle{Some sample questions will be based on this model:}
Let $W = X + e$, where
\begin{itemize}
\item $X \sim N(\mu,\phi)$
\item $e \sim N(0,\omega)$
\item $X$ and $e$ are independent.
\item Only $W$ is observable ($X$ is a latent variable).
\end{itemize}
\vspace{5mm}
How does this fit the definition of a \emph{model}?
\end{frame}

\begin{frame}
\frametitle{Sample questions}
{\scriptsize
Let $W = X + e$, where
\begin{itemize}
\item $X \sim N(\mu,\phi)$
\item $e \sim N(0,\omega)$
\item $X$ and $e$ are independent.
\item Only $W$ is observable ($X$ is a latent variable).
\end{itemize}
} % End size
\vspace{5mm}
In the following questions, you may use the fact that a normal distribution corresponds uniquely to the pair $(\mu,\sigma^2)$.
\begin{enumerate}
\item What is the parameter vector $\boldsymbol{\theta}$?
\item What is the parameter space $\Theta$?
\item What is the probability distribution of the observable data?
\item Give the moment structure equations.
\item Either prove that the parameter is identifiable, or show by an example that it is not. A simple numerical example is best.
\item Give two \emph{functions} of the parameter vector that are identifiable.
\end{enumerate}
\end{frame}

\begin{frame}
\frametitle{Pointwise identifiability}
\framesubtitle{As opposed to global identifiability}
\begin{itemize}
\item The parameter is said to be \emph{identifiable} at a point $\boldsymbol{\theta}_0$ if no other point in $\Theta$ yields the same probability distribution as $\boldsymbol{\theta}_0$.
\item That is, $\boldsymbol{\theta} \neq \boldsymbol{\theta}_0$ implies $P_{\boldsymbol{\theta}} \neq P_{\boldsymbol{\theta}_0}$ for all $\boldsymbol{\theta} \in \Theta$.
\item Let $g(\boldsymbol{\theta})$ be a function of the parameter vector. If $g(\boldsymbol{\theta}_0) \neq g(\boldsymbol{\theta})$ implies $P_{\boldsymbol{\theta}_0} \neq P_{\boldsymbol{\theta}}$ for all $\boldsymbol{\theta} \in \Theta$, then the function $g(\boldsymbol{\theta})$ is said to be identifiable at the point $\boldsymbol{\theta}_0$.
\end{itemize}
If the parameter (or function of the parameter) is identifiable at every point in $\Theta$, it is identifiable according to the earlier definitions.
\end{frame}

% There's local identifiability, too -- later

\section{Parameter Count Rule}

\begin{frame}
\frametitle{The Parameter Count Rule}
\framesubtitle{A necessary but not sufficient condition for identifiability}
Suppose identifiability is to be decided based on a set of moment structure equations. If there are more parameters than equations, the set of points where the parameter vector is identifiable occupies a set of volume zero in the parameter space.
\vspace{5mm}

So a necessary condition for parameter identifiability is that there be at least as many moment structure equations as parameters.
\end{frame}

\begin{frame}
\frametitle{Example}
\framesubtitle{Two latent explanatory variables}
\begin{eqnarray*}
Y_1 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_1 \\
Y_2 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_2,
\end{eqnarray*}
\vspace{5mm}
where
\begin{itemize}
\item $X_1$, $X_2$, $\epsilon_1$ and $\epsilon_2$ are independent normal random variables with expected value zero, and
\item $Var(X_1)=Var(X_2)=1$, $Var(\epsilon_1)=\psi_1$ and $Var(\epsilon_2)=\psi_2$.
\item The parameter vector is $\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$.
\item Only $Y_1$ and $Y_2$ are observable.
\end{itemize}
\end{frame}
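% Added illustration (not in the original deck): a sketch of the covariance
% calculation behind the matrix displayed on the next slide.
\begin{frame}
\frametitle{Sketch of the covariance calculation}
\framesubtitle{Using independence and $Var(X_1)=Var(X_2)=1$}
\begin{eqnarray*}
Cov(Y_1,Y_2) & = & Cov(\beta_1 X_1 + \beta_2 X_2 + \epsilon_1, \;
                       \beta_1 X_1 + \beta_2 X_2 + \epsilon_2) \\
 & = & \beta_1^2 Var(X_1) + \beta_2^2 Var(X_2) \;\; = \;\; \beta_1^2 + \beta_2^2 \\
Var(Y_1) & = & \beta_1^2 Var(X_1) + \beta_2^2 Var(X_2) + Var(\epsilon_1)
 \;\; = \;\; \beta_1^2 + \beta_2^2 + \psi_1 \\
Var(Y_2) & = & \beta_1^2 + \beta_2^2 + \psi_2
\end{eqnarray*}
All the cross terms vanish by independence.
\end{frame}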
\begin{frame}
\frametitle{Calculate the covariance matrix of $(Y_1,Y_2)^\prime$}
% \framesubtitle{Expected value is (zero, zero)}
\begin{eqnarray*}
Y_1 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_1 \\
Y_2 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_2,
\end{eqnarray*}
\vspace{5mm}
{\LARGE
\begin{eqnarray*}
\boldsymbol{\Sigma} & = & \left( \begin{array}{c c} \sigma_{1,1} & \sigma_{1,2} \\
                                                    \sigma_{1,2} & \sigma_{2,2} \end{array} \right) \\
 && \\
 & = & \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\
                                \beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right)
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Covariance structure equations}
% \framesubtitle{Obtained by calculating $\boldsymbol{\Sigma}$}
\begin{eqnarray*}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2
\end{eqnarray*}
\vspace{5mm}
\begin{itemize}
\item Three equations in four unknowns.
\item The parameter count rule does \emph{not} say that a solution is impossible.
\item It says that \emph{the set of points in the parameter space where there is a unique solution (so the parameters are all identifiable) occupies a set of volume zero}.
\item Are there any such points at all?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Try to solve for the parameters}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
Why is this important?
\vspace{5mm}
\begin{eqnarray*}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2
\end{eqnarray*}
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item So those \emph{functions} of the parameter vector are identifiable.
\item What about $\beta_1$ and $\beta_2$?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Can we solve for $\beta_1$ and $\beta_2$?}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
\begin{eqnarray*}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2
\end{eqnarray*}
\begin{itemize}
\item $\sigma_{1,2} = 0$ ~if and only if~ both $\beta_1=0$ and $\beta_2=0$.
\item The set of points where all four parameters can be recovered from the covariance matrix is \emph{exactly} the set of points where the parameter vector is identifiable.
\item It is
\begin{displaymath}
\{(\beta_1, \beta_2, \psi_1, \psi_2): \beta_1=0, \beta_2=0, \psi_1>0, \psi_2>0 \}
\end{displaymath}
\item A set of infinitely many points in $\mathbb{R}^4$
\item A set of volume zero, as the theorem says.
\end{itemize}
\end{frame}
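% Added illustration (not in the original deck): two specific parameter vectors on
% the circle, with made-up covariance values, showing non-identifiability concretely.
\begin{frame}
\frametitle{Two points that cannot be told apart}
\framesubtitle{A made-up numerical illustration}
Suppose $\sigma_{1,1}=4$, $\sigma_{1,2}=2$ and $\sigma_{2,2}=5$. Then
$\psi_1 = 2$, $\psi_2 = 3$ and $\beta_1^2+\beta_2^2 = 2$, so both
\begin{displaymath}
(\beta_1, \beta_2, \psi_1, \psi_2) = (\sqrt{2}, \, 0, \, 2, \, 3)
\mbox{ ~and~ }
(\beta_1, \beta_2, \psi_1, \psi_2) = (1, \, 1, \, 2, \, 3)
\end{displaymath}
produce the same covariance matrix, and hence the same distribution of $(Y_1,Y_2)^\prime$.
\end{frame}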
\begin{frame}
\frametitle{Suppose $\beta_1^2 + \beta_2^2 \neq 0$}
\framesubtitle{This is the case ``almost everywhere'' in the parameter space.}
The set of infinitely many points $\{(\beta_1, \beta_2, \psi_1, \psi_2)\}$ such that
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2} \neq 0$
\end{itemize}
all produce the covariance matrix
\begin{displaymath}
\boldsymbol{\Sigma} = \left( \begin{array}{c c} \sigma_{1,1} & \sigma_{1,2} \\
                                                \sigma_{1,2} & \sigma_{2,2} \end{array} \right),
\end{displaymath}
and hence the same bivariate normal distribution of $(Y_1,Y_2)^\prime$.
\end{frame}

\begin{frame}
\frametitle{Why are there infinitely many points in this set?}
$\{(\beta_1, \beta_2, \psi_1, \psi_2)\}$ such that
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2} \neq 0$
\end{itemize}
\vspace{15mm}

Because $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$ is the equation of a circle with radius $\sqrt{\sigma_{1,2}}$.
\end{frame}

\begin{frame}
\frametitle{Maximum likelihood estimation}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
{\scriptsize
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2}
  \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1})
  + (\overline{\mathbf{x}}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}
    (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\} \\
L(\boldsymbol{\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-n}
  \exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1})
  + \overline{\mathbf{x}}^\prime \boldsymbol{\Sigma}^{-1} \overline{\mathbf{x}} \right\}
\end{eqnarray*}
} % End size
\vspace{15mm}

We can write the likelihood as either $L(\boldsymbol{\Sigma})$ or $L(\boldsymbol{\Sigma}(\boldsymbol{\theta})) = L_2(\boldsymbol{\theta})$, where
\begin{displaymath}
\boldsymbol{\Sigma}(\boldsymbol{\theta}) =
\left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\
                         \beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right).
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Likelihood $L_2(\boldsymbol{\theta})$ has a non-unique maximum}
%\framesubtitle{}
\begin{itemize}
\item $L(\boldsymbol{\Sigma})$ has a unique maximum at $\boldsymbol{\Sigma} = \widehat{\boldsymbol{\Sigma}}$.
\item For every positive definite $\boldsymbol{\Sigma}$ with $\sigma_{1,2} > 0$, there are infinitely many $\boldsymbol{\theta} \in \Theta$ which produce that $\boldsymbol{\Sigma}$, and they all give the same height of the likelihood.
\item Provided $\widehat{\sigma}_{1,2} > 0$, this includes $\boldsymbol{\Sigma} = \widehat{\boldsymbol{\Sigma}}$.
\item So there are infinitely many points $\boldsymbol{\theta}$ in $\Theta$ with $L_2(\boldsymbol{\theta}) = L(\widehat{\boldsymbol{\Sigma}})$.
\item A circle in $\mathbb{R}^4$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A circle in $\mathbb{R}^4$ where the likelihood is maximal}
%\framesubtitle{}
{\LARGE
$\{(\beta_1, \beta_2, \psi_1, \psi_2)\} \subset \mathbb{R}^4$ such that
\begin{itemize}
\item $\psi_1 = \widehat{\sigma}_{1,1}-\widehat{\sigma}_{1,2}$
\item $\psi_2 = \widehat{\sigma}_{2,2} - \widehat{\sigma}_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \widehat{\sigma}_{1,2}$
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{What if \dots}
% \framesubtitle{Pathological}
\begin{itemize}
\item $\widehat{\sigma}_{1,2} > \widehat{\sigma}_{1,1}$?
\item $\widehat{\sigma}_{1,2} > \widehat{\sigma}_{2,2}$?
\item $\widehat{\sigma}_{1,2} < 0$?
\end{itemize}
These could not \emph{all} happen, but one of them could. What would it mean?
\vspace{5mm}

Remember,
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$
\end{itemize}
\vspace{5mm}

What would happen in a numerical search for $\widehat{\boldsymbol{\theta}}$?
% Could the maximum of the likelihood function be outside the parameter space?
\end{frame}
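% Added commentary (not in the original deck): one way to think about the
% questions on the preceding slide, consistent with the "Lessons" slide below.
\begin{frame}
\frametitle{One way to think about it}
\framesubtitle{Added commentary on the preceding questions}
\begin{itemize}
\item The model implies $\sigma_{1,2} = \beta_1^2+\beta_2^2 \geq 0$, \quad $\sigma_{1,2} < \sigma_{1,1}$ \quad and \quad $\sigma_{1,2} < \sigma_{2,2}$.
\item If the sample covariance matrix violates one of these constraints, no parameter vector in $\Theta$ can reproduce $\widehat{\boldsymbol{\Sigma}}$ exactly.
\item A numerical search for $\widehat{\boldsymbol{\theta}}$ may then be driven to the boundary of the parameter space, or try to leave it (for example, heading toward a negative $\psi_1$ or $\psi_2$).
\item A badly behaved fit of this kind is evidence against the model: even without identifiability, the model can be falsified by data.
\end{itemize}
\end{frame}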
\begin{frame}
\frametitle{Testing hypotheses about $\boldsymbol{\theta}$}
%\framesubtitle{}
It is possible. Remember, the model implies
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$
\end{itemize}
For example, $H_0: \beta_1=\beta_2=0$ is equivalent to $H_0: \sigma_{1,2}=0$, a hypothesis about the distribution of the observable data.
\end{frame}

\begin{frame}
\frametitle{Lessons from this example}
%\framesubtitle{}
{\footnotesize
\begin{itemize}
\item A parameter may be identifiable at some points but not others.
\item Identifiability at infinitely many points is possible even if there are more unknowns than equations. But this can only happen on a set of volume zero.
\item Some parameters and functions of the parameters may be identifiable even when the whole parameter vector is not.
\item Lack of identifiability can produce multiple maxima of the likelihood function -- even infinitely many.
\item A model whose parameter vector is not identifiable may still be falsified by empirical data.
\item Numerical maximum likelihood search may leave the parameter space. This may be a sign that the model is false. It can happen when the parameter is identifiable, too.
\item Some hypotheses may be testable when the parameter is not identifiable, but these will be hypotheses about functions of the parameter that \emph{are} identifiable.
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s13}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s13}}
\end{frame}

\end{document}