% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Including Measurement Error in the Regression Model: A First Try\footnote{See last slide for copyright information.}}
\subtitle{STA431 Winter/Spring 2015}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Moment Structure Equations}

\begin{frame}
\frametitle{Moments and Moment Structure Equations}
Model $D \sim P_\theta$
\pause
\vspace{10mm}
\begin{itemize}
\item \emph{Moments} of a distribution are quantities such as $E(X)$, $E(Y^2)$, $Var(X)$, $E(X^2Y^2)$, $Cov(X,Y)$, and so on. \pause
\item \emph{Moment structure equations} are a set of equations expressing moments of the distribution of the data in terms of the model parameters. \pause ~~~~~ $m = g(\theta)$ \pause
\item If the moments involved are limited to variances and covariances, the moment structure equations are called \emph{covariance structure equations}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Important process}
\pause
\begin{itemize}
\item Calculate the moments of the distribution (usually means, variances and covariances) in terms of the model parameters, obtaining a system of moment structure equations.\pause ~~~~~ $m = g(\theta)$ \pause
\item Solve the moment structure equations for the parameters, expressing the parameters in terms of the moments. \pause ~~~~~ $\theta = g^{-1}(m)$ \pause
\item Method of Moments: $\widehat{\theta} = g^{-1}(\widehat{m})$ \pause
\item By the SLLN and continuous mapping, $\widehat{\theta}\stackrel{a.s.}{\rightarrow} \theta$ \pause
\item So even if we're not going to use the Method of Moments, \pause solving $\theta = g^{-1}(m)$ shows that consistent estimation is possible.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Recall multivariate multiple regression}
\begin{displaymath}
\mathbf{Y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{X}_i + \boldsymbol{\epsilon}_i
\end{displaymath}
{\scriptsize
where
\begin{itemize}
\item[] $\mathbf{Y}_i$ is a $q \times 1$ random vector of observable response variables, so the regression can be multivariate; there are $q$ response variables.
\item[] $\boldsymbol{\beta}_0$ is a $q \times 1$ vector of unknown constants, the intercepts for the $q$ regression equations. There is one for each response variable.
\item[] $\mathbf{X}_i$ is a $p \times 1$ observable random vector; there are $p$ explanatory variables. $\mathbf{X}_i$ has expected value $\boldsymbol{\mu}_x$ and variance-covariance matrix $\boldsymbol{\Phi}$, a $p \times p$ symmetric and positive definite matrix of unknown constants.
\item[] $\boldsymbol{\beta}_1$ is a $q \times p$ matrix of unknown constants.
These are the regression coefficients, with one row for each response variable and one column for each explanatory variable.
\item[] $\boldsymbol{\epsilon}_i$ is the error term of the regression. It is a $q \times 1$ multivariate normal random vector with expected value zero and variance-covariance matrix $\boldsymbol{\Psi}$, a $q \times q$ symmetric and positive definite matrix of unknown constants. $\boldsymbol{\epsilon}_i$ is independent of $\mathbf{X}_i$.
\end{itemize}
} % End size
%\vspace{3mm}
\pause
$\boldsymbol{\theta} = (\boldsymbol{\beta}_0, \boldsymbol{\mu}_x, \boldsymbol{\Phi}, \boldsymbol{\beta}_1, \boldsymbol{\Psi})$
\end{frame}

\begin{frame}
\frametitle{Data vectors are multivariate normal}
\begin{displaymath}
\mathbf{D}_i = \left( \begin{array}{c} \mathbf{X}_i \\ \hline \mathbf{Y}_i \end{array} \right)
\end{displaymath}
\vspace{5mm}
\begin{itemize}
\item $\mathbf{D}_i \sim N(\boldsymbol{\mu}, \boldsymbol{\Sigma})$ \pause
\item Write $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ as partitioned matrices.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Write $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ as partitioned matrices}
\pause
\begin{displaymath}
\boldsymbol{\mu} = \left( \begin{array}{c} E(\mathbf{X}_i) \\ \hline E(\mathbf{Y}_i) \end{array} \right)
= \left( \begin{array}{c} \boldsymbol{\mu}_1 \\ \hline \boldsymbol{\mu}_2 \end{array} \right)
\end{displaymath}
\pause
and
\renewcommand{\arraystretch}{1.5}
\begin{displaymath}
\boldsymbol{\Sigma} = V\left( \begin{array}{c} \mathbf{X}_i \\ \hline \mathbf{Y}_i \end{array} \right)
= \left( \begin{array}{c|c} V(\mathbf{X}_i) & C(\mathbf{X}_i,\mathbf{Y}_i) \\ \hline C(\mathbf{X}_i,\mathbf{Y}_i)^\top & V(\mathbf{Y}_i) \end{array} \right)
= \left( \begin{array}{c|c} \boldsymbol{\Sigma}_{11} & \boldsymbol{\Sigma}_{12} \\ \hline \boldsymbol{\Sigma}_{12}^\top & \boldsymbol{\Sigma}_{22} \end{array} \right)
\end{displaymath}
\pause
\renewcommand{\arraystretch}{1.0}
\vspace{10mm}
$\mathbf{m} = \left( \boldsymbol{\mu}_1, \boldsymbol{\mu}_2, \boldsymbol{\Sigma}_{11}, \boldsymbol{\Sigma}_{12}, \boldsymbol{\Sigma}_{22}\right)$
\end{frame}

\begin{frame}
\frametitle{Moment structure equations}
\framesubtitle{Based on $\mathbf{D}_i = (\mathbf{X}_i^\top|\mathbf{Y}_i^\top)^\top$ with $\mathbf{Y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{X}_i + \boldsymbol{\epsilon}_i$}
\begin{itemize}
\item[] $\boldsymbol{\theta} = (\boldsymbol{\beta}_0, \boldsymbol{\mu}_x, \boldsymbol{\Phi}, \boldsymbol{\beta}_1, \boldsymbol{\Psi})$
\item[] $\mathbf{m} = \left( \boldsymbol{\mu}_1, \boldsymbol{\mu}_2, \boldsymbol{\Sigma}_{11}, \boldsymbol{\Sigma}_{12}, \boldsymbol{\Sigma}_{22}\right)$
\end{itemize}
\pause
\begin{eqnarray*}
\boldsymbol{\mu}_1 & = & \boldsymbol{\mu}_x \\
\boldsymbol{\mu}_2 & = & \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \boldsymbol{\mu}_x \nonumber \\
\boldsymbol{\Sigma}_{11} & = & \boldsymbol{\Phi} \nonumber \\
\boldsymbol{\Sigma}_{12} & = & \boldsymbol{\Phi\beta}_1^\top \nonumber \\
\boldsymbol{\Sigma}_{22} & = & \boldsymbol{\beta}_1 \boldsymbol{\Phi\beta}_1^\top + \boldsymbol{\Psi}.
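\nonumber
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Where do the moment structure equations come from?}
\framesubtitle{One sample calculation; the others are similar}
Since $\boldsymbol{\beta}_0$ is a constant and $\mathbf{X}_i$ is independent of $\boldsymbol{\epsilon}_i$, so that $C(\mathbf{X}_i,\boldsymbol{\epsilon}_i) = \mathbf{0}$,
\begin{eqnarray*}
\boldsymbol{\Sigma}_{12} & = & C(\mathbf{X}_i,\mathbf{Y}_i) \\
 & = & C(\mathbf{X}_i, \, \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{X}_i + \boldsymbol{\epsilon}_i) \nonumber \\
 & = & C(\mathbf{X}_i,\mathbf{X}_i)\boldsymbol{\beta}_1^\top + C(\mathbf{X}_i,\boldsymbol{\epsilon}_i) \nonumber \\
 & = & \boldsymbol{\Phi\beta}_1^\top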
\nonumber
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Solve moment structure equations for the parameters}
\framesubtitle{$\theta = g^{-1}(m)$}
\pause
\begin{eqnarray*}
\boldsymbol{\beta}_0 & = & \boldsymbol{\mu}_2 - \boldsymbol{\Sigma}_{12}^\top \boldsymbol{\Sigma}_{11}^{-1} \ \boldsymbol{\mu}_1 \\
\boldsymbol{\mu}_x & = & \boldsymbol{\mu}_1 \nonumber \\
\boldsymbol{\Phi}_{~} & = & \boldsymbol{\Sigma}_{11} \nonumber \\
\boldsymbol{\beta}_1 & = & \boldsymbol{\Sigma}_{12}^\top \boldsymbol{\Sigma}_{11}^{-1} \nonumber \\
\boldsymbol{\Psi}_{~} & = & \boldsymbol{\Sigma}_{22} - \boldsymbol{\Sigma}_{12}^\top \boldsymbol{\Sigma}_{11}^{-1}\boldsymbol{\Sigma}_{12} \nonumber
\end{eqnarray*}
\pause
\vspace{10mm}
\begin{itemize}
\item Just put hats on everything to get MOM estimates.
\item Same as the MLEs in this case, by invariance.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{But let's admit it}
{\LARGE
In most applications, the explanatory variables are measured with error.
} % End size
\end{frame}

\section{A first try}

\begin{frame}
\frametitle{A first try at including measurement error in the explanatory variable}
\pause
Independently for $i=1, \ldots, n$, let
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\
W_i &=& \nu + X_i + e_i,
\end{eqnarray*}
where \pause
\begin{itemize}
\item $X_i$ is normally distributed with mean $\mu_x$ and variance $\phi>0$
\item $\epsilon_i$ is normally distributed with mean zero and variance $\psi>0$
\item $e_i$ is normally distributed with mean zero and variance $\omega>0$
\item $X_i, e_i, \epsilon_i$ are all independent.
\end{itemize}
\pause
Observable data are just the pairs $(W_i,Y_i)$ for $i=1, \ldots, n$.
\end{frame}

\begin{frame}
\frametitle{Model implies that the $(W_i,Y_i)$ are independent bivariate normal}
\framesubtitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i $ \linebreak $W_i = \nu + X_i + e_i$ }
\pause
with
\begin{displaymath}
E\left( \begin{array}{c} W_i \\ Y_i \end{array} \right) = \boldsymbol{\mu} =
\left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) =
\left( \begin{array}{c} \nu + \mu_x \\ \beta_0 + \beta_1\mu_x \end{array} \right),
\end{displaymath}
\pause
and variance-covariance matrix \pause
\begin{displaymath}
V\left( \begin{array}{c} W_i \\ Y_i \end{array} \right) = \boldsymbol{\Sigma} = [\sigma_{i,j}] =
\left( \begin{array}{c c} \phi+\omega & \beta_1 \phi \\ \beta_1 \phi & \beta_1^2 \phi + \psi \end{array} \right).
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Big problem revealed by the moment structure equations}
\pause
% \framesubtitle{}
\begin{eqnarray*}
\mu_1 & = & \mu_x + \nu \\
\mu_2 & = & \beta_0 + \beta_1\mu_x \\
\sigma_{1,1} & = & \phi+\omega \\
\sigma_{1,2} & = & \beta_1 \phi \\
\sigma_{2,2} & = & \beta_1^2 \phi + \psi
\end{eqnarray*}
\pause
\vspace{3mm}
$\boldsymbol{\theta} = (\beta_0, \beta_1, \mu_x, \phi, \psi, \nu, \omega)$
\pause
\vspace{5mm}
It is impossible to solve these five equations for the seven model parameters.
\end{frame}

\begin{frame}
\frametitle{Impossible to solve the moment structure equations for the parameters}
\begin{itemize}
\item Even with perfect knowledge of the probability distribution of the data (and for the multivariate normal, that means knowing $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, period), \pause it would be impossible to know the model parameters. \pause
\item All data can ever tell you is the approximate distribution from which they come.
\pause
\item So how could we expect to successfully \emph{estimate} $\boldsymbol{\theta}$ based on sample data?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A numerical example}
\pause
{\small
\begin{eqnarray*}
\left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) & = &
\left( \begin{array}{c} \mu_x+\nu \\ \beta_0 + \beta_1\mu_x \end{array} \right) \\
\left( \begin{array}{c c} \sigma_{11} & \sigma_{12} \\ & \sigma_{22} \end{array} \right) & = &
\left( \begin{array}{c c} \phi+\omega & \beta_1 \phi \\ & \beta_1^2 \phi + \psi \end{array} \right)
\end{eqnarray*}
\pause
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|c|} \hline
 & $\mu_x$ & $\beta_0$ & $\nu$ & $\beta_1$ & $\phi$ & $\omega$ & $\psi$ \\ \hline
$\boldsymbol{\theta}_1$ & 0 & 0 & 0 & 1 & 2 & 2 & 3 \\ \hline
$\boldsymbol{\theta}_2$ & 0 & 0 & 0 & 2 & 1 & 3 & 1 \\ \hline
\end{tabular}
\end{center}
\pause
Both $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$ imply a bivariate normal distribution with mean zero and covariance matrix
\begin{displaymath}
\boldsymbol{\Sigma} = \left[ \begin{array}{r r} 4 & 2 \\ 2 & 5 \end{array} \right],
\end{displaymath}
\pause
and thus the same distribution of the sample data.
} % End size
\end{frame}

\section{Identifiability}

\begin{frame}
\frametitle{Parameter Identifiability}
%\framesubtitle{}
\begin{itemize}
\item No matter how large the sample size, it will be impossible to decide between $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$, because they imply exactly the same probability distribution of the observable data. \pause
\item The problem here is that the parameters of the regression are not \emph{identifiable}. \pause
\item The model parameters cannot be recovered from the distribution of the sample data. \pause
\item And all you can ever learn from sample data is the distribution from which it comes. \pause
\item So there will be problems using the sample data for estimation and inference. \pause
\item This is true even when \emph{the model is completely correct.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Definitions}
\framesubtitle{Connected to parameter identifiability}
\pause
\begin{itemize}
\item A \emph{Statistical Model} is a set of assertions that partly specify the probability distribution of a set of observable data. \pause
\item Suppose a statistical model implies $\mathbf{D} \sim P_{\boldsymbol{\theta}}, \boldsymbol{\theta} \in \Theta$. If no two points in $\Theta$ yield the same probability distribution, then the parameter $\boldsymbol{\theta}$ is said to be \emph{identifiable.} \pause
\item That is, identifiability means that $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$. \pause
\item On the other hand, if there exist distinct $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$ in $\Theta$ with $P_{\boldsymbol{\theta}_1} = P_{\boldsymbol{\theta}_2}$, the parameter $\boldsymbol{\theta}$ is \emph{not identifiable.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An equivalent definition of identifiability}
\pause
\framesubtitle{Full proof of equivalence deferred for now}
\begin{itemize}
\item If the parameter vector is a function of the probability distribution of the observable data, it is identifiable. \pause
\item[]
\item That is, if the parameter vector can somehow be recovered from the distribution of the data, it is identifiable.
\pause
\item[]
\item If two different parameter values gave the same distribution of the data, this would be impossible because functions yield only one value.
\end{itemize}
% A picture would be good here.
\end{frame}

\begin{frame}
\frametitle{Regression models with no measurement error}
\pause
%\framesubtitle{}
\begin{itemize}
\item The mean and covariance matrix are functions of the probability distribution (calculate expected values). \pause
\item We solved for all the parameters from the mean and covariance matrix. \pause
\item Therefore the parameters are a function of the probability distribution. \pause
\item Thus they are identifiable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Identifiability is a big concept}
\pause
%\framesubtitle{}
\begin{itemize}
\item It means \emph{knowability} of the parameters from the distribution of the data. \pause
\item We will do mathematical proofs that show whether certain information can be known. \pause
\item Call it the \textbf{algebra of the knowable}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Theorem}
If the parameter vector is not identifiable, consistent estimation for all points in the parameter space is impossible.
\pause
\vspace{5mm}
\begin{center}
\includegraphics[width=3in]{consistent}
\end{center}
\vspace{5mm}
\pause
\begin{itemize}
\item Let $\theta_1 \neq \theta_2$ but $P_{\theta_1} = P_{\theta_2}$. \pause
\item Suppose $T_n = T_n(D_1, \ldots, D_n)$ is a consistent estimator of $\theta$ for all $\theta \in \Theta$, in particular for $\theta_1$ and $\theta_2$. \pause
\item So the distribution of $T_n$ is identical for $\theta_1$ and $\theta_2$. \pause
\item But then $T_n$ would have to converge in probability to both $\theta_1$ and $\theta_2 \neq \theta_1$, which is impossible. So no such consistent estimator can exist.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Identifiability of \emph{functions} of the parameter vector}
If $g(\boldsymbol{\theta}_1) \neq g(\boldsymbol{\theta}_2)$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$ for all $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ in $\Theta$, the function $g(\boldsymbol{\theta})$ is said to be identifiable.
\end{frame}

\begin{frame}
\frametitle{Some sample questions will be based on this model:}
Let $W = X + e$, where
\begin{itemize}
\item $X \sim N(\mu,\phi)$
\item $e \sim N(0,\omega)$
\item $X$ and $e$ are independent.
\item Only $W$ is observable ($X$ is a latent variable).
\end{itemize}
\pause
\vspace{5mm}
How does this fit the definition of a \emph{model}?
\end{frame}

\begin{frame}
\frametitle{Sample questions}
{\scriptsize
Let $W = X + e$, where
\begin{itemize}
\item $X \sim N(\mu,\phi)$
\item $e \sim N(0,\omega)$
\item $X$ and $e$ are independent.
\item Only $W$ is observable ($X$ is a latent variable).
\end{itemize}
\pause
} % End size
\vspace{5mm}
In the following questions, you may use the fact that the normal distribution corresponds uniquely to the pair $(\mu,\sigma^2)$.
\pause
\begin{enumerate}
\item What is the parameter vector $\boldsymbol{\theta}$? \pause
\item What is the parameter space $\Theta$? \pause
\item What is the probability distribution of the observable data? \pause
\item Give the moment structure equations. \pause
\item Either prove that the parameter is identifiable, or show by an example that it is not. A simple numerical example is best. \pause
\item Give two \emph{functions} of the parameter vector that are identifiable.
\end{enumerate}
\end{frame}

\begin{frame}
\frametitle{Pointwise identifiability}
\framesubtitle{As opposed to global identifiability}
\pause
\begin{itemize}
\item The parameter is said to be \emph{identifiable} at a point $\boldsymbol{\theta}_0$ if no other point in $\Theta$ yields the same probability distribution as $\boldsymbol{\theta}_0$. \pause
\item That is, $\boldsymbol{\theta} \neq \boldsymbol{\theta}_0$ implies $P_{\boldsymbol{\theta}} \neq P_{\boldsymbol{\theta}_0}$ for all $\boldsymbol{\theta} \in \Theta$. \pause
\item Let $g(\boldsymbol{\theta})$ be a function of the parameter vector. If $g(\boldsymbol{\theta}_0) \neq g(\boldsymbol{\theta})$ implies $P_{\boldsymbol{\theta}_0} \neq P_{\boldsymbol{\theta}}$ for all $\boldsymbol{\theta} \in \Theta$, then the function $g(\boldsymbol{\theta})$ is said to be identifiable at the point $\boldsymbol{\theta}_0$. \pause
\end{itemize}
If the parameter (or function of the parameter) is identifiable at every point in $\Theta$, it is identifiable according to the earlier definitions.
\end{frame}

\begin{frame}
\frametitle{Local identifiability}
%\framesubtitle{}
\begin{itemize}
\item[] The parameter is said to be \emph{locally identifiable} at a point $\boldsymbol{\theta}_0$ if there is a neighbourhood of points surrounding $\boldsymbol{\theta}_0$, none of which (other than $\boldsymbol{\theta}_0$ itself) yields the same probability distribution as $\boldsymbol{\theta}_0$. \pause
\item[]
% If there is a neighborhood of $\theta_0$ with $P_\theta \neq P_{\theta_0}$ for all $\theta \neq \theta_0$ in the neighborhood, the parameter is said to be \emph{locally identifiable} at $\theta_0$.
\item[] If the parameter is identifiable at a point, it is locally identifiable there, but the converse is not true.
\end{itemize}
\end{frame}

\section{Parameter Count Rule}

\begin{frame}
\frametitle{The Parameter Count Rule}
\framesubtitle{A necessary but not sufficient condition for identifiability}
\pause
Suppose identifiability is to be decided based on a set of moment structure equations. If there are more parameters than equations, the set of points where the parameter vector is identifiable occupies a set of volume zero in the parameter space.
\pause
\vspace{5mm}
So a necessary condition for parameter identifiability is that there be at least as many moment structure equations as parameters.
\end{frame}

\begin{frame}
\frametitle{Example}
\framesubtitle{Two latent explanatory variables}
\pause
\begin{eqnarray*} %\label{countingex}
Y_1 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_1 \\
Y_2 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_2, \nonumber
\end{eqnarray*}
\pause
\vspace{5mm}
where
\begin{itemize}
\item $X_1$, $X_2$, $\epsilon_1$ and $\epsilon_2$ are independent normal random variables with expected value zero, and
\item $Var(X_1)=Var(X_2)=1$, $Var(\epsilon_1)=\psi_1$ and $Var(\epsilon_2)=\psi_2$.
\item Only $Y_1$ and $Y_2$ are observable.
\end{itemize}
\pause
\vspace{5mm}
The parameter vector is $\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$.
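\end{frame}

\begin{frame}
\frametitle{Variance and covariance calculations}
\framesubtitle{A sample calculation of the entries displayed on the next slide}
Since $X_1$, $X_2$, $\epsilon_1$ and $\epsilon_2$ are independent with expected value zero and $Var(X_1)=Var(X_2)=1$,
\begin{eqnarray*}
Var(Y_1) & = & \beta_1^2 Var(X_1) + \beta_2^2 Var(X_2) + Var(\epsilon_1)
 ~ = ~ \beta_1^2 + \beta_2^2 + \psi_1 \\
Cov(Y_1,Y_2) & = & Cov(\beta_1 X_1 + \beta_2 X_2 + \epsilon_1, ~ \beta_1 X_1 + \beta_2 X_2 + \epsilon_2) \\
 & = & \beta_1^2 Var(X_1) + \beta_2^2 Var(X_2) ~ = ~ \beta_1^2 + \beta_2^2
\end{eqnarray*}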
\end{frame}

\begin{frame}
\frametitle{Calculate the covariance matrix of $(Y_1,Y_2)^\top$}
\framesubtitle{Expected value is (zero, zero)}
\begin{eqnarray*} %\label{countingex}
Y_1 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_1 \\
Y_2 & = & \beta_1 X_1 + \beta_2 X_2 + \epsilon_2, \nonumber
\end{eqnarray*}
\pause
\vspace{5mm}
{\LARGE
\begin{eqnarray*}
\boldsymbol{\Sigma} & = & \left( \begin{array}{c c} \sigma_{1,1} & \sigma_{1,2} \\ \sigma_{1,2} & \sigma_{2,2} \end{array} \right) \\ \pause
&& \\
 & = & \left( \begin{array}{ll} \beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\ \beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right)
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Covariance structure equations}
% \framesubtitle{Obtained by calculating $\boldsymbol{\Sigma}$}
\begin{eqnarray*} % \label{identeq}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber
\end{eqnarray*}
\pause
\vspace{5mm}
\begin{itemize}
\item Three equations in four unknowns. \pause
\item The parameter count rule does \emph{not} say that a solution is impossible. \pause
\item It says that \emph{the set of points in the parameter space where there is a unique solution (so the parameters are all identifiable) occupies a set of volume zero}. \pause
\item Are there any such points at all?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Try to solve for the parameters}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
% Why is this important?
Covariance structure equations:
\vspace{5mm}
%{\small
\begin{eqnarray*} % \label{identeq}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber
\end{eqnarray*}
\pause
%} % End size
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$ \pause
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$ \pause
\item So those \emph{functions} of the parameter vector are identifiable. \pause
\item What about $\beta_1$ and $\beta_2$?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Can we solve for $\beta_1$ and $\beta_2$?}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
%{\small
\begin{eqnarray*} % \label{identeq}
\sigma_{1,1} & = & \beta_1^2 + \beta_2^2 + \psi_1 \\
\sigma_{1,2} & = & \beta_1^2 + \beta_2^2 \nonumber \\
\sigma_{2,2} & = & \beta_1^2 + \beta_2^2 + \psi_2 \nonumber
\end{eqnarray*}
\pause
%} % End size
%\vspace{5mm}
\begin{itemize}
\item $\sigma_{1,2} = 0$ ~if and only if~ both $\beta_1=0$ and $\beta_2=0$. \pause
\item The set of points where all four parameters can be recovered from the covariance matrix is \emph{exactly} the set of points where the parameter vector is identifiable. \pause
\item It is
\begin{displaymath}
\{(\beta_1, \beta_2, \psi_1, \psi_2): \beta_1=0, \beta_2=0, \psi_1>0, \psi_2>0 \}
\end{displaymath}
\pause
\item A set of infinitely many points in $\mathbb{R}^4$. \pause
\item A set of volume zero, as the theorem says.
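\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A numerical illustration}
\framesubtitle{Made-up numbers, just to make the non-identifiability concrete}
Suppose the covariance matrix of $(Y_1,Y_2)^\top$ were
\begin{displaymath}
\boldsymbol{\Sigma} = \left( \begin{array}{c c} 5 & 4 \\ 4 & 6 \end{array} \right).
\end{displaymath}
\pause
Then $\psi_1 = 5-4 = 1$ and $\psi_2 = 6-4 = 2$, but all the covariance structure equations say about $\beta_1$ and $\beta_2$ is that $\beta_1^2 + \beta_2^2 = 4$.
\pause
\begin{itemize}
\item $(\beta_1,\beta_2) = (2,0)$, $(\beta_1,\beta_2) = (0,2)$ and $(\beta_1,\beta_2) = (\sqrt{2},\sqrt{2})$ all satisfy the equations. \pause
\item So the parameter vector is not identifiable at any of these points.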
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Suppose $\beta_1^2 + \beta_2^2 \neq 0$}
\framesubtitle{This is the case ``almost everywhere'' in the parameter space.}
\pause
Consider the set of infinitely many points $(\beta_1, \beta_2, \psi_1, \psi_2)$ such that
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2} \neq 0$
\end{itemize}
\pause
All of these points produce the covariance matrix
\begin{displaymath}
\boldsymbol{\Sigma} = \left( \begin{array}{c c} \sigma_{1,1} & \sigma_{1,2} \\ \sigma_{1,2} & \sigma_{2,2} \end{array} \right)
\end{displaymath}
\pause
and hence the same bivariate normal distribution of $(Y_1,Y_2)^\top$.
\end{frame}

\begin{frame}
\frametitle{Why are there infinitely many points in this set?}
$\{(\beta_1, \beta_2, \psi_1, \psi_2)\}$ such that
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2} \neq 0$
\end{itemize}
\pause
\vspace{15mm}
Because $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$ is the equation of a circle with radius $\sqrt{\sigma_{1,2}}$.
\end{frame}

\begin{frame}
\frametitle{Maximum likelihood estimation}
\framesubtitle{$\boldsymbol{\theta}= (\beta_1, \beta_2, \psi_1, \psi_2)$}
{\scriptsize
\begin{eqnarray*}
L(\boldsymbol{\mu,\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2}
\exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1})
+ (\overline{\mathbf{x}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1} (\overline{\mathbf{x}}-\boldsymbol{\mu}) \right\} \\
L(\boldsymbol{\Sigma}) &=& |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-n}
\exp -\frac{n}{2}\left\{ tr(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1})
+ \overline{\mathbf{x}}^\top \boldsymbol{\Sigma}^{-1} \overline{\mathbf{x}} \right\}
\end{eqnarray*}
\pause
} % End size
\vspace{15mm}
Can write the likelihood as either $L(\boldsymbol{\Sigma})$ or $L(\boldsymbol{\Sigma}(\boldsymbol{\theta})) = L_2(\boldsymbol{\theta})$.
\pause
\begin{displaymath}
\boldsymbol{\Sigma}(\boldsymbol{\theta}) = \left( \begin{array}{ll}
\beta_1^2 + \beta_2^2 + \psi_1 & \beta_1^2 + \beta_2^2 \\
\beta_1^2 + \beta_2^2 & \beta_1^2 + \beta_2^2 + \psi_2 \end{array} \right)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Likelihood $L_2(\boldsymbol{\theta})$ has a non-unique maximum}
%\framesubtitle{}
\begin{itemize}
\item $L(\boldsymbol{\Sigma})$ has a unique maximum at $\boldsymbol{\Sigma} = \widehat{\boldsymbol{\Sigma}}$. \pause
\item For every positive definite $\boldsymbol{\Sigma}$ with $\sigma_{1,2} \neq 0$, there are infinitely many $\boldsymbol{\theta} \in \Theta$ which produce that $\boldsymbol{\Sigma}$, and hence the same height of the likelihood. \pause
\item This includes $\widehat{\boldsymbol{\Sigma}}$. \pause
\item So there are infinitely many points $\boldsymbol{\theta}$ in $\Theta$ with $L_2(\boldsymbol{\theta}) = L(\widehat{\boldsymbol{\Sigma}})$. \pause
\item A circle in $\mathbb{R}^4$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A circle in $\mathbb{R}^4$ where the likelihood is maximal}
%\framesubtitle{}
{\LARGE
$\{(\beta_1, \beta_2, \psi_1, \psi_2)\} \subset \mathbb{R}^4$ such that
\begin{itemize}
\item $\psi_1 = \widehat{\sigma}_{1,1}-\widehat{\sigma}_{1,2}$
\item $\psi_2 = \widehat{\sigma}_{2,2} - \widehat{\sigma}_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \widehat{\sigma}_{1,2}$
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{What would happen in the numerical search for $\widehat{\boldsymbol{\theta}}$ if \dots}
% \framesubtitle{Pathological}
\begin{itemize}
\item $\widehat{\sigma}_{1,2} > \widehat{\sigma}_{1,1}$?
\item $\widehat{\sigma}_{1,2} > \widehat{\sigma}_{2,2}$?
\item $\widehat{\sigma}_{1,2} < 0$?
\end{itemize}
These could not \emph{all} happen, but one of them could. What would it mean?
\pause
\vspace{5mm}
Remember,
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$
\end{itemize}
\pause
\vspace{5mm}
Could the maximum of the likelihood function be outside the parameter space?
\end{frame}

\begin{frame}
\frametitle{Testing hypotheses about $\boldsymbol{\theta}$}
%\framesubtitle{}
It is possible to test some hypotheses about $\boldsymbol{\theta}$. Remember, the model implies
\begin{itemize}
\item $\psi_1 = \sigma_{1,1}-\sigma_{1,2}$
\item $\psi_2 = \sigma_{2,2}-\sigma_{1,2}$
\item $\beta_1^2 + \beta_2^2 = \sigma_{1,2}$
\end{itemize}
\pause
For example, $H_0: \beta_1 = \beta_2 = 0$ is equivalent to $\sigma_{1,2} = 0$.
\pause
But likelihood ratio tests are out. All the theory depends on a unique maximum.
\end{frame}

\begin{frame}
\frametitle{Lessons from this example}
%\framesubtitle{}
{\footnotesize
\begin{itemize}
\item A parameter may be identifiable at some points but not others. \pause
\item Identifiability at infinitely many points is possible even if there are more unknowns than equations. But this can only happen on a set of volume zero. \pause
\item Some parameters and functions of the parameters may be identifiable even when the whole parameter vector is not. \pause
\item Lack of identifiability can produce multiple maxima of the likelihood function -- even infinitely many. \pause
\item A model whose parameter vector is not identifiable may still be falsified by empirical data. \pause
\item Numerical maximum likelihood search may leave the parameter space. This may be a sign that the model is false. It can happen when the parameter is identifiable, too. \pause
\item Some hypotheses may be testable when the parameter is not identifiable, but these will be hypotheses about functions of the parameter that \emph{are} identifiable.
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s15}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s15}}
\end{frame}

\end{document}