% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
% Uncomment next 2 lines instead of the first for article-style handout:
% \documentclass[12pt]{article}
% \usepackage{beamerarticle}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols at bottom
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Warsaw} % Displays sections on top
\usetheme{Frankfurt} % Displays sections on top: Fairly thin but swallows some material at bottom of crowded slides
\usepackage[english]{babel}
\usepackage{tikz} % for tikzpicture
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}}

\title{The Zipper Example\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2017}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

% \begin{frame}
% \frametitle{Background Reading}
% \framesubtitle{It may be a little bit helpful.}
% Davison Chapter 4, especially Sections 4.3 and 4.4
% \end{frame}

\section{Preparation}

\begin{frame}{Preparation: Indicator functions}
\framesubtitle{Conditional expectation and the Law of Total Probability}

$I_A(x)$ is the \emph{indicator function} for the set $A$. It is defined by
\begin{displaymath}
I_A(x) = \left\{ \begin{array}{ll} % ll means left left
                   1 & \mbox{for } x \in A \\
                   0 & \mbox{for } x \notin A
                 \end{array} \right. % Need that crazy invisible right period!
\end{displaymath}
\pause
Also sometimes written $I(x \in A)$.
\pause
\begin{eqnarray*}
E(I_A(X)) &=& \sum_x I_A(x) p(x) \mbox{, or}\\
          & & \int_{-\infty}^\infty I_A(x) f(x) \, dx \\
          &&\\
          &=& P\{ X \in A \}
\end{eqnarray*}
\pause
So the expected value of an indicator is a probability.
\end{frame}

\begin{frame}
\frametitle{Applies to conditional probabilities too}
%\framesubtitle{}

{\LARGE
\begin{eqnarray*}
E(I_A(X)|Y) &=& \sum_x I_A(x) p(x|Y) \mbox{, or}\\
            & & \int_{-\infty}^\infty I_A(x) f(x|Y) \, dx \\
            &&\\
            &=& Pr\{ X \in A|Y\}
\end{eqnarray*}
} % End size
\pause
So the conditional expected value of an indicator is a \emph{conditional} probability.
\end{frame}

\begin{frame}
\frametitle{Double expectation: $E\left( g(X) \right) = E\left( E[g(X)|Y]\right)$}
%\framesubtitle{}
\pause

\begin{displaymath}
E\left(E[I_A(X)|Y]\right) = E[I_A(X)] = Pr\{ X \in A\} \mbox{, so}
\end{displaymath}
\pause
\begin{eqnarray*}
Pr\{ X \in A\} &=& E\left(E[I_A(X)|Y]\right) \pause \\
               &=& E\left(Pr\{ X \in A|Y\}\right) \pause \\
               &=& \int_{-\infty}^\infty Pr\{ X \in A|Y=y\} f_Y(y) \, dy \mbox{, or} \pause \\
               & & \sum_y Pr\{ X \in A|Y=y\} p_Y(y)
\end{eqnarray*}
\pause
This is known as the \emph{Law of Total Probability}.
\end{frame}

\section{The Example}

\begin{frame}
\frametitle{The Zipper Example}
%\framesubtitle{}

Members of a Senior Kindergarten class (which we shall treat as a sample) try to zip their coats within one minute. We count how many succeed.

\vspace{5mm}
\pause

How about a model?

\vspace{5mm}
\pause

$Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim} B(1,\theta)$, where $\theta$ is the probability of success.
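\vspace{5mm}
\pause

Under this model the maximum likelihood estimate of $\theta$ is the sample proportion $\overline{Y}_n$: for example, if 17 of $n = 20$ children succeeded, the estimate would be $\widehat{\theta} = 17/20 = 0.85$.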
\end{frame}

\section{A better model}

\begin{frame}
\frametitle{A better model than $Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim} B(1,\theta)$}
%\framesubtitle{}
\pause

\begin{itemize}
\item Obviously, the probability of success is not the same for each child.
% \item Some kids are better at it than others, and some coats have easier zippers than others.
\item Some are almost certain to succeed, and others have almost no chance.
\end{itemize}

\vspace{10mm}
\pause

\textbf{Alternative Model}: $Y_1, \ldots, Y_n$ are independent random variables, with $Y_i \sim B(1,\theta_i)$.
\end{frame}

\begin{frame}
\frametitle{$Y_1, \ldots, Y_n$ independent $B(1,\theta_i)$}
%\framesubtitle{}
\pause

\begin{itemize}
\item This is a two-stage sampling model. \pause
\item First, sample from a population in which each child has a personal probability of success. \pause
\item Then for child $i$, use $\theta_i$ to generate success or failure. \pause
\item Note that $\theta_1, \ldots, \theta_n$ are random variables with some probability distribution. \pause
\item This distribution is supported on $[0,1]$. \pause
\item How about a beta? \pause
\end{itemize}

\begin{displaymath}
f(\theta) = \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)}
            \theta^{\alpha-1} (1-\theta)^{\beta-1}
\end{displaymath}
\end{frame}

% Five pictures
\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta1}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta2}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta3}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta4}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Beta density is flexible}
\begin{center}
\includegraphics[width=3in]{beta5}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Law of Total Probability}
\framesubtitle{Double expectation}

\begin{eqnarray*}
P(Y_i=1) & = & \int_0^1 P(Y_i=1|\theta_i) \, f(\theta_i) \, d\theta_i \\ \pause
         & = & \int_0^1 \theta_i \, f(\theta_i) \, d\theta_i \\ \pause
         & = & \int_0^1 \theta_i \, \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)}
               \theta_i^{\alpha-1} (1-\theta_i)^{\beta-1} \, d\theta_i \\ \pause
         & = & \frac{\alpha}{\alpha+\beta}
\end{eqnarray*}
\end{frame}

\section{Identifiability}

\begin{frame}
\frametitle{Distribution of the observable data}

\begin{displaymath}
P(\mathbf{Y}=\mathbf{y}|\alpha,\beta) = \prod_{i=1}^n
   \left(\frac{\alpha}{\alpha+\beta} \right)^{y_i}
   \left(1-\frac{\alpha}{\alpha+\beta} \right)^{1-y_i}
\end{displaymath}
\pause

\begin{itemize}
\item Distribution of the observable data depends on the parameters $\alpha$ and $\beta$ only through $\frac{\alpha}{\alpha+\beta}$. \pause
\item Infinitely many $(\alpha,\beta)$ pairs yield the same distribution of the data. \pause
\item How could you use the data to decide which one is right?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Parameter Identifiability}
\framesubtitle{The general idea}

\begin{itemize}
\item The parameters of the Zipper Model are not \emph{identifiable}. \pause
\item The model parameters cannot be recovered from the distribution of the sample data. \pause
\item And all you can ever learn from sample data is the distribution from which it comes. \pause
\item So there will be problems using the sample data for estimation and inference about the parameters.
\pause
\item This is true \emph{even if the model is completely correct.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Definitions}
\framesubtitle{Connected to parameter identifiability}

\begin{itemize}
\item A \emph{Statistical Model} is a set of assertions that partly specify the probability distribution of the observable data. \pause
\item Suppose a statistical model implies $\mathbf{D} \sim P_{\boldsymbol{\theta}}, \boldsymbol{\theta} \in \Theta$. \pause
      If no two points in $\Theta$ yield the same probability distribution, then the parameter $\boldsymbol{\theta}$ is said to be \emph{identifiable}. \pause
\item That is, identifiability means that $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ implies $P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$. \pause
\item On the other hand, if there exist distinct $\boldsymbol{\theta}_1$ and $\boldsymbol{\theta}_2$ in $\Theta$ with $P_{\boldsymbol{\theta}_1} = P_{\boldsymbol{\theta}_2}$, the parameter $\boldsymbol{\theta}$ is \emph{not identifiable}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An equivalent definition}
\framesubtitle{Equivalent to $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2 \Rightarrow P_{\boldsymbol{\theta}_1} \neq P_{\boldsymbol{\theta}_2}$}
\pause

\begin{itemize}
\item The probability distribution is always a function of the parameter vector. \pause
\item If that function is one-to-one, the parameter vector is identifiable, \pause because then two distinct parameter vectors $\boldsymbol{\theta}_1 \neq \boldsymbol{\theta}_2$ could not yield the same distribution. \pause
\item[]
\item That is, if the parameter vector can somehow be recovered from the distribution of the data, it is identifiable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Theorem}

If the parameter vector is not identifiable, consistent estimation for all points in the parameter space is impossible.

\vspace{5mm}
\begin{center}
\includegraphics[width=3in]{consistent}
\end{center}
\pause
\vspace{5mm}

\begin{itemize}
\item Suppose $\theta_1 \neq \theta_2$ but $P_{\theta_1} = P_{\theta_2}$. \pause
\item Suppose also that $T_n = T_n(D_1, \ldots, D_n) \stackrel{p}{\rightarrow} \theta$ for all $\theta \in \Theta$. \pause
\item Distribution of $T_n$ is identical for $\theta_1$ and $\theta_2$. \pause
\item So $T_n$ cannot converge in probability to both $\theta_1$ and $\theta_2$: contradiction.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Why don't we hear more about identifiability?}
\pause

\begin{itemize}
\item Consistent estimation indirectly proves identifiability. \pause
\item Because without identifiability, consistent estimation would be impossible. \pause
\item Any \emph{function} of the parameter vector that can be estimated consistently is identifiable.
\end{itemize}
\end{frame}

\section{Maximum Likelihood Fails}

\begin{frame}
\frametitle{Maximum likelihood fails for the Zipper Example}
\framesubtitle{It has to fail.}
\pause

\begin{eqnarray*}
L(\alpha,\beta) & = & \left(\frac{\alpha}{\alpha+\beta} \right)^{\sum_{i=1}^n y_i}
                      \left(1-\frac{\alpha}{\alpha+\beta} \right)^{n-\sum_{i=1}^n y_i} \\
&& \\ \pause
\ell(\alpha,\beta) & = & \log \left( \left(\frac{\alpha}{\alpha+\beta} \right)^{\sum_{i=1}^n y_i}
                         \left(1-\frac{\alpha}{\alpha+\beta} \right)^{n-\sum_{i=1}^n y_i} \right)
\end{eqnarray*}
\pause
Partially differentiate with respect to $\alpha$ and $\beta$, set to zero, and solve.
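\pause
\vspace{3mm}

Since $1-\frac{\alpha}{\alpha+\beta} = \frac{\beta}{\alpha+\beta}$, the log likelihood expands to
\begin{displaymath}
\ell(\alpha,\beta) = \left(\sum_{i=1}^n y_i\right)\log\alpha
                   + \left(n-\sum_{i=1}^n y_i\right)\log\beta
                   - n\log(\alpha+\beta),
\end{displaymath}
which makes the differentiation straightforward.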
\end{frame}

\begin{frame}
\frametitle{Two equations in two unknowns}
%\framesubtitle{}

\begin{eqnarray*}
\frac{\partial\ell}{\partial\alpha} \stackrel{set}{=} 0 & \Rightarrow & \frac{\alpha}{\alpha+\beta} = \overline{y} \\ \pause
\frac{\partial\ell}{\partial\beta} \stackrel{set}{=} 0 & \Rightarrow & \pause \frac{\alpha}{\alpha+\beta} = \overline{y}
\end{eqnarray*}

\vspace{5mm}
\pause

Any pair $(\alpha,\beta)$ with $\frac{\alpha}{\alpha+\beta} = \overline{y}$ will maximize the likelihood.
\pause
\vspace{5mm}

The MLE is not unique.
\end{frame}

\begin{frame}
\frametitle{What is happening geometrically?}
% \framesubtitle{}

\begin{columns}
\column{0.5\textwidth}
\begin{displaymath}
\frac{\alpha}{\alpha+\beta} = \overline{y} \pause
\Leftrightarrow \beta = \left( \frac{1-\overline{y}}{\overline{y}} \right) \alpha
\end{displaymath}
\pause
\column{0.5\textwidth}
\begin{tikzpicture}[scale=1/2]
% \draw[help lines] (0,0) grid (10,10); % Graph paper
% Draw axes
\draw[->] (0,0) -- (9,0) coordinate (x axis); % From -- To With arrows!
\draw[->] (0,0) -- (0,9) coordinate (y axis);
% Label axes
\draw (9 cm, 0) node[anchor=west] {\scriptsize $\alpha$};
\draw (0, 9 cm) node[anchor=south] {\scriptsize $\beta$};
% Tick marks and numbers
% Draw the line
\draw[thick] (0,0) -- (12,8);
\end{tikzpicture}
\end{columns}
\end{frame}

\begin{frame}
\frametitle{Fisher Information: $\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) = \left[E\{-\frac{\partial^2}{\partial\theta_i\partial\theta_j} \log f(Y|\boldsymbol{\theta})\}\right]$}
\framesubtitle{The Hessian of the minus log likelihood approximates $n$ times the Fisher Information.}
\pause

\begin{eqnarray*}
\log f(Y|\alpha,\beta) & = & \log \left(\left(\frac{\alpha}{\alpha+\beta} \right)^Y
                             \left(1-\frac{\alpha}{\alpha+\beta} \right)^{1-Y}\right) \\ \pause
&&\\
& = & Y\log\alpha + (1-Y)\log\beta - \log(\alpha+\beta)
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{$\boldsymbol{\mathcal{I}}(\alpha,\beta) = \left[E\{-\frac{\partial^2}{\partial\alpha\partial\beta} \log f(Y|\alpha,\beta)\}\right]$}
\framesubtitle{Where $\log f(Y|\alpha,\beta) = Y\log\alpha + (1-Y)\log\beta - \log(\alpha+\beta)$}
\pause

\begin{eqnarray*}
\boldsymbol{\mathcal{I}}(\alpha,\beta) & = & E\left( \begin{array}{c c}
   -\frac{\partial^2\log f}{\partial\alpha^2} & -\frac{\partial^2\log f}{\partial\alpha\partial\beta} \\
   -\frac{\partial^2\log f}{\partial\beta\partial\alpha} & -\frac{\partial^2\log f}{\partial\beta^2}
   \end{array} \right) \\ \pause
& = & \ldots \\ \pause
& = & \frac{1}{(\alpha+\beta)^2} \left( \begin{array}{c c}
      \frac{\beta}{\alpha} & -1 \\
      -1 & \frac{\alpha}{\beta}
      \end{array} \right)
\end{eqnarray*}

\vspace{5mm}
\pause

\begin{itemize}
\item Determinant equals zero.
\item The inverse does not exist.
\item Large sample theory fails.
\item Second derivative test fails.
\item The likelihood is flat (in a particular direction).
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Look what has happened to us.}
\pause
%\framesubtitle{}

\begin{itemize}
\item We made an honest attempt to come up with a better model. \pause
\item And it \emph{was} a better model. \pause
\item But the result was a disaster.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{There is some good news.}
\pause
%\framesubtitle{}

{\small
Remember from earlier that by the Law of Total Probability,
\begin{displaymath}
P(Y_i = 1) = \int_0^1 \theta_i \, f(\theta_i) \, d\theta_i \pause = E(\Theta_i)
\end{displaymath}
\pause
\begin{itemize}
\item Even when the probability distribution of the (random) probability of success is completely unknown, \pause
\item We can estimate its expected value (call it $\mu$) consistently with $\overline{Y}_n$. \pause
\item So that \emph{function} of the unknown probability distribution is identifiable. \pause
\item And often that's all we care about anyway, say for comparing group means. \pause
\item So the usual procedures, based on a model nobody can believe (Bernoulli)\pause, are actually informative about a much more realistic model whose parameter is not fully identifiable. \pause
\item We don't often get this lucky.
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{One more question about the parametric version}
%\framesubtitle{}

What would it take to estimate $\alpha$ and $\beta$ successfully?
\pause
\begin{itemize}
\item Get the children to try zipping their coats twice, say on two consecutive days.
\item Assume their ability does not change, and conditionally on their ability, the two tries are independent.
\item That will do it. % \pause There are 4 probabilities that add to one; three are free. Write them as functions of $\alpha$ and $\beta$ and solve for $\alpha$ and $\beta$.
\pause
\item[]
\item This kind of thing often happens. When the parameters of a reasonable model are not identifiable, maybe you can design a different way of collecting data so that the parameters \emph{can} be identified.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Moral of the story}
%\framesubtitle{}
\pause

\begin{itemize}
\item If you think up a better model for standard kinds of data, the parameters of the model may not be identifiable. You need to check. \pause
\item The problem is not with the model. It's with the data. \pause
\item The solution is better \emph{research design}.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf17}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf17}}
\end{frame}

\end{document}

# R work for beta density plots

# alpha=2, beta=4
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 2, 4)
tstring = expression(paste("Beta density with ", alpha, "=2 and ", beta, "=4"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main=tstring)

# alpha=4, beta=2
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 4, 2)
tstring = expression(paste("Beta density with ", alpha, "=4 and ", beta, "=2"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main=tstring)

# alpha=3, beta=3
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 3, 3)
tstring = expression(paste("Beta density with ", alpha, "=3 and ", beta, "=3"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main=tstring)

# alpha=1/2, beta=1/2
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 1/2, 1/2)
tstring = expression(paste("Beta density with ", alpha, "=1/2 and ", beta, "=1/2"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main=tstring)

# alpha=1/2, beta=1/4
x = seq(from = 0.00, to = 1.00, by = 0.01)
y = dbeta(x, 1/2, 1/4)
tstring = expression(paste("Beta density with ", alpha, "=1/2 and ", beta, "=1/4"))
ystring = expression(paste("f(", theta, ")"))
plot(x, y, type='l', xlab=expression(theta), ylab=ystring, main=tstring)

########################################################
# Experiments with aspect ratio and axis limits for the alpha=2, beta=4 plot
x = seq(from = 0.00, to = 1.00, by = 0.01)
a = 2; b = 4 # alpha and beta
y = dbeta(x, a, b)
tstring = expression(paste("Beta density with ", alpha, "=2 and ", beta, "=4"))
plot(x, y, type='l', ylab="f(x)", main=tstring)
plot(x, y, type='l', ylab="f(x)", main=tstring, asp=1)
plot(x, y, type='l', ylab="f(x)", main=tstring, asp=1, xlim=c(0,1), ylim=c(0,3))
plot(x, y, type='l', ylab="f(x)", main=tstring, xlim=c(0,1), ylim=c(0,3))
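########################################################
# A small simulation sketch, not from the original slides, illustrating the
# identifiability problem numerically. The function name sim2stage() and the
# seed, sample size, and (alpha, beta) pairs are made up for illustration.
# Two pairs with the same alpha/(alpha+beta) generate observable data with
# the same distribution, so the data cannot distinguish between them.

sim2stage = function(a, b, n) {
  theta = rbeta(n, a, b)                  # Stage 1: each child's personal probability
  y = rbinom(n, size = 1, prob = theta)   # Stage 2: success or failure for each child
  mean(y)                                 # Sample proportion estimates alpha/(alpha+beta)
}
set.seed(9999)
sim2stage(1, 3, 100000)   # alpha/(alpha+beta) = 0.25
sim2stage(2, 6, 100000)   # alpha/(alpha+beta) = 0.25 as well: indistinguishable

# The per-observation Fisher information matrix from the slides is singular
# for any (alpha, beta), matching the flat direction in the likelihood.
a = 2; b = 6
FisherInfo = matrix(c(b/a, -1, -1, a/b), nrow = 2) / (a + b)^2
det(FisherInfo)   # Zero, up to rounding error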