% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation> % NOTE(review): angle-bracket argument was lost in extraction; restored as the standard \mode<presentation> -- confirm against the original source
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}
\title{Statistical models and estimation\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2017}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Models}

\begin{frame}
\frametitle{Statistical model}
\framesubtitle{Most good statistical analyses are based on a \emph{model} for the data.}
\pause
A \emph{statistical model} is a set of assertions that partly specify the probability distribution of the observable data. The specification may be direct or indirect.
\pause
\begin{itemize}
\item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. \pause
\item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where\pause
\begin{itemize}
\item[] $\beta_0, \ldots, \beta_k$ are unknown constants.
\item[] $x_{ij}$ are known constants.
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables, \pause not observable. \pause
\item[] $\sigma^2$ is an unknown constant.
\item[] $Y_1, \ldots, Y_n$ are observable random variables.
\end{itemize}
\end{itemize}
\pause
A model is not the same thing as the \emph{truth}.
\end{frame} \begin{frame} \frametitle{Statistical models leave something unknown} \framesubtitle{Otherwise they are probability models} \pause \begin{itemize} \item The unknown part of the model for the data is called the \emph{parameter}. \pause \item Usually, parameters are (vectors of) numbers. \pause \item Usually denoted by $\theta$ or $\boldsymbol{\theta}$ or other Greek letters. \pause \item Parameters are unknown constants. \end{itemize} \end{frame} \begin{frame} \frametitle{Parameter Space} The \emph{parameter space} is the set of values that can be taken on by the parameter. \pause \begin{itemize} \item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. \pause The parameter space is $\Theta = \{(\mu,\sigma^2): -\infty < \mu < \infty, \sigma^2 > 0\}$. \pause \item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where \begin{itemize} \item[] $\beta_0, \ldots, \beta_k$ are unknown constants. \item[] $x_{ij}$ are known constants. \item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables. \item[] $\sigma^2$ is an unknown constant. \item[] $Y_1, \ldots, Y_n$ are observable random variables. \end{itemize} \pause The parameter space is $\Theta = \{(\beta_0, \ldots, \beta_k, \sigma^2): -\infty < \beta_j < \infty, \sigma^2 > 0\}$. \end{itemize} \end{frame} \begin{frame} \frametitle{Parameters need not be numbers} %\framesubtitle{} Let $X_1, \ldots, X_n$ be a random sample from a continuous distribution with unknown distribution function $F(x)$. \pause \begin{itemize} \item The parameter is the unknown distribution function $F(x)$. \pause \item The parameter space is a space of distribution functions. 
\pause \item We may be interested only in a \emph{function} of the parameter, like \pause \end{itemize} \begin{displaymath} \mu = \int_{-\infty}^\infty x f(x) \, dx \end{displaymath} \pause \vspace{3mm} The rest of $F(x)$ is just a nuisance parameter. \end{frame} \begin{frame} \frametitle{General statement of a statistical model} \framesubtitle{$D$ is for Data} {\LARGE \begin{displaymath} D \sim P_\theta, ~~~ \theta \in \Theta \end{displaymath} } % End size \vspace{3mm} \pause \begin{itemize} \item Both $D$ and $\theta$ could be vectors \pause \item For example, \begin{itemize} \item $D = \mathbf{Y}_1, \ldots \mathbf{Y}_n$ independent multivariate normal. \pause \item $\theta = (\boldsymbol{\mu,\Sigma})$. \pause \item $P_\theta$ is the joint distribution function of $\mathbf{Y}_1, \ldots \mathbf{Y}_n$, with joint density \pause \end{itemize} \end{itemize} \begin{displaymath} f(\mathbf{y}_1, \ldots \mathbf{y}_n) = \prod_{i=1}^n f(\mathbf{y}_i;\boldsymbol{\mu,\Sigma}) \end{displaymath} \end{frame} \begin{frame} \frametitle{Estimation} \framesubtitle{For the model $D \sim P_\theta, ~~~ \theta \in \Theta$} \begin{itemize} \item We don't know $\theta$. \pause \item We never know $\theta$. \pause \item All we can do is guess. \pause \item Estimate $\theta$ (or a function of $\theta$) based on the observable data. \pause \item $T$ is an \emph{estimator} of $\theta$ (or a function of $\theta$): $T=T(D)$ \pause \end{itemize} For example, \begin{itemize} \item $D = X_1, \ldots, X_n \stackrel{i.i.d}{\sim} N(\mu,\sigma^2)$ \pause ~~~~~~~~~~~~~~~~~ $T = (\overline{X},S^2)$. \pause % Stupid visual layout \item For an ordinary multiple regression model, $T=(\widehat{\boldsymbol{\beta}},MSE)$ \pause \end{itemize} $T$ is a \emph{statistic}, a random variable (vector) that can be computed from the data without knowing the values of any unknown parameters. 
\end{frame} \section{MOM} \begin{frame} \frametitle{Parameter estimation} \framesubtitle{For the model $D \sim P_\theta, ~~~ \theta \in \Theta$} \pause \begin{itemize} \item Estimate $\theta$ with $T=T(D)$. \pause \item How do we get a recipe for $T$? \pause Guess? \pause \item It's good to be systematic. Lots of methods are available. \pause \item We will consider two: Method of moments and maximum likelihood. \end{itemize} \end{frame} \begin{frame} \frametitle{Moments} \framesubtitle{Based on a random sample like $(X_1,Y_1), \ldots, (X_n,Y_n)$} \pause \begin{itemize} \item Moments are quantities like $E\{X_i\}$, $E\{X_i^2\}$, $E\{X_iY_i\}$, $E\{W_iX_i^2Y_i^3\}$, etc. \pause \item \emph{Central} moments are moments of \emph{centered} random variables: \pause \begin{itemize} \item[] $E\{(X_i-\mu_x)^2\}$ \pause \item[] $E\{(X_i-\mu_x)(Y_i-\mu_y)\}$ \pause \item[] $E\{(X_i-\mu_x)^2(Y_i-\mu_y)^3(Z_i-\mu_z)^2\}$ \pause \end{itemize} \item These are all \emph{population} moments. \end{itemize} \end{frame} \begin{frame} \frametitle{Population moments and sample moments} % \framesubtitle{Assume $X_i$ and $Y_i$ values can be observed.} \begin{center} \renewcommand{\arraystretch}{1.5} \begin{tabular}{ll} \hline Population moment & Sample moment \\ \hline \pause $E\{X_i\}$ & $\frac{1}{n}\sum_{i=1}^n X_i$ \\ \pause $E\{X_i^2\}$ & $\frac{1}{n}\sum_{i=1}^n X_i^2$ \\ \pause $E\{X_iY_i\}$ & $\frac{1}{n}\sum_{i=1}^n X_iY_i$ \\ \pause $E\{(X_i-\mu_x)^2\}$ & $\frac{1}{n}\sum_{i=1}^n (X_i-\overline{X}_n)^2$ \\ \pause $E\{(X_i-\mu_x)(Y_i-\mu_y)\}$ & $\frac{1}{n}\sum_{i=1}^n (X_i-\overline{X}_n)(Y_i-\overline{Y}_n)$ \\ \pause $E\{(X_i-\mu_x)(Y_i-\mu_y)^2\}$ & $\frac{1}{n}\sum_{i=1}^n (X_i-\overline{X}_n)(Y_i-\overline{Y}_n)^2$ \\ \end{tabular} \renewcommand{\arraystretch}{1.0} \end{center} \end{frame} \begin{frame} \frametitle{Estimation by the Method of Moments (MOM)} \framesubtitle{For the model $D \sim P_\theta, ~~~ \theta \in \Theta$} \begin{itemize} \pause \item Population moments 
are a function of $\theta$. \pause
\item Find $\theta$ as a function of the population moments. \pause
\item Estimate $\theta$ with that function of the \emph{sample} moments. \pause
\end{itemize}
Symbolically, \pause
\begin{itemize}
\item Let $m$ denote a vector of population moments. \pause
\item $\widehat{m}$ is the corresponding vector of sample moments. \pause
\item Find $m = g(\theta)$ \pause
\item Solve for $\theta$, obtaining $\theta= g^{-1}(m)$. \pause
\item Let $\widehat{\theta} = g^{-1}(\widehat{m})$. \pause
\end{itemize}
It doesn't matter if you solve first or put hats on first.
\end{frame}

\begin{frame}[fragile] % NOTE(review): [fragile] required because this frame now contains verbatim material
\frametitle{Example: $X_1, \ldots, X_n \stackrel{i.i.d}{\sim} U(0,\theta)$}
\framesubtitle{$f(x) = \frac{1}{\theta}$ for $0 < x \leq \theta$}
% NOTE(review): material is missing here -- the source jumps from this frame
% subtitle directly into R console output (everything between a "<" and the
% next ">" was swallowed, likely including the MOM example and following
% frames). Recover the lost frames from the original source file.
\begin{verbatim}
> ybar = 60/100; ybar
[1] 0.6
\end{verbatim}
%\verb:f(x):
\end{frame}

\begin{frame}
\frametitle{Maximum likelihood for the univariate normal}
%\framesubtitle{}
\pause
Let $X_1, \ldots, X_n \stackrel{i.i.d}{\sim} N(\mu,\sigma^2)$. \pause
\vspace{5mm}
\begin{eqnarray*}
\ell(\theta) & = & \ln \prod_{i=1}^n \frac{1}{\sigma\sqrt{2\pi}} e^{-\frac{1}{2}\frac{(x_i-\mu)^2}{\sigma^2}}\\ \pause
& = & \ln\left(\sigma^{-n}(2\pi)^{-\frac{n}{2}} e^{-\frac{1}{2\sigma^2} \sum_{i=1}^n(x_i-\mu)^2}\right) \\ \pause
& = & -n\ln\sigma - \frac{n}{2}\ln (2\pi) - \frac{1}{2\sigma^2}\sum_{i=1}^n(x_i-\mu)^2 \\
% \pause
% & = & -\frac{n}{2}\ln\sigma^2 - \frac{n}{2}\ln (2\pi) - \frac{1}{2\sigma^2}\sum_{i=1}^n(x_i-\mu)^2
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Differentiate with respect to the parameters}
\framesubtitle{$\ell(\theta) = -n\ln\sigma - \frac{n}{2}\ln (2\pi) - \frac{1}{2\sigma^2}\sum_{i=1}^n(x_i-\mu)^2$}
\pause
\begin{eqnarray*}
\frac{\partial\ell}{\partial\mu} & = & - \frac{1}{2\sigma^2}\sum_{i=1}^n2(x_i-\mu)(-1) \pause \stackrel{set}{=} 0 \\ \pause
& \Rightarrow & \mu= \overline{x} \\ \pause
&&\\
\frac{\partial\ell}{\partial\sigma} & = & \pause - \frac{n}{\sigma} - \frac{1}{2} \sum_{i=1}^n(x_i-\mu)^2 (-2\sigma^{-3}) \\ \pause
& = & -
\frac{n}{\sigma} + \frac{1}{\sigma^3} \sum_{i=1}^n(x_i-\mu)^2 \pause ~~ \stackrel{set}{=} ~~ 0 \\ \pause
& \Rightarrow & \sigma^2 = \frac{1}{n}\sum_{i=1}^n(x_i-\mu)^2
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Substituting}
%\framesubtitle{}
Setting derivatives to zero, we have obtained \pause
\begin{displaymath}
\mu = \overline{x} \mbox{ and } \sigma^2 = \frac{1}{n}\sum_{i=1}^n(x_i-\mu)^2, \mbox{ so}
\end{displaymath}
\pause
{\LARGE
\begin{eqnarray*}
\widehat{\mu} & = & \overline{X} \\ \pause
\widehat{\sigma}^2 & = & \frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})^2 \\
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Gamma Example}
%\framesubtitle{}
Let $X_1, \ldots, X_n$ be a random sample from a Gamma distribution with parameters $\alpha>0$ and $\beta>0$. \pause
{\LARGE
\begin{eqnarray*}
f(x;\alpha,\beta) & = & \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x/\beta} x^{\alpha - 1} \\ \pause
&& \\
\Theta & = & \{(\alpha,\beta): \alpha>0, \beta>0 \}
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Log Likelihood}
\framesubtitle{$f(x;\alpha,\beta) = \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x/\beta} x^{\alpha - 1}$}
\begin{eqnarray*}
\ell(\alpha,\beta) &=& \ln \prod_{i=1}^n \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x_i/\beta} x_i^{\alpha - 1} \nonumber \\ \pause
&=& \ln \left( \beta^{-n\alpha} \, \Gamma(\alpha)^{-n} \exp(-\frac{1}{\beta}\sum_{i=1}^n x_i) \left(\prod_{i=1}^n x_i \right)^{\alpha-1} \right) \\ \pause
&=& -n\alpha\ln\beta -n\ln\Gamma(\alpha) - \frac{1}{\beta}\sum_{i=1}^n x_i + (\alpha - 1) \sum_{i=1}^n \ln x_i
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Differentiate with respect to the parameters}
\framesubtitle{$\ell(\theta) = -n\alpha\ln\beta -n\ln\Gamma(\alpha) - \frac{1}{\beta}\sum_{i=1}^n x_i + (\alpha - 1) \sum_{i=1}^n \ln x_i$}
\pause
\begin{eqnarray*}
\frac{\partial\ell}{\partial\beta} & \stackrel{set}{=} & 0~~ \pause \Rightarrow ~~ \alpha\beta = \overline{x} \\ \pause
&&\\
\frac{\partial\ell}{\partial\alpha} & = & -n\ln\beta -n \frac{\partial}{\partial\alpha} \ln \Gamma(\alpha) + \sum_{i=1}^n \ln x_i \\ \pause & = & \sum_{i=1}^n \ln x_i -n\ln\beta - n\frac{\Gamma^\prime(\alpha)}{\Gamma(\alpha)} \pause ~~\stackrel{set}{=}~~0 \\ \end{eqnarray*} \end{frame} \begin{frame} \frametitle{Solve for $\alpha$} %\framesubtitle{} {\LARGE \begin{displaymath} \sum_{i=1}^n \ln x_i -n\ln\beta - n\frac{\Gamma^\prime(\alpha)}{\Gamma(\alpha)} = 0 \end{displaymath} \pause } % End size \vspace{8mm} where \begin{displaymath} \Gamma(\alpha) = \int_0^\infty e^{-t}t^{\alpha-1} \, dt. \end{displaymath} \pause \vspace{8mm} Nobody can do it. \end{frame} \begin{frame} \frametitle{Maximize the likelihood numerically with software} \framesubtitle{Usually this is in high dimension} \begin{center} \includegraphics[width=4in]{Likelihood} \end{center} \pause \begin{itemize} \item It's like trying to find the top of a mountain by walking uphill blindfolded. \pause \item You might stop at a local maximum. \pause \item The starting place is very important. \pause \item The final answer is a number (or vector of numbers). \pause \item There is no explicit formula for the MLE. \end{itemize} \end{frame} \begin{frame} \frametitle{There is a lot of useful theory} \framesubtitle{Even without an explicit formula for the MLE} \begin{center} \includegraphics[width=4in]{Likelihood} \end{center} \pause {\footnotesize \begin{itemize} \item MLE is asymptotically normal. \pause \item Variance of the MLE is deeply related to the curvature of the log likelihood at the MLE. \pause \item The more curvature, the smaller the variance. \pause \item The variance of the MLE can be estimated from the curvature (using the Fisher Information). \pause \item Basis of tests and confidence intervals. \end{itemize} } % End size \end{frame} \begin{frame} \frametitle{Comparing MOM and MLE} \pause %\framesubtitle{} \begin{itemize} \item Sometimes they are identical, sometimes not. 
\pause \item If the model is right they are usually close for large samples. \pause \item Both are asymptotically normal. \pause \item Estimates of the variance are easy to obtain for both. \pause \item Small variance of an estimator is good. \pause \item As $n \rightarrow \infty$, nothing can beat the MLE. \pause \item Except that the MLE depends on a very specific distribution. \pause \item And sometimes the dependence matters. \pause \item In such cases, MOM is preferable. \end{itemize} \end{frame} \section{Invariance} \begin{frame} \frametitle{The Invariance principle of maximum likelihood estimation} \framesubtitle{Also applies to Method of Moments estimation} \pause \begin{itemize} \item The Invariance Principle of maximum likelihood estimation says that \emph{the MLE of a function is that function of the MLE, \pause provided the function is one-to-one.} \pause \item An example comes first, followed by formal details. \end{itemize} \end{frame} \begin{frame} \frametitle{Example} \framesubtitle{Of the invariance principle} Let $D_1, \ldots, D_n$ be a random sample from a Bernoulli distribution (1=Yes, 0=No) with parameter $\theta, 0<\theta<1$. \pause The parameter space is $\Theta = (0,1)$, \pause and the likelihood function is \pause \begin{displaymath} L(\theta) = \prod_{i=1}^n \theta^{d_i} (1-\theta)^{1-d_i} = \theta^{\sum_{i=1}^n d_i} (1-\theta)^{n-\sum_{i=1}^n d_i}. \end{displaymath} \pause Differentiating the log likelihood with respect to $\theta$, setting the derivative to zero and solving yields the usual estimate $\widehat{\theta} = \overline{d}$, the sample proportion. \end{frame} \begin{frame} \frametitle{Re-parameterize} %\framesubtitle{} \begin{itemize} \item Write the model in terms of the \emph{odds} of $D_i=1$, \pause a re-parameterization that is often useful in categorical data analysis. \pause \item Denote the odds by $\theta^\prime$. 
\pause
\item The definition of odds is \pause
\end{itemize}
\begin{equation*}\label{odds}
\theta^\prime = \frac{\theta}{1-\theta} = g(\theta).
\end{equation*}
\pause
\begin{itemize}
\item As $\theta$ ranges from zero to one, $\theta^\prime$ ranges from zero to infinity. \pause
\item So there is a new parameter space: $\theta^\prime \in \Theta^\prime = (0,\infty)$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Likelihood function in terms of $\theta^\prime = \frac{\theta}{1-\theta}$}
\pause
% \framesubtitle{}
First solve for $\theta$, obtaining $\theta = \frac{\theta^\prime}{1+\theta^\prime} = g^{-1}(\theta^\prime)$. \pause
The likelihood in terms of $\theta^\prime$ is then \pause
\begin{eqnarray*}\label{oddlike}
L(g^{-1}(\theta^\prime)) &=& \theta^{\sum_{i=1}^n d_i} (1-\theta)^{n-\sum_{i=1}^n d_i} \\ \pause
&=& \left( \frac{\theta^\prime}{1+\theta^\prime}\right)^{\sum_{i=1}^n d_i} \left(1 - \frac{\theta^\prime}{1+\theta^\prime}\right)^{n-\sum_{i=1}^n d_i} \\ \pause
&=& \left( \frac{\theta^\prime}{1+\theta^\prime}\right)^{\sum_{i=1}^n d_i} \left(\frac{1+\theta^\prime - \theta^\prime}{1+\theta^\prime}\right)^{n-\sum_{i=1}^n d_i} \\ \pause
&=& \frac{ \theta^{\prime\sum_{i=1}^n d_i} }{ (1+\theta^\prime)^n }.
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{$L(g^{-1}(\theta^\prime)) = L^\prime(\theta^\prime) = \frac{ \theta^{\prime\sum_{i=1}^n d_i} } { (1+\theta^\prime)^n }$}
\framesubtitle{See how re-parameterization changes the likelihood function}
\pause
\begin{itemize}
\item Could differentiate the log likelihood, set the derivative to zero, and solve for $\theta^\prime$. \pause
\item The point of the invariance principle is that this is unnecessary. \pause
\item The maximum likelihood estimator of $g(\theta) = \frac{\theta}{1-\theta}$ is $g(\widehat{\theta})$, \pause so that
\end{itemize}
\begin{displaymath}
\widehat{\theta^\prime} = \frac{\widehat{\theta}} {1-\widehat{\theta}} = \frac{\overline{d}} {1-\overline{d}} ~.
\end{displaymath} \end{frame} \begin{frame} \frametitle{Theorem} \framesubtitle{See text for a proof. The one-to-one part is critical.} \pause Let $g: \Theta \rightarrow \Theta^\prime$ be a one-to-one re-parameterization, with the maximum likelihood estimate $\widehat{\theta}$ satisfying $L(\widehat{\theta}) > L(\theta)$ for all $\theta \in \Theta$ with $\theta \neq \widehat{\theta}$. Then $L^\prime(g(\widehat{\theta})) > L^\prime(\theta^\prime)$ for all $\theta^\prime \in \Theta^\prime$ with $\theta^\prime \neq g(\widehat{\theta})$. \pause \vspace{3mm} In other words \pause \begin{itemize} \item The MLE of $g(\theta)$ is $g(\widehat{\theta})$. \pause \item $\widehat{g(\theta)} = g(\widehat{\theta})$. \pause \item The MLE of $\theta^\prime$ is $g(\widehat{\theta})$. \pause \item $\widehat{\theta}^\prime = g(\widehat{\theta})$. \end{itemize} \end{frame} \begin{frame} \frametitle{Re-parameterization in general} %\framesubtitle{} The parameters of common statistical models are written in a standard way, % based on on meaningfulness, convenience and tradition, but other equivalent parameterizations are sometimes useful. \pause Suppose $X_i \sim N(\mu,\sigma^2)$. \pause Have \begin{displaymath} \widehat{\theta} = (\overline{X}, \frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})^2) \end{displaymath} \pause \begin{itemize} \item Write $X_i \sim N(\mu,\sigma)$. \pause \begin{itemize} \item $g(\theta) = (\theta_1,\sqrt{\theta_2})$ \pause \item $\widehat{\theta}^\prime = \left(\overline{X}, \sqrt{\frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})^2}\right)$ \pause \end{itemize} \item Write $X_i \sim N(\mu,\tau)$, \pause where $\tau = 1/\sigma^2$ \pause is called the \emph{precision}. 
\pause \begin{itemize} \item $g(\theta) = (\theta_1,1/\theta_2)$ \pause \item $\widehat{\theta}^\prime = \left(\overline{X}, \frac{n}{\sum_{i=1}^n(X_i-\overline{X})^2}\right)$ \end{itemize} \end{itemize} \end{frame} \section{Consistency} \begin{frame} \frametitle{Consistency} %\framesubtitle{} \begin{itemize} \item The idea is large-sample accuracy. \pause \item As $n \rightarrow \infty$, you get the truth. \pause \item It's a kind of limit, but with probability involved. % \item It's the least you can ask. \end{itemize} \end{frame} \begin{frame} \frametitle{The setting} %\framesubtitle{} \begin{itemize} \item Let $T_1, T_2, \ldots$ be a sequence of random variables. \pause \item Main application: $T_n$ is an estimator of $\theta$ based on a sample of size $n$. \pause \item Think $T_n = \overline{X}_n = \frac{1}{n}\sum_{i=1}^nX_i$. \pause \item Generalize to random vectors, soon. \end{itemize} \end{frame} \begin{frame} \frametitle{Convergence in Probability} We say that $T_n$ converges \emph{in probability} to the constant $\theta$, and write $T_n \stackrel{p}{\rightarrow} \theta$ if \pause for all $\epsilon>0$, \pause {\LARGE \begin{displaymath} \lim_{n \rightarrow \infty} P\{|T_n-\theta|<\epsilon \}=1 \end{displaymath} } \pause Convergence in probability to $\theta$ means \pause no matter how small the interval around $\theta$, \pause for large enough $n$ (that is, for all $n>N$) \pause the probability of getting a value of $T_n$ that near to $\theta$ (or nearer) is as close to one as you like. 
\end{frame}

\begin{frame}
\frametitle{Picture it}
%\framesubtitle{}
{\small
\begin{eqnarray*}
P\{|T_n-\theta|<\epsilon\} \pause & = & P\{-\epsilon < T_n-\theta < \epsilon\} \\ \pause
& = & P\{\theta-\epsilon < T_n < \theta+\epsilon\} \pause
\end{eqnarray*}
} % End size
\begin{center}
\includegraphics[width=2.5in]{convergence1}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Picture it} % Superimpose the second layer, another density
%\framesubtitle{}
{\small
\begin{eqnarray*}
P\{|T_n-\theta|<\epsilon\} & = & P\{-\epsilon < T_n-\theta < \epsilon\} \\
& = & P\{\theta-\epsilon < T_n < \theta+\epsilon\}
\end{eqnarray*}
} % End size
\begin{center}
\includegraphics[width=2.5in]{convergence2}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Convergence in Probability for Random Vectors}
%\framesubtitle{}
Let $\mathbf{T}_1, \mathbf{T}_2, \ldots$ be a sequence of $k$-dimensional random vectors. \pause
We say that $\mathbf{T}_n$ converges in probability to $\boldsymbol{\theta} \in \mathbb{R}^k$, and write $\mathbf{T}_n \stackrel{p}{\rightarrow} \boldsymbol{\theta}$ if for all $\epsilon>0$, \pause
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{||\mathbf{T}_n-\boldsymbol{\theta}||<\epsilon \}=1,
\end{displaymath}
} \pause
where $||\mathbf{a}-\mathbf{b}||$ denotes Euclidean distance in $\mathbb{R}^k$.
\end{frame}

\begin{frame}
\frametitle{Use theorems, not the definition}
%\framesubtitle{}
\begin{itemize}
\item In this class we will \emph{not} use the definition of convergence in probability. \pause
\item We will use theorems instead.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The Law of Large Numbers}
Let $X_1, X_2, \ldots$ be independent random variables from a distribution with expected value $\mu$.
\pause The Law of Large Numbers says \pause {\huge \begin{displaymath} \overline{X}_n \stackrel{p}{\rightarrow} \mu \end{displaymath} } \end{frame} \begin{frame} \frametitle{The Change of Variables formula: Let $Y = g(X)$} \pause %\framesubtitle{} {\LARGE \begin{displaymath} E(Y) = \int_{-\infty}^\infty y \, f_{_Y}(y) \, dy = \int_{-\infty}^\infty g(x) \, f_{_X}(x) \, dx \end{displaymath} } \pause Or, for discrete random variables \pause {\LARGE \begin{displaymath} E(Y) = \sum_y y \, p_{_Y}(y) = \sum_x g(x) \, p_{_X}(x) \end{displaymath} } \pause This is actually a big theorem, not a definition. \end{frame} \begin{frame} \frametitle{Applying the change of variables formula} \framesubtitle{To approximate $E[g(X)]$} \pause {\LARGE \begin{eqnarray*} \frac{1}{n}\sum_{i=1}^n g(X_i) &=& \frac{1}{n}\sum_{i=1}^n Y_i \pause \stackrel{p}{\rightarrow} E(Y) \\ \\ \pause &=& E(g(X)) \end{eqnarray*} } \end{frame} \begin{frame} \frametitle{So for example} %\framesubtitle{} {\LARGE \begin{eqnarray*} \frac{1}{n}\sum_{i=1}^n X_i^k &\stackrel{p}{\rightarrow}& E(X^k) \\ &&\\ \pause \frac{1}{n}\sum_{i=1}^n U_i^2 V_i W_i^3 &\stackrel{p}{\rightarrow}& E(U^2VW^3) \end{eqnarray*} }\pause \vspace{2mm} \begin{itemize} \item That is, sample moments converge in probability to population moments. \pause \item Central sample moments converge to central population moments as well. \end{itemize} \end{frame} \begin{frame} \frametitle{Two more Theorems} \pause %\framesubtitle{} \begin{itemize} \item The ``stack" theorem and continuous mapping. \pause \item Often used together. \end{itemize} \end{frame} \begin{frame} \frametitle{The ``Stack" Theorem} \framesubtitle{Because I don't know what to call it.} Let $\mathbf{X}_n \stackrel{p}{\rightarrow} \mathbf{x}$ and $\mathbf{Y}_n \stackrel{p}{\rightarrow} \mathbf{y}$. 
Then the partitioned random vector %{\LARGE \begin{displaymath} \left( \begin{array}{cc} \mathbf{X}_n \\ \mathbf{Y}_n \end{array} \right) \stackrel{p}{\rightarrow} \left( \begin{array}{cc} \mathbf{x} \\ \mathbf{y} \end{array} \right) \end{displaymath} %} % End size \end{frame} \begin{frame} \frametitle{Continuous mapping} \framesubtitle{One of the Slutsky lemmas} \pause Let $\mathbf{T}_n \stackrel{p}{\rightarrow} \mathbf{t}$, \pause and let the function $g(\mathbf{x})$ be continuous at $\mathbf{x}=\mathbf{t}$. Then \pause {\LARGE \begin{displaymath} g(\mathbf{T}_n) \stackrel{p}{\rightarrow} g(\mathbf{t}) \end{displaymath} }% End size \pause \vspace{5mm} Note that the function $g$ could be multidimensional, for example mapping $\mathbb{R}^5$ into $\mathbb{R}^2$. \end{frame} \begin{frame} \frametitle{Definition of Consistency} \pause %\framesubtitle{} The random vector (of statistics) $\mathbf{T}_n$ is said to be a \emph{consistent} estimator of the parameter vector $\boldsymbol{\theta}$ if \pause {\LARGE \begin{displaymath} \mathbf{T}_n \stackrel{p}{\rightarrow} \boldsymbol{\theta} \end{displaymath} \pause } % End size for all $\boldsymbol{\theta} \in \Theta$. \end{frame} \begin{frame} \frametitle{Consistency of the Sample Variance } \framesubtitle{This answer gets full marks.}\pause {\small \begin{displaymath} \widehat{\sigma}^2_n = \frac{1}{n}\sum_{i=1}^n (X_i-\overline{X})^2 \pause = \frac{1}{n}\sum_{i=1}^n X_i^2 - \overline{X}^2 \end{displaymath} \pause \vspace{3mm} By LLN, $\overline{X}_n \stackrel{p}{\rightarrow}\mu$ \pause and $\frac{1}{n}\sum_{i=1}^n X_i^2 \stackrel{p}{\rightarrow} E(X^2) \pause = \sigma^2+\mu^2$. \pause \vspace{3mm} By continuous mapping, \pause \begin{displaymath} \widehat{\sigma}^2_n = \frac{1}{n}\sum_{i=1}^n X_i^2 - \overline{X}^2 \pause \stackrel{p}{\rightarrow} \pause \sigma^2+\mu^2 - \mu^2 = \sigma^2 \end{displaymath} \pause \vspace{3mm} Note the silent use of the Stack Theorem. 
} % End size
\end{frame}

\begin{frame}
\frametitle{Method of Moments Estimators are Consistent}
\framesubtitle{For most practical cases}
\pause
Recall
\begin{itemize}
\item Let $m$ denote a vector of population moments. \pause
\item $\widehat{m}$ is the corresponding vector of sample moments. \pause
\item Find $m = g(\theta)$ \pause
\item Solve for $\theta$, obtaining $\theta= g^{-1}(m)$. \pause
\item Let $\widehat{\theta}_n = g^{-1}(\widehat{m}_n)$. \pause
\end{itemize}
\vspace{3mm}
If $g$ is continuous, so is $g^{-1}$. \pause
Then by continuous mapping, $\widehat{m} \stackrel{p}{\rightarrow} m \pause \Rightarrow \widehat{\theta}_n = g^{-1}(\widehat{m}_n) \stackrel{p}{\rightarrow} \pause g^{-1}(m) = \pause \theta$.
\end{frame}

\begin{frame}
\frametitle{Consistency is great but it's not enough.}
\begin{itemize}
\item It's the least we can ask. Estimators that are \emph{not} consistent are completely unacceptable for most purposes. \pause
\item Think of $a_n = 1/n$ as a sequence of degenerate random variables with $P\{a_n = 1/n\}=1$. \pause
\item So, $a_n \stackrel{p}{\rightarrow} 0$. \pause
\end{itemize}
\vspace{5mm}
{\LARGE
\begin{displaymath}
T_n \stackrel{p}{\rightarrow} \theta \pause \Rightarrow \pause U_n = T_n + \frac{100,000,000}{n} \pause \stackrel{p}{\rightarrow} \theta.
\end{displaymath}
}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/~brunner/oldclass/431s17} {\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s17}} \end{frame} \end{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{itemize} \item \item \item \end{itemize} \end{frame} % \stackrel{c}{\mathbf{X}} \stackrel{\top}{\vphantom{r}_i} # Picture of convergence in probability, using R # Layer 1 rm(list=ls()) x = seq(from=-3,to=3,by=0.01) sigma=1/2; raise=0.25; z=1.3 Density = dnorm(x,mean=0,sd=sigma)+raise plot(x,Density,axes=F,xlab='',ylab='',pch=' ') lines(x,dnorm(x)+raise,lty=2) lines(c(-3,3),c(raise,raise),lty=1) text(x=0,y=raise-0.015,expression(theta)) text(x=-z+.20,y=raise-0.015,expression(paste("(",theta-epsilon))) text(x= z-.20,y=raise-0.015,expression(paste(theta+epsilon,")"))) lines(c(-z,-z),c(raise,raise+dnorm(-z)),lty=2) lines(c(z,z),c(raise,raise+dnorm(z)),lty=2) text(x=0,y=dnorm(0,sd=sigma)+raise,".") # Guide for cropping # Save this picture as convergence1.pdf, and proceed ######################################################################### # Layer 2 lines(x,Density,lty=1) lines(c(-z,-z),c(raise,raise+dnorm(-z,sd=sigma)),lty=1) lines(c(z,z),c(raise,raise+dnorm(z,sd=sigma)),lty=1) # Save as convergence2.pdf # Old # Picture of convergence in probability, using R rm(list=ls()) x = seq(from=-3,to=3,by=0.01) sigma=1/2; raise=0.25; z=1.3 Density = dnorm(x,mean=0,sd=sigma)+raise plot(x,Density,type='l',axes=F,xlab='',ylab='') lines(c(-3,3),c(raise,raise),lty=1) text(x=0,y=raise-0.015,expression(theta)) lines(c(-z,-z),c(raise,raise+dnorm(-z,sd=sigma)),lty=1) text(x=-z+.20,y=raise-0.015,expression(paste("(",theta-epsilon))) lines(c(z,z),c(raise,raise+dnorm(z,sd=sigma)),lty=1) text(x= z-.20,y=raise-0.015,expression(paste(theta+epsilon,")")))