% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Statistical models and estimation\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2015}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Models}

\begin{frame}
\frametitle{Statistical model}
\framesubtitle{Most good statistical analyses are based on a \emph{model} for the data.}
\pause
A \emph{statistical model} is a set of assertions that partly specify the probability distribution of the observable data. The specification may be direct or indirect.
\pause
\begin{itemize}
\item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. \pause
\item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where\pause
\begin{itemize}
\item[] $\beta_0, \ldots, \beta_k$ are unknown constants.
\item[] $x_{ij}$ are known constants.
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables.
\item[] $\sigma^2$ is an unknown constant.
\item[] $Y_1, \ldots, Y_n$ are observable random variables.
\end{itemize}
\end{itemize}
\pause
A model is not the same thing as the \emph{truth}.
\end{frame}

\begin{frame}
\frametitle{Statistical models leave something unknown}
\framesubtitle{Otherwise they are probability models}
\pause
\begin{itemize}
\item The unknown part of the model for the data is called the \emph{parameter}. \pause
\item Usually, parameters are (vectors of) numbers. \pause
\item Usually denoted by $\theta$ or $\boldsymbol{\theta}$ or other Greek letters. \pause
\item Parameters are unknown constants.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Parameter Space}
The \emph{parameter space} is the set of values that can be taken on by the parameter.
\pause
\begin{itemize}
\item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. \pause
The parameter space is $\Theta = \{(\mu,\sigma^2): -\infty < \mu < \infty, \sigma^2 > 0\}$. \pause
\item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where
\begin{itemize}
\item[] $\beta_0, \ldots, \beta_k$ are unknown constants.
\item[] $x_{ij}$ are known constants.
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables.
\item[] $\sigma^2$ is an unknown constant.
\item[] $Y_1, \ldots, Y_n$ are observable random variables.
\end{itemize}
\pause
The parameter space is $\Theta = \{(\beta_0, \ldots, \beta_k, \sigma^2): -\infty < \beta_j < \infty, \sigma^2 > 0\}$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Parameters need not be numbers}
%\framesubtitle{}
Let $X_1, \ldots, X_n$ be a random sample from a continuous distribution with unknown distribution function $F(x)$.
\pause
\begin{itemize}
\item The parameter is the unknown distribution function $F(x)$. \pause
\item The parameter space is a space of distribution functions. \pause
\item We may be interested only in a \emph{function} of the parameter, like \pause
\end{itemize}
\begin{displaymath}
\mu = \int_{-\infty}^\infty x f(x) \, dx,
\end{displaymath}
where $f(x)$ is the density corresponding to $F(x)$.
\pause
\vspace{3mm}

The rest of $F(x)$ is just a nuisance parameter.
\end{frame}

\begin{frame}
\frametitle{General statement of a statistical model}
\framesubtitle{$D$ is for Data}
{\LARGE
\begin{displaymath}
D \sim P_\theta, ~~~ \theta \in \Theta
\end{displaymath}
} % End size
\vspace{3mm}
\pause
\begin{itemize}
\item Both $D$ and $\theta$ could be vectors. \pause
\item For example,
\begin{itemize}
\item $D = \mathbf{Y}_1, \ldots, \mathbf{Y}_n$ independent multivariate normal. \pause
\item $\theta = (\boldsymbol{\mu,\Sigma})$. \pause
\item $P_\theta$ is the joint distribution function of $\mathbf{Y}_1, \ldots, \mathbf{Y}_n$, with joint density \pause
\end{itemize}
\end{itemize}
\begin{displaymath}
f(\mathbf{y}_1, \ldots, \mathbf{y}_n) = \prod_{i=1}^n f(\mathbf{y}_i;\boldsymbol{\mu,\Sigma})
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Estimation}
\framesubtitle{For the model $D \sim P_\theta, ~~~ \theta \in \Theta$}
\begin{itemize}
\item We don't know $\theta$. \pause
\item We never know $\theta$. \pause
\item All we can do is guess. \pause
\item Estimate $\theta$ (or a function of $\theta$) based on the observable data. \pause
\item $T$ is an \emph{estimator} of $\theta$ (or a function of $\theta$): $T=T(D)$. \pause
\end{itemize}
For example,
\begin{itemize}
\item $D = X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} N(\mu,\sigma^2)$, $T = (\overline{X},S^2)$. \pause
\item For an ordinary multiple regression model, $T=(\widehat{\boldsymbol{\beta}},MSE)$. \pause
\end{itemize}
$T$ is a \emph{statistic}, a random variable (vector) that can be computed from the data without knowing the values of any unknown parameters.
\end{frame}

\section{MOM}

\begin{frame}
\frametitle{Parameter estimation}
\framesubtitle{For the model $D \sim P_\theta, ~~~ \theta \in \Theta$}
\begin{itemize}
\item Estimate $\theta$ with $T=T(D)$. \pause
\item How do we get a recipe for $T$? \pause
\item It's good to be systematic. Lots of methods are available. \pause
\item We will consider two: Method of moments and maximum likelihood.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Moments}
\framesubtitle{Based on a random sample like $(X_1,Y_1), \ldots, (X_n,Y_n)$}
\pause
\begin{itemize}
\item Moments are quantities like $E\{X_i\}$, $E\{X_i^2\}$, $E\{X_iY_i\}$, $E\{W_iX_i^2Y_i^3\}$, etc. \pause
\item \emph{Central} moments are moments of \emph{centered} random variables: \pause
\begin{itemize}
\item[] $E\{(X_i-\mu_x)^2\}$ \pause
\item[] $E\{(X_i-\mu_x)(Y_i-\mu_y)\}$ \pause
\item[] $E\{(X_i-\mu_x)^2(Y_i-\mu_y)^3(Z_i-\mu_z)^2\}$ \pause
\end{itemize}
\item These are all \emph{population} moments.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Population moments and sample moments}
% \framesubtitle{Assume $X_i$ and $Y_i$ values can be observed.}
\begin{center}
\renewcommand{\arraystretch}{1.5}
\begin{tabular}{ll} \hline
Population moment & Sample moment \\ \hline
$E\{X_i\}$ & $\frac{1}{n}\sum_{i=1}^n X_i$ \\ \pause
$E\{X_i^2\}$ & $\frac{1}{n}\sum_{i=1}^n X_i^2$ \\ \pause
$E\{X_iY_i\}$ & $\frac{1}{n}\sum_{i=1}^n X_iY_i$ \\ \pause
$E\{(X_i-\mu_x)^2\}$ & $\frac{1}{n}\sum_{i=1}^n (X_i-\overline{X}_n)^2$ \\ \pause
$E\{(X_i-\mu_x)(Y_i-\mu_y)\}$ & $\frac{1}{n}\sum_{i=1}^n (X_i-\overline{X}_n)(Y_i-\overline{Y}_n)$ \\ \pause
$E\{(X_i-\mu_x)(Y_i-\mu_y)^2\}$ & $\frac{1}{n}\sum_{i=1}^n (X_i-\overline{X}_n)(Y_i-\overline{Y}_n)^2$ \\
\end{tabular}
\renewcommand{\arraystretch}{1.0}
\end{center}
\end{frame}
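
\begin{frame}[fragile]
\frametitle{Sample moments in R: a small sketch}
\framesubtitle{Sample moments are just averages}
A minimal sketch with simulated data; the seed, the sample size and the distributions are arbitrary choices made only for illustration.
{\footnotesize
\begin{verbatim}
set.seed(431)                      # illustrative settings only
x = rnorm(100, mean = 10, sd = 2)  # simulated data
y = x + rnorm(100)
mean(x)                            # estimates E{X_i}
mean(x^2)                          # estimates E{X_i^2}
mean(x*y)                          # estimates E{X_i Y_i}
mean( (x-mean(x)) * (y-mean(y)) )  # estimates E{(X_i-mu_x)(Y_i-mu_y)}
\end{verbatim}
} % End size
Note the divisor in the last line is $n$, not the $n-1$ used by \texttt{var} and \texttt{cov}.
\end{frame}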
\begin{frame}
\frametitle{Estimation by the Method of Moments (MOM)}
\framesubtitle{For the model $D \sim P_\theta, ~~~ \theta \in \Theta$}
\begin{itemize} \pause
\item Population moments are a function of $\theta$. \pause
\item Find $\theta$ as a function of the population moments. \pause
\item Estimate $\theta$ with that function of the \emph{sample} moments. \pause
\end{itemize}
Symbolically, \pause
\begin{itemize}
\item Let $m$ denote a vector of population moments. \pause
\item $\widehat{m}$ is the corresponding vector of sample moments. \pause
\item Find $m = g(\theta)$ \pause
\item Solve for $\theta$, obtaining $\theta= g^{-1}(m)$. \pause
\item Let $\widehat{\theta} = g^{-1}(\widehat{m})$. \pause
\end{itemize}
It doesn't matter if you solve first or put hats on first.
\end{frame}

\begin{frame}
\frametitle{Example: $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} U(0,\theta)$}
\framesubtitle{$f(x) = \frac{1}{\theta}$ for $0 < x < \theta$}
\pause
\begin{itemize}
\item The first population moment is $m = E(X_i) = \frac{\theta}{2}$. \pause
\item Solving for the parameter gives $\theta = g^{-1}(m) = 2m$. \pause
\item So the Method of Moments estimator is $\widehat{\theta} = 2\,\overline{X}_n$.
\end{itemize}
\end{frame}

\section{MLE}

\begin{frame}
\frametitle{Maximum likelihood}
\framesubtitle{For the model $D \sim P_\theta, ~~~ \theta \in \Theta$}
\pause
\begin{itemize}
\item The \emph{likelihood} $L(\theta)$ is the probability (or probability density) of the observed data, viewed as a function of the parameter. \pause
\item The \emph{maximum likelihood estimate} (MLE) $\widehat{\theta}$ is the value of $\theta \in \Theta$ at which $L(\theta)$ is greatest. \pause
\item It is usually easier to maximize the log likelihood $\ell(\theta) = \ln L(\theta)$, which has its maximum at the same place.
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Example: $Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim}$ Bernoulli$(\theta)$}
\framesubtitle{The MLE is the sample proportion}
\pause
\begin{displaymath}
L(\theta) = \prod_{i=1}^n \theta^{y_i}(1-\theta)^{1-y_i}
          = \theta^{\sum_{i=1}^n y_i}(1-\theta)^{n-\sum_{i=1}^n y_i},
\end{displaymath}
which is maximized at $\widehat{\theta} = \overline{y}$. \pause
For example, with 60 successes in $n=100$ trials,
\begin{verbatim}
> ybar = 60/100; ybar
[1] 0.6
\end{verbatim}
%\verb:f(x):
\end{frame}
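
\begin{frame}[fragile]
\frametitle{The MOM recipe in R: a small sketch}
\framesubtitle{For the $U(0,\theta)$ example: $m = E(X_i) = \theta/2$, so $\widehat{\theta} = 2\,\overline{X}_n$}
A minimal sketch; the seed, the sample size and the ``true'' $\theta$ are arbitrary choices made only for illustration.
{\footnotesize
\begin{verbatim}
set.seed(431)             # illustrative seed
theta = 4                 # "true" value, for illustration only
x = runif(50, 0, theta)   # simulated data
thetahat = 2 * mean(x)    # solve m = theta/2, then put on hats
thetahat
\end{verbatim}
} % End size
The last line prints the estimate, which typically lands near the true value for samples this large.
\end{frame}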
\begin{frame}
\frametitle{Maximum likelihood for the univariate normal}
%\framesubtitle{}
\pause
Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} N(\mu,\sigma^2)$.
\pause
\vspace{5mm}
\begin{eqnarray*}
\ell(\theta) & = & \ln \prod_{i=1}^n \frac{1}{\sigma\sqrt{2\pi}}
      e^{-\frac{1}{2}\frac{(x_i-\mu)^2}{\sigma^2}}\\ \pause
& = & \ln\left(\sigma^{-n}(2\pi)^{-\frac{n}{2}}
      e^{-\frac{1}{2\sigma^2} \sum_{i=1}^n(x_i-\mu)^2}\right) \\ \pause
& = & -n\ln\sigma - \frac{n}{2}\ln (2\pi) - \frac{1}{2\sigma^2}\sum_{i=1}^n(x_i-\mu)^2
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Differentiate with respect to the parameters}
\framesubtitle{$\ell(\theta) = -n\ln\sigma - \frac{n}{2}\ln (2\pi) - \frac{1}{2\sigma^2}\sum_{i=1}^n(x_i-\mu)^2$}
\pause
\begin{eqnarray*}
\frac{\partial\ell}{\partial\mu} & = & - \frac{1}{2\sigma^2}\sum_{i=1}^n2(x_i-\mu)(-1)
      \pause \stackrel{set}{=} 0 \\ \pause
& \Rightarrow & \mu= \overline{x} \\ \pause
&&\\
\frac{\partial\ell}{\partial\sigma} & = & \pause - \frac{n}{\sigma}
      - \frac{1}{2} \sum_{i=1}^n(x_i-\mu)^2 (-2\sigma^{-3}) \\ \pause
& = & - \frac{n}{\sigma} + \frac{1}{\sigma^3} \sum_{i=1}^n(x_i-\mu)^2
      \pause ~~ \stackrel{set}{=} ~~ 0 \\ \pause
& \Rightarrow & \sigma^2 = \frac{1}{n}\sum_{i=1}^n(x_i-\mu)^2
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Substituting}
%\framesubtitle{}
Setting derivatives to zero, we have obtained
\pause
\begin{displaymath}
\mu = \overline{x} \mbox{ and } \sigma^2 = \frac{1}{n}\sum_{i=1}^n(x_i-\mu)^2, \mbox{ so}
\end{displaymath}
\pause
{\LARGE
\begin{eqnarray*}
\widehat{\mu} & = & \overline{X} \\ \pause
\widehat{\sigma}^2 & = & \frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})^2 \\
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Gamma Example}
%\framesubtitle{}
Let $X_1, \ldots, X_n$ be a random sample from a Gamma distribution with parameters $\alpha>0$ and $\beta>0$.
\pause
{\LARGE
\begin{eqnarray*}
f(x;\alpha,\beta) & = & \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x/\beta} x^{\alpha - 1} \\ \pause
&& \\
\Theta & = & \{(\alpha,\beta): \alpha>0, \beta>0 \}
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Log Likelihood}
\framesubtitle{$f(x;\alpha,\beta) = \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x/\beta} x^{\alpha - 1}$}
\begin{eqnarray*}
\ell(\alpha,\beta) &=& \ln \prod_{i=1}^n \frac{1}{\beta^\alpha \Gamma(\alpha)}
      e^{-x_i/\beta} x_i^{\alpha - 1} \nonumber \\ \pause
&=& \ln \left( \beta^{-n\alpha} \, \Gamma(\alpha)^{-n}
      \exp(-\frac{1}{\beta}\sum_{i=1}^n x_i)
      \left(\prod_{i=1}^n x_i \right)^{\alpha-1} \right) \\ \pause
&=& -n\alpha\ln\beta -n\ln\Gamma(\alpha) - \frac{1}{\beta}\sum_{i=1}^n x_i
      + (\alpha - 1) \sum_{i=1}^n \ln x_i
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Differentiate with respect to the parameters}
\framesubtitle{$\ell(\alpha,\beta) = -n\alpha\ln\beta -n\ln\Gamma(\alpha) - \frac{1}{\beta}\sum_{i=1}^n x_i + (\alpha - 1) \sum_{i=1}^n \ln x_i$}
\pause
\begin{eqnarray*}
\frac{\partial\ell}{\partial\beta} & \stackrel{set}{=} & 0~~
      \pause \Rightarrow ~~ \alpha\beta = \overline{x} \\ \pause
&&\\
\frac{\partial\ell}{\partial\alpha} & = & -n\ln\beta
      -n \frac{\partial}{\partial\alpha} \ln \Gamma(\alpha) + \sum_{i=1}^n \ln x_i \\ \pause
& = & \sum_{i=1}^n \ln x_i -n\ln\beta - n\frac{\Gamma^\prime(\alpha)}{\Gamma(\alpha)}
      \pause ~~\stackrel{set}{=}~~0 \\
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Solve for $\alpha$}
%\framesubtitle{}
{\LARGE
\begin{displaymath}
\sum_{i=1}^n \ln x_i -n\ln\beta - n\frac{\Gamma^\prime(\alpha)}{\Gamma(\alpha)} = 0
\end{displaymath}
\pause
} % End size
\vspace{8mm}
where
\begin{displaymath}
\Gamma(\alpha) = \int_0^\infty e^{-t}t^{\alpha-1} \, dt.
\end{displaymath}
\pause
\vspace{8mm}

Nobody can do it.
\end{frame}
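
\begin{frame}[fragile]
\frametitle{Numerical maximum likelihood in R: a small sketch}
\framesubtitle{Minimize minus the log likelihood for the Gamma example}
A minimal sketch with simulated data; the seed, sample size and true parameter values are arbitrary, and a more careful version would explicitly keep $\alpha$ and $\beta$ positive.
{\footnotesize
\begin{verbatim}
set.seed(431)                        # illustrative settings only
x = rgamma(50, shape = 2, scale = 3) # stand-in (simulated) data
mloglik = function(theta, x)         # minus the log likelihood
  -sum(dgamma(x, shape = theta[1], scale = theta[2], log = TRUE))
xbar = mean(x); v = var(x)
start = c(xbar^2/v, v/xbar) # MOM start: alpha*beta = xbar, alpha*beta^2 = v
fit = optim(start, mloglik, x = x, hessian = TRUE)
fit$par             # numerical MLE of (alpha, beta)
solve(fit$hessian)  # estimated asymptotic covariance matrix of the MLE
\end{verbatim}
} % End size
The last line uses the curvature of the log likelihood at the maximum, anticipating the Fisher Information idea on the next slides.
\end{frame}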
\begin{frame}
\frametitle{Maximize the likelihood numerically with software}
\framesubtitle{Usually this is in high dimension}
\begin{center}
\includegraphics[width=4in]{Likelihood}
\end{center}
\pause
\begin{itemize}
\item It's like trying to find the top of a mountain by walking uphill blindfolded. \pause
\item You might stop at a local maximum. \pause
\item The starting place is very important. \pause
\item The final answer is a number (or vector of numbers). \pause
\item There is no explicit formula for the MLE.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{There is a lot of useful theory}
\framesubtitle{Even without an explicit formula for the MLE}
\begin{center}
\includegraphics[width=3in]{Likelihood}
\end{center}
\pause
\begin{itemize}
\item MLE is asymptotically normal. \pause
\item Variance of the MLE is deeply related to the curvature of the log likelihood at the MLE. \pause
\item The more curvature, the smaller the variance. \pause
\item The variance of the MLE can be estimated from the curvature (using the Fisher Information). \pause
\item Basis of tests and confidence intervals.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Comparing MOM and MLE} \pause
%\framesubtitle{}
\begin{itemize}
\item Sometimes they are identical, sometimes not. \pause
\item If the model is right they are usually close for large samples. \pause
\item Both are asymptotically normal. \pause
\item Estimates of the variance are well known for both. \pause
\item Small variance of an estimator is good. \pause
\item As $n \rightarrow \infty$, nothing can beat the MLE. \pause
\item Except that the MLE depends on a very specific distribution. \pause
\item And sometimes the dependence matters.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:

\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s15}
{\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s15}}

\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{}
%\framesubtitle{}
\begin{itemize}
\item
\item
\item
\end{itemize}
\end{frame}

% \stackrel{c}{\mathbf{X}} \stackrel{\top}{\vphantom{r}_i}