% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
\usepackage{alltt}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Statistical models and estimation\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2023}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Models}

\begin{frame}
\frametitle{Statistical model}
\framesubtitle{Most good statistical analyses are based on a \emph{model} for the data.}
\pause
A \emph{statistical model} is a set of assertions that partly specify the probability distribution of the observable data. The specification may be direct or indirect.
\pause
\begin{itemize}
\item Let $x_1, \ldots, x_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$.
\pause
\item For $i=1, \ldots, n$, let $y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where
\begin{itemize}
\item[] $\beta_0, \ldots, \beta_k$ are unknown constants.
\item[] $x_{ij}$ are known constants.
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables, not observable.
\item[] $\sigma^2$ is an unknown constant.
\item[] $y_1, \ldots, y_n$ are observable random variables.
\end{itemize}
\end{itemize}
\pause
A model is not the same thing as the \emph{truth}.
\end{frame}

\begin{frame}
\frametitle{Statistical models leave something unknown}
\framesubtitle{Otherwise they are probability models}
\begin{itemize}
\item The unknown part of the model is called the \emph{parameter}.
\pause
\item Usually, parameters are (vectors of) numbers.
\item Usually denoted by $\theta$ or $\boldsymbol{\theta}$ or other Greek letters.
\item In the non-Bayesian world, parameters are unknown constants.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Parameter Space}
The \emph{parameter space} is the set of values that can be taken on by the parameter.
\pause
\begin{itemize}
\item Let $x_1, \ldots, x_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. The parameter space is $\Theta = \{(\mu,\sigma^2): -\infty < \mu < \infty, \sigma^2 > 0\}$.
\pause
\item For $i=1, \ldots, n$, let $y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where
\begin{itemize}
\item[] $\beta_0, \ldots, \beta_k$ are unknown constants.
\item[] $x_{ij}$ are known constants.
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables.
\item[] $\sigma^2$ is an unknown constant.
\item[] $y_1, \ldots, y_n$ are observable random variables.
\end{itemize}
The parameter space is $\Theta = \{(\beta_0, \ldots, \beta_k, \sigma^2): -\infty < \beta_j < \infty, \sigma^2 > 0\}$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Parameters need not be numbers}
%\framesubtitle{}
Let $X_1, \ldots, X_n$ be a random sample from a continuous distribution with unknown distribution function $F(x)$.
\begin{itemize}
\item The parameter is the unknown distribution function $F(x)$.
\pause
\item The parameter space is a space of distribution functions.
\pause
\item We may be interested only in a \emph{function} of the parameter, like
\end{itemize}
\begin{displaymath}
\mu = \int_{-\infty}^\infty x f(x) \, dx,
\end{displaymath}
where $f(x) = F^\prime(x)$ is the density.
\vspace{3mm}

The rest of $F(x)$ is just a nuisance parameter.
\end{frame}

\begin{frame}
\frametitle{General statement of a statistical model}
\framesubtitle{$d$ is for Data}
{\LARGE
\begin{displaymath}
d \sim P_\theta, ~~~ \theta \in \Theta
\end{displaymath}
} % End size
\vspace{3mm}
\pause
\begin{itemize}
\item Both $d$ and $\theta$ could be vectors.
\item For example,
\begin{itemize}
\item $d = (\mathbf{y}_1, \ldots, \mathbf{y}_n)$ independent multivariate normal.
\item $\theta = (\boldsymbol{\mu,\Sigma})$.
\pause
\item $P_\theta$ is the joint distribution function of $\mathbf{y}_1, \ldots, \mathbf{y}_n$, with joint density
\end{itemize}
\end{itemize}
\begin{displaymath}
f(\mathbf{y}_1, \ldots, \mathbf{y}_n) = \prod_{i=1}^n f(\mathbf{y}_i;\boldsymbol{\mu,\Sigma})
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Estimation}
\framesubtitle{For the model $d \sim P_\theta, ~~~ \theta \in \Theta$}
\begin{itemize}
\item We don't know $\theta$.
\item We never know $\theta$.
\item All we can do is guess.
\pause
\item Estimate $\theta$ (or a function of $\theta$) based on the observable data.
\pause
\item $t$ is an \emph{estimator} of $\theta$ (or a function of $\theta$): $t=t(d)$.
\item $t$ is a \emph{statistic}, a random variable (vector) that can be computed from the data without knowing the values of any unknown parameters.
\pause
\end{itemize}
For example,
\begin{itemize}
\item $d = x_1, \ldots, x_n \stackrel{i.i.d.}{\sim} N(\mu,\sigma^2)$ ~~~~~~~~~~~~~~~~~~ $t = (\overline{x},S^2)$.
\pause % Stupid visual layout
\item For an ordinary multiple regression model, $t=(\widehat{\boldsymbol{\beta}},MSE)$.
\end{itemize}
\end{frame}

\section{MOM}

\begin{frame}
\frametitle{Parameter estimation}
\framesubtitle{For the model $d \sim P_\theta, ~~~ \theta \in \Theta$}
\begin{itemize}
\item Estimate $\theta$ with $t=t(d)$.
\item How do we get a recipe for $t$? Guess?
\pause
\item It's good to be systematic. Lots of methods are available.
\item We will consider two: the method of moments and maximum likelihood.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Moments}
\framesubtitle{Based on a random sample like $(x_1,y_1), \ldots, (x_n,y_n)$}
\pause
\begin{itemize}
\item Moments are quantities like $E\{x_i\}$, $E\{x_i^2\}$, $E\{x_iy_i\}$, $E\{W_ix_i^2y_i^3\}$, etc.
\pause
\item \emph{Central} moments are moments of \emph{centered} random variables:
\begin{itemize}
\item[] $E\{(x_i-\mu_x)^2\}$
\item[] $E\{(x_i-\mu_x)(y_i-\mu_y)\}$
\item[] $E\{(x_i-\mu_x)^2(y_i-\mu_y)^3(Z_i-\mu_z)^2\}$
\end{itemize}
\item These are all \emph{population} moments.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Population moments and sample moments}
% \framesubtitle{Assume $x_i$ and $y_i$ values can be observed.}
\begin{center}
\renewcommand{\arraystretch}{1.5}
\begin{tabular}{ll}
\hline
Population moment & Sample moment \\ \hline
$E\{x_i\}$ & $\frac{1}{n}\sum_{i=1}^n x_i$ \\
$E\{x_i^2\}$ & $\frac{1}{n}\sum_{i=1}^n x_i^2$ \\
$E\{x_iy_i\}$ & $\frac{1}{n}\sum_{i=1}^n x_iy_i$ \\
$E\{(x_i-\mu_x)^2\}$ & $\frac{1}{n}\sum_{i=1}^n (x_i-\overline{x}_n)^2$ \\
$E\{(x_i-\mu_x)(y_i-\mu_y)\}$ & $\frac{1}{n}\sum_{i=1}^n (x_i-\overline{x}_n)(y_i-\overline{y}_n)$ \\
$E\{(x_i-\mu_x)(y_i-\mu_y)^2\}$ & $\frac{1}{n}\sum_{i=1}^n (x_i-\overline{x}_n)(y_i-\overline{y}_n)^2$ \\
\end{tabular}
\renewcommand{\arraystretch}{1.0}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Estimation by the Method of Moments (MOM)}
\framesubtitle{For the model $d \sim P_\theta, ~~~ \theta \in \Theta$}
\begin{itemize}
\item Population moments are a function of $\theta$.
\item Find $\theta$ as a function of the population moments.
\item Estimate $\theta$ with that function of the \emph{sample} moments.
\pause
\end{itemize}
Symbolically,
\begin{itemize}
\item Let $m$ denote a vector of population moments.
\item $\widehat{m}$ is the corresponding vector of sample moments.
\item Find $m = g(\theta)$.
\item Solve for $\theta$, obtaining $\theta= g^{-1}(m)$.
\item Let $\widehat{\theta} = g^{-1}(\widehat{m})$.
\pause
\end{itemize}
It doesn't matter if you solve first or put hats on first.
\end{frame}

\begin{frame}
\frametitle{Example: $x_1, \ldots, x_n \stackrel{i.i.d.}{\sim} U(0,\theta)$}
\framesubtitle{$f(x) = \frac{1}{\theta}$ for $0<x<\theta$}
\pause
\begin{itemize}
\item There is one parameter, so one moment should do: $m = E(x_i) = \frac{\theta}{2}$.
\pause
\item Solving for the parameter, $\theta = 2m = g^{-1}(m)$.
\pause
\item Putting on the hats, $\widehat{\theta} = 2\widehat{m} = 2\overline{x}$.
\end{itemize}
\end{frame}

\section{MLE}

\begin{frame}
\frametitle{Maximum Likelihood Estimation}
\framesubtitle{For the model $d \sim P_\theta, ~~~ \theta \in \Theta$}
\begin{itemize}
\item The \emph{likelihood} $L(\theta)$ is the joint density (or probability) of the observed data, viewed as a function of the parameter.
\pause
\item The maximum likelihood estimate (MLE) is the value of $\theta \in \Theta$ that makes $L(\theta)$ as large as possible.
\pause
\item In practice, maximize the \emph{log} likelihood $\ell(\theta) = \ln L(\theta)$. The answer is the same, and the calculations are easier.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example: The Gamma Distribution}
\framesubtitle{For parameters $\alpha>0$ and $\beta>0$}
{\LARGE
\begin{eqnarray*}
f(x;\alpha,\beta) & = & \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x/\beta} x^{\alpha - 1} \\
 && \\
\Theta & = & \{(\alpha,\beta): \alpha>0, \beta>0 \}
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Log Likelihood}
\framesubtitle{$f(x;\alpha,\beta) = \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x/\beta} x^{\alpha - 1}$}
\begin{eqnarray*}
\ell(\alpha,\beta) &=& \ln \prod_{i=1}^n \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x_i/\beta} x_i^{\alpha - 1} \nonumber \\ \pause
&=& \ln \left( \beta^{-n\alpha} \, \Gamma(\alpha)^{-n} \exp(-\frac{1}{\beta}\sum_{i=1}^n x_i) \left(\prod_{i=1}^n x_i \right)^{\alpha-1} \right) \\ \pause
&=& -n\alpha\ln\beta -n\ln\Gamma(\alpha) - \frac{1}{\beta}\sum_{i=1}^n x_i + (\alpha - 1) \sum_{i=1}^n \ln x_i
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Differentiate with respect to the parameters}
\framesubtitle{$\ell(\theta) = -n\alpha\ln\beta -n\ln\Gamma(\alpha) - \frac{1}{\beta}\sum_{i=1}^n x_i + (\alpha - 1) \sum_{i=1}^n \ln x_i$}
\pause
\begin{eqnarray*}
\frac{\partial\ell}{\partial\beta} & \stackrel{set}{=} & 0~~ \pause \Rightarrow ~~ \alpha\beta = \overline{x} \\ \pause
 &&\\
\frac{\partial\ell}{\partial\alpha} & = & -n\ln\beta -n \frac{\partial}{\partial\alpha} \ln \Gamma(\alpha) + \sum_{i=1}^n \ln x_i \\ \pause
 & = & \sum_{i=1}^n \ln x_i -n\ln\beta - n\frac{\Gamma^\prime(\alpha)}{\Gamma(\alpha)} \pause ~~\stackrel{set}{=}~~0 \\
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Solve for $\alpha$}
%\framesubtitle{}
{\LARGE
\begin{displaymath}
\sum_{i=1}^n \ln x_i -n\ln\beta - n\frac{\Gamma^\prime(\alpha)}{\Gamma(\alpha)} = 0
\end{displaymath}
} % End size
\vspace{8mm}
where
\begin{displaymath}
\Gamma(\alpha) = \int_0^\infty e^{-t}t^{\alpha-1} \, dt.
\end{displaymath}
\pause
\vspace{8mm}
Nobody can do it.
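\end{frame}

\begin{frame}[fragile]
\frametitle{But numerically, it's routine}
\framesubtitle{A sketch added for illustration; the function name \texttt{mle.alpha} is made up, not part of the original example}
Substituting $\beta = \overline{x}/\alpha$ from the first equation leaves a single equation in $\alpha$,
\begin{displaymath}
\ln\alpha - \frac{\Gamma^\prime(\alpha)}{\Gamma(\alpha)} = \ln\overline{x} - \frac{1}{n}\sum_{i=1}^n \ln x_i,
\end{displaymath}
which one-dimensional root finding handles easily. Assuming the data are in a vector \texttt{x},
{\footnotesize
\begin{verbatim}
# The left side is strictly decreasing in alpha, so the root is unique.
# digamma(a) is the derivative of log(Gamma(a)); widen the search
# interval if the root is not bracketed.
mle.alpha = function(x)
    {
    rhs = log(mean(x)) - mean(log(x))
    g = function(a) log(a) - digamma(a) - rhs
    uniroot(g, interval = c(0.01, 100))$root
    } # End function mle.alpha
# alphahat = mle.alpha(x); betahat = mean(x)/alphahat
\end{verbatim}
} % End size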
\end{frame}

\begin{frame}
\frametitle{Maximize the likelihood numerically with software}
\framesubtitle{Usually this is in high dimension}
\begin{center}
\includegraphics[width=4in]{Likelihood}
\end{center}
\pause
\begin{itemize}
\item It's like trying to find the top of a mountain by walking uphill blindfolded.
\item You might stop at a local maximum.
\item The starting place is very important.
\item The final answer is a number (or vector of numbers).
\item There is no explicit formula for the MLE.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{There is a lot of useful theory}
\framesubtitle{Even without an explicit formula for the MLE}
\begin{center}
\includegraphics[width=4in]{Likelihood}
\end{center}
\pause
{\footnotesize
\begin{itemize}
\item MLE is asymptotically normal.
\pause
\item Variance of the MLE is deeply related to the curvature of the log likelihood at the MLE.
\item The more curvature, the smaller the variance.
\pause
\item The variance of the MLE can be estimated from the curvature (using the Fisher Information).
\item Basis of tests and confidence intervals.
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{Comparing MOM and MLE}
\pause
%\framesubtitle{}
\begin{itemize}
\item Sometimes they are identical, sometimes not.
\item If the model is right they are usually close for large samples.
\item Both are asymptotically normal, centered around the true parameter value(s).
\item Estimates of the variance are easy to obtain for both.
\item Small variance of an estimator is good.
\pause
\item As $n \rightarrow \infty$, nothing can beat the MLE.
\item Except that the MLE depends on a very specific distribution.
\item And sometimes the dependence matters.
\item In such cases, MOM may be preferable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Gamma Example}
%\framesubtitle{}
%{\LARGE
\begin{eqnarray*}
f(x;\alpha,\beta) & = & \frac{1}{\beta^\alpha \Gamma(\alpha)} e^{-x/\beta} x^{\alpha - 1} \\
\ell(\alpha,\beta) &=& -n\alpha\ln\beta -n\ln\Gamma(\alpha) - \frac{1}{\beta}\sum_{i=1}^n x_i + (\alpha - 1) \sum_{i=1}^n \ln x_i
\end{eqnarray*}
%} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{R function for the minus log likelihood}
%\framesubtitle{}
{\footnotesize % or scriptsize
\begin{columns} % Use more of the margins
\column{1.1\textwidth}
\begin{verbatim}
gmll = function(theta,datta)
    {
    aa = theta[1]; bb = theta[2]
    nn = length(datta); sumd = sum(datta)
    sumlogd = sum(log(datta))
    value = nn*aa*log(bb) + nn*lgamma(aa) + sumd/bb - (aa-1)*sumlogd
    return(value)
    } # End function gmll
\end{verbatim}
\end{columns}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Simulated Data}
\framesubtitle{$n=50$, True $\alpha=2$, $\beta=3$}
{\scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> d }
 [1] 20.87 13.74  5.13  2.76  4.73  2.66 11.74  0.75 22.07 10.49  7.26  5.82
[13] 13.08  1.79  4.57  1.40  1.13  6.84  3.21  0.38 11.24  1.72  4.69  1.96
[25]  7.87  8.49  5.31  3.40  5.24  1.64  7.17  9.60  6.97 10.87  5.23  5.53
[37] 15.80  6.40 11.25  4.91 12.05  5.44 12.62  1.81  2.70  3.03  4.09 12.29
[49]  3.23 10.94
\end{alltt}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Where should the numerical search start?}
%\framesubtitle{}
\begin{itemize}
\item How about Method of Moments estimates?
\item $E(x) = \alpha\beta$, $Var(x) = \alpha\beta^2$
\item Solve for $\alpha$ and $\beta$, replace population moments by sample moments and put a $\sim$ above the parameters.
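\item Dividing the two moments, $\beta = Var(x)/E(x)$; then $\alpha = E(x)/\beta = E(x)^2/Var(x)$. That gives the formulas below.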
\end{itemize}
\begin{displaymath}
\stackrel{\sim}{\alpha} = \frac{\overline{x}^2}{s^2} \mbox{~~~and~~~}
\stackrel{\sim}{\beta} =\frac{s^2}{\overline{x}}
\end{displaymath}
\pause
{\footnotesize % or scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> # MOM for starting values
> momalpha = mean(d)^2/var(d); momalpha }
[1] 1.899754
{\color{blue}> mombeta = var(d)/mean(d); mombeta }
[1] 3.620574
\end{alltt}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Numerical search using the \texttt{optim} function}
%\framesubtitle{}
{\scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> # Error message says: "Bounds can only be used with method
> # L-BFGS-B (or Brent)"
> gsearch = optim(par=c(momalpha,mombeta), fn = gmll,
+       method = "L-BFGS-B", lower = c(0,0), hessian=TRUE, datta=d) }
\end{alltt}
} % End size
\end{frame}

\begin{frame}[fragile] % alltt is usually better, but not this time
\frametitle{ }
%\framesubtitle{}
{\scriptsize
{\color{blue}
\begin{verbatim}
> gsearch
\end{verbatim}
} % End color
\begin{verbatim}
$par
[1] 1.805930 3.808674

$value
[1] 142.0316

$counts
function gradient 
       9        9 

$convergence
[1] 0

$message
[1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

$hessian
         [,1]      [,2]
[1,] 36.69402 13.127928
[2,] 13.12793  6.224773
\end{verbatim}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Meaning of the output}
%\framesubtitle{}
%\renewcommand{\arraystretch}{1.5}
\begin{tabular}{lc}
Output & Meaning \\ \hline
&\\
% \raisebox{.55in}{
\begin{minipage}{2in}
\begin{verbatim}
$par
[1] 1.805930 3.808674
\end{verbatim}
\end{minipage}
% } % End raisebox
& $\widehat{\boldsymbol{\theta}}$ % Maybe a strut
\\
&\\
\begin{minipage}{2in}
\begin{verbatim}
$value
[1] 142.0316
\end{verbatim}
\end{minipage}
% } % End raisebox
& $-\ell(\widehat{\boldsymbol{\theta}})$ \\
&\\
\begin{minipage}{2in}
\begin{verbatim}
$hessian
         [,1]      [,2]
[1,] 36.69402 13.127928
[2,] 13.12793  6.224773
\end{verbatim}
\end{minipage}
% } % End raisebox
& $\mathbf{H} = \left[\frac{\partial^2 (-\ell)}
{\partial\theta_i\partial\theta_j}\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}}$
\end{tabular}
%\renewcommand{\arraystretch}{1.0}
\end{frame}

\begin{frame}
\frametitle{Second Derivative test}
\framesubtitle{$\mathbf{H} = \left[\frac{\partial^2 (-\ell)}{\partial\theta_i\partial\theta_j}\right]$}
\begin{itemize}
\item If the second derivatives are continuous, $\mathbf{H}$ is symmetric.
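\item Since $\mathbf{H}$ is symmetric, it is positive definite if and only if all its eigenvalues are positive. That is what the \texttt{eigen} check on the next slide looks at.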
\item If the gradient is zero at a point and $|\mathbf{H}| \neq 0$,
\pause
\begin{itemize}
\item If $\mathbf{H}$ is positive definite, local minimum
\item If $\mathbf{H}$ is negative definite, local maximum
\item If $\mathbf{H}$ has both positive and negative eigenvalues, saddle point
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{MLE}
%\framesubtitle{}
{\footnotesize % or scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> thetahat = gsearch$par
> names(thetahat) = c("alpha-hat","beta-hat"); thetahat }
alpha-hat  beta-hat 
 1.805930  3.808674 
{\color{blue}> # Second derivative test
> eigen(gsearch$hessian)$values }
[1] 41.569998  1.348796
\end{alltt}
} % End size
\end{frame}

\section{Invariance}

\begin{frame}
\frametitle{The Invariance principle of maximum likelihood estimation}
%\framesubtitle{Also applies to Method of Moments estimation}
\pause
\begin{itemize}
\item The Invariance Principle of maximum likelihood estimation says that \emph{the MLE of a function is that function of the MLE.}\footnote{Provided the function is one-to-one.}
% \item An example comes first, followed by formal details.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example of the invariance principle}
% \framesubtitle{Of }
Let $d_1, \ldots, d_n$ be a random sample from a Bernoulli distribution (1=Yes, 0=No) with parameter $\theta, 0<\theta<1$.
\begin{itemize}
\item The parameter space is $\Theta = (0,1)$.
\item MLE is $\widehat{\theta} = \overline{d}$, the sample proportion.
\pause
\item Write the model in terms of the \emph{odds} of $d_i=1$, a re-parameterization that is often useful in categorical data analysis.
\item Denote the odds by $\theta^\prime = \frac{\theta}{1-\theta}$.
\pause
\item $ \theta^\prime = \frac{\theta}{1-\theta} \iff \theta = \frac{\theta^\prime}{1+\theta^\prime}$.
\item As $\theta$ ranges from zero to one, $\theta^\prime$ ranges from zero to infinity.
\item So there is a new parameter space: $\theta^\prime \in \Theta^\prime = (0,\infty)$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{MLE of the odds}
%\framesubtitle{}
\begin{itemize}
\item $ \theta^\prime = \frac{\theta}{1-\theta} \iff \theta = \frac{\theta^\prime}{1+\theta^\prime}$
\item[]
\item Because the re-parameterization is one-to-one, $\widehat{\theta}^\prime = \frac{\overline{d}}{1-\overline{d}}$ without any calculation.
\end{itemize}
\end{frame}

% There was a longer version in 2017
\begin{frame}
\frametitle{Theorem}
\framesubtitle{See text for a proof. The one-to-one part is critical.}
\pause
Let $g: \Theta \rightarrow \Theta^\prime$ be a one-to-one re-parameterization, and let $L^\prime$ denote the likelihood function in terms of $\theta^\prime = g(\theta)$. Suppose the maximum likelihood estimate $\widehat{\theta}$ satisfies $L(\widehat{\theta}) > L(\theta)$ for all $\theta \in \Theta$ with $\theta \neq \widehat{\theta}$. Then $L^\prime(g(\widehat{\theta})) > L^\prime(\theta^\prime)$ for all $\theta^\prime \in \Theta^\prime$ with $\theta^\prime \neq g(\widehat{\theta})$.
\pause
\vspace{3mm}
In other words
\begin{itemize}
\item The MLE of $g(\theta)$ is $g(\widehat{\theta})$.
\item $\widehat{g(\theta)} = g(\widehat{\theta})$.
\item The MLE of $\theta^\prime$ is $g(\widehat{\theta})$.
\item $\widehat{\theta}^\prime = g(\widehat{\theta})$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Re-parameterization in general}
%\framesubtitle{}
The parameters of common statistical models are written in a standard way, based on meaningfulness, convenience and tradition, but other equivalent parameterizations are sometimes useful.
\pause
\vspace{2mm}
Suppose $x_i \sim N(\mu,\sigma^2)$. Have
\begin{displaymath}
\widehat{\theta} = (\overline{x}, \frac{1}{n}\sum_{i=1}^n(x_i-\overline{x})^2)
\end{displaymath}
\pause
\begin{itemize}
\item Write $x_i \sim N(\mu,\sigma)$.
\begin{itemize}
\item $g(\theta) = (\theta_1,\sqrt{\theta_2})$
\item $\widehat{\theta}^\prime = \left(\overline{x}, \sqrt{\frac{1}{n}\sum_{i=1}^n(x_i-\overline{x})^2}\right)$
\pause
\end{itemize}
\item Write $x_i \sim N(\mu,\tau)$, where $\tau = 1/\sigma^2$ is called the \emph{precision}.
\pause
\begin{itemize}
\item $g(\theta) = (\theta_1,1/\theta_2)$
\item $\widehat{\theta}^\prime = \left(\overline{x}, \frac{n}{\sum_{i=1}^n(x_i-\overline{x})^2}\right)$
\end{itemize}
\end{itemize}
\end{frame}

\section{Consistency}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Consistency}
%\framesubtitle{}
\begin{itemize}
\item The idea is large-sample accuracy.
\item As $n \rightarrow \infty$, you get the truth.
\item It's a kind of limit, but with probability involved.
% \item It's the least you can ask.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The setting}
%\framesubtitle{}
\begin{itemize}
\item Let $t_1, t_2, \ldots$ be a sequence of random variables.
\pause
\item Main application: $t_n$ is an estimator of $\theta$ based on a sample of size $n$.
\item Think $t_n = \overline{x}_n = \frac{1}{n}\sum_{i=1}^n x_i$.
% \item Generalize to random vectors, soon.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Definition of Convergence in Probability}
We say that $t_n$ converges \emph{in probability} to the constant $\theta$, and write $t_n \stackrel{p}{\rightarrow} \theta$ if for all $\epsilon>0$,
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{|t_n-\theta|<\epsilon \}=1
\end{displaymath}
}
\pause
Convergence in probability to $\theta$ means no matter how small the interval around $\theta$, the probability distribution of $t_n$ becomes concentrated in that interval as $n \rightarrow \infty$.
\end{frame}

\begin{frame}
\frametitle{Picture it}
%\framesubtitle{}
{\small
\begin{eqnarray*}
P\{|t_n-\theta|<\epsilon\} \pause & = & P\{-\epsilon < t_n-\theta < \epsilon\} \\ \pause
& = & P\{\theta-\epsilon < t_n < \theta+\epsilon\} \pause
\end{eqnarray*}
} % End size
\begin{center}
\includegraphics[width=2.5in]{convergence1}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Picture it} % Superimpose the second layer, another density
%\framesubtitle{}
{\small
\begin{eqnarray*}
P\{|t_n-\theta|<\epsilon\} & = & P\{-\epsilon < t_n-\theta < \epsilon\} \\
& = & P\{\theta-\epsilon < t_n < \theta+\epsilon\}
\end{eqnarray*}
} % End size
\begin{center}
\includegraphics[width=2.5in]{convergence2}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The Law of Large Numbers}
\framesubtitle{We will use this a lot}
\pause
Let $x_1, x_2, \ldots$ be independent random variables from a distribution with expected value $\mu$ and variance $\sigma^2$.
The Law of Large Numbers says
{\huge
\begin{displaymath}
\overline{x}_n \stackrel{p}{\rightarrow} \mu
\end{displaymath}
}
\end{frame}

\begin{frame}
\frametitle{Roadmap}
%\framesubtitle{}
\begin{center}
\begin{tabular}{c}
Markov's Inequality \\
$\Downarrow$ \\
Variance Rule \\
$\Downarrow$ \\
Law of Large Numbers
\end{tabular}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Markov's Inequality}
%\framesubtitle{}
For $g(x) \geq 0$ and $a \geq 0$,
{\LARGE
\begin{displaymath}
E\{g(x)\} \geq a \, Pr\{g(x) \geq a \}
\end{displaymath}
\pause
} % End size
To prove, split up the integral.
\end{frame}

\begin{frame}
\frametitle{Variance Rule}
%\framesubtitle{}
\begin{itemize}
\item Let $t_1, t_2, \ldots$ be a sequence of random variables
\item With $E(t_n) = \mu_n$ and $Var(t_n) = \sigma^2_n$
\pause
\item If ${\displaystyle \lim_{n \rightarrow \infty} \mu_n = \theta}$ and ${\displaystyle \lim_{n \rightarrow \infty} \sigma^2_n = 0}$, then
\item[]
\end{itemize}
{\LARGE
\begin{displaymath}
t_n \stackrel{p}{\rightarrow} \theta
\end{displaymath}
\pause
} % End size
To prove, let $g(t_n) = (t_n-\theta)^2$ and $a=\epsilon^2$ in Markov's inequality.
\end{frame}

\begin{frame}
\frametitle{Proving the Law of Large Numbers}
%\framesubtitle{}
The Variance Rule says
\begin{itemize}
\item Let $t_1, t_2, \ldots$ be a sequence of random variables
\item With $E(t_n) = \mu_n$ and $Var(t_n) = \sigma^2_n$
\item If ${\displaystyle \lim_{n \rightarrow \infty} \mu_n = \theta}$ and ${\displaystyle \lim_{n \rightarrow \infty} \sigma^2_n = 0}$, then $t_n \stackrel{p}{\rightarrow} \theta$.
\end{itemize}
\pause
\vspace{5mm}
\begin{itemize}
\item Let $t_n = \overline{x}_n$ and $\theta=\mu$.
\pause
\item $E(\overline{x}_n) = \mu$ and $Var(\overline{x}_n) = \frac{\sigma^2}{n} \rightarrow 0$
\pause
\item Conclude
\end{itemize}
{\LARGE
\begin{displaymath}
\overline{x}_n \stackrel{p}{\rightarrow} \mu
\end{displaymath}
} % End size
\end{frame}

\begin{frame}
\frametitle{The Change of Variables formula: Let $y = g(x)$}
\pause
%\framesubtitle{}
{\LARGE
\begin{displaymath}
E(y) = \int_{-\infty}^\infty y \, f_{_y}(y) \, dy = \int_{-\infty}^\infty g(x) \, f_{_x}(x) \, dx
\end{displaymath}
}
\pause
Or, for discrete random variables
{\LARGE
\begin{displaymath}
E(y) = \sum_y y \, p_{_y}(y) = \sum_x g(x) \, p_{_x}(x)
\end{displaymath}
}
This is actually a big theorem, not a definition.
\end{frame}

\begin{frame}
\frametitle{Applying the change of variables formula}
\framesubtitle{To approximate $E[g(x)]$}
Have $x_1, \ldots, x_n$ from the distribution of $x$. Want $E(y)$, where $y=g(x)$.
\pause
{\LARGE
\begin{eqnarray*}
\frac{1}{n}\sum_{i=1}^n g(x_i) &=& \frac{1}{n}\sum_{i=1}^n y_i \stackrel{p}{\rightarrow} E(y) \\
\\
&=& E(g(x))
\end{eqnarray*}
}
\end{frame}

\begin{frame}
\frametitle{So for example}
%\framesubtitle{}
{\LARGE
\begin{eqnarray*}
\frac{1}{n}\sum_{i=1}^n x_i^k &\stackrel{p}{\rightarrow}& E(x^k) \\
&&\\ \pause
\frac{1}{n}\sum_{i=1}^n U_i^2 V_i W_i^3 &\stackrel{p}{\rightarrow}& E(U^2VW^3)
\end{eqnarray*}
}\pause
\vspace{2mm}
\begin{itemize}
\item That is, sample moments converge in probability to population moments.
\pause
\item Central sample moments converge to central population moments as well.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Population moments and sample moments}
\framesubtitle{Repeating an earlier slide}
\begin{center}
\renewcommand{\arraystretch}{1.5}
\begin{tabular}{ll}
\hline
Population moment & Sample moment \\ \hline
$E\{x_i\}$ & $\frac{1}{n}\sum_{i=1}^n x_i$ \\
$E\{x_i^2\}$ & $\frac{1}{n}\sum_{i=1}^n x_i^2$ \\
$E\{x_iy_i\}$ & $\frac{1}{n}\sum_{i=1}^n x_iy_i$ \\
$E\{(x_i-\mu_x)^2\}$ & $\frac{1}{n}\sum_{i=1}^n (x_i-\overline{x}_n)^2$ \\
$E\{(x_i-\mu_x)(y_i-\mu_y)\}$ & $\frac{1}{n}\sum_{i=1}^n (x_i-\overline{x}_n)(y_i-\overline{y}_n)$ \\
$E\{(x_i-\mu_x)(y_i-\mu_y)^2\}$ & $\frac{1}{n}\sum_{i=1}^n (x_i-\overline{x}_n)(y_i-\overline{y}_n)^2$ \\
\end{tabular}
\renewcommand{\arraystretch}{1.0}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Convergence in Probability for Random Vectors}
%\framesubtitle{}
Let $\mathbf{t}_1, \mathbf{t}_2, \ldots$ be a sequence of $k$-dimensional random vectors. We say that $\mathbf{t}_n$ converges in probability to $\boldsymbol{\theta} \in \mathbb{R}^k$, and write $\mathbf{t}_n \stackrel{p}{\rightarrow} \boldsymbol{\theta}$ if for all $\epsilon>0$,
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{||\mathbf{t}_n-\boldsymbol{\theta}||<\epsilon \}=1,
\end{displaymath}
}
\pause
where $||\mathbf{a}-\mathbf{b}||$ denotes Euclidean distance in $\mathbb{R}^k$.
\end{frame}

\begin{frame}
\frametitle{Two more Theorems}
%\framesubtitle{}
\begin{itemize}
\item The ``stack'' theorem and continuous mapping.
\item Often used together.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The ``Stack'' Theorem}
\framesubtitle{Because I don't know what to call it.}
Let $\mathbf{x}_n \stackrel{p}{\rightarrow} \mathbf{a}$ and $\mathbf{y}_n \stackrel{p}{\rightarrow} \mathbf{b}$. Then the partitioned random vector
%{\LARGE
\begin{displaymath}
\left( \begin{array}{cc} \mathbf{x}_n \\ \mathbf{y}_n \end{array} \right)
\stackrel{p}{\rightarrow}
\left( \begin{array}{cc} \mathbf{a} \\ \mathbf{b} \end{array} \right)
\end{displaymath}
%} % End size
\end{frame}

\begin{frame}
\frametitle{Continuous mapping}
\framesubtitle{One of the Slutsky lemmas: See Appendix A}
Let $\mathbf{t}_n \stackrel{p}{\rightarrow} \mathbf{c}$, and let the function $g(\mathbf{x})$ be continuous at $\mathbf{x}=\mathbf{c}$. Then
{\LARGE
\begin{displaymath}
g(\mathbf{t}_n) \stackrel{p}{\rightarrow} g(\mathbf{c})
\end{displaymath}
}% End size
\pause
\vspace{5mm}
Note that the function $g$ could be multidimensional, for example mapping $\mathbb{R}^5$ into $\mathbb{R}^2$.
\end{frame}

\begin{frame}
\frametitle{Definition of Consistency}
\pause
%\framesubtitle{}
The random vector (of statistics) $\mathbf{t}_n$ is said to be a \emph{consistent} estimator of the parameter vector $\boldsymbol{\theta}$ if
{\LARGE
\begin{displaymath}
\mathbf{t}_n \stackrel{p}{\rightarrow} \boldsymbol{\theta}
\end{displaymath}
\pause
} % End size
for \underline{all} $\boldsymbol{\theta} \in \Theta$.
\end{frame}

\begin{frame}
\frametitle{Consistency of the Sample Variance}
\framesubtitle{This answer gets full marks.}\pause
{\small
\begin{displaymath}
\widehat{\sigma}^2_n = \frac{1}{n}\sum_{i=1}^n (x_i-\overline{x})^2 \pause
= \frac{1}{n}\sum_{i=1}^n x_i^2 - \overline{x}^2
\end{displaymath}
\pause
\vspace{3mm}
By LLN, $\overline{x}_n \stackrel{p}{\rightarrow}\mu$ \pause and
$\frac{1}{n}\sum_{i=1}^n x_i^2 \stackrel{p}{\rightarrow} E(x_i^2) \pause = \sigma^2+\mu^2$.
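\pause
\vspace{3mm}
Here the continuous function is $g(a,b) = b - a^2$, applied to the stacked vector $\left(\overline{x}_n, \frac{1}{n}\sum_{i=1}^n x_i^2\right)$.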
\pause
\vspace{3mm}
By continuous mapping, \pause
\begin{displaymath}
\widehat{\sigma}^2_n = \frac{1}{n}\sum_{i=1}^n x_i^2 - \overline{x}^2 \pause
\stackrel{p}{\rightarrow} \pause \sigma^2+\mu^2 - \mu^2 = \sigma^2
\end{displaymath}
\pause
\vspace{3mm}
Note the silent use of the Stack Theorem.
} % End size
\end{frame}

\begin{frame}
\frametitle{Method of Moments Estimators are Consistent}
\framesubtitle{For most practical cases}
\pause
Recall
\begin{itemize}
\item Let $m$ denote a vector of population moments.
\item $\widehat{m}$ is the corresponding vector of sample moments.
\item Find $m = g(\theta)$.
\item Solve for $\theta$, obtaining $\theta= g^{-1}(m)$.
\item Let $\widehat{\theta}_n = g^{-1}(\widehat{m}_n)$.
\pause
\end{itemize}
\vspace{3mm}
If $g$ is continuous, so is $g^{-1}$. \pause Then by continuous mapping,
$\widehat{m}_n \stackrel{p}{\rightarrow} m \Rightarrow \widehat{\theta}_n = g^{-1}(\widehat{m}_n) \stackrel{p}{\rightarrow} g^{-1}(m) \pause = \theta$.
\end{frame}

\begin{frame}
\frametitle{Maximum Likelihood Estimators are Consistent}
\framesubtitle{If the model is correct, and given some additional ``regularity conditions.''}
{\Huge
\begin{displaymath}
\widehat{\boldsymbol{\theta}}_n \stackrel{p}{\rightarrow} \boldsymbol{\theta}
\end{displaymath}
} % End size
\end{frame}

\begin{frame}
\frametitle{Consistency is great but it's not enough.}
\begin{itemize}
\item It's the least we can ask. Estimators that are \emph{not} consistent are completely unacceptable for most purposes.
\pause
\item Think of $a_n = 1/n$ as a sequence of degenerate random variables with $P\{a_n = 1/n\}=1$.
\item So, $a_n \stackrel{p}{\rightarrow} 0$.
\pause
\end{itemize}
Suppose
\vspace{5mm}
{\LARGE
\begin{displaymath}
t_n \stackrel{p}{\rightarrow} \theta \pause \Rightarrow
U_n = t_n + \frac{100,000,000}{n} \pause \stackrel{p}{\rightarrow} \theta.
\end{displaymath}
}
\end{frame}

\section{Asymptotic Normality}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Convergence in Distribution}
\framesubtitle{Sometimes called \emph{Weak Convergence}, or \emph{Convergence in Law}}
Denote the cumulative distribution functions of $t_1, t_2, \ldots$ by $F_1(x), F_2(x), \ldots$ respectively, and denote the cumulative distribution function of $t$ by $F(x)$.
\pause
\vspace{5mm}
We say that $t_n$ converges \emph{in distribution} to $t$, and write $t_n \stackrel{d}{\rightarrow} t$ if for every point $x$ at which $F$ is continuous,
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} F_n(x) = F(x)
\end{displaymath}
}
We will seldom use this definition directly.
\end{frame}

\begin{frame}
\frametitle{Connections among the Modes of Convergence}
{\LARGE
\begin{itemize}
\item $ t_n \stackrel{p}{\rightarrow} t \Rightarrow t_n \stackrel{d}{\rightarrow} t $.
\vspace{5mm}
\item If $a$ is a constant, $ t_n \stackrel{d}{\rightarrow} a \Rightarrow t_n \stackrel{p}{\rightarrow} a$.
\end{itemize}
}
\end{frame}

\begin{frame}
\frametitle{Univariate Central Limit Theorem}
Let $x_1, \ldots, x_n$ be a random sample from a distribution with expected value $\mu$ and variance $\sigma^2$. Then
{\LARGE
\begin{displaymath}
z_n = \frac{\sqrt{n}(\overline{x}_n-\mu)}{\sigma} \stackrel{d}{\rightarrow} z \sim N(0,1)
\end{displaymath}
}
\end{frame}

\begin{frame}
\frametitle{Sometimes we say the distribution of the sample mean is approximately normal, or asymptotically normal.}
%\framesubtitle{}
\begin{itemize}
\item This is justified by the Central Limit Theorem.
\item But it does \emph{not} mean that $\overline{x}_n$ converges in distribution to a normal random variable.
\pause
\item The Law of Large Numbers says that $\overline{x}_n$ converges in probability to a constant, $\mu$.
\item So $\overline{x}_n$ converges to $\mu$ in distribution as well.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Why would we say that for large $n$, the sample mean is approximately $N(\mu,\frac{\sigma^2}{n})$?}
\pause
\vspace{5mm}
Have $z_n = \frac{\sqrt{n}(\overline{x}_n-\mu)}{\sigma} \pause \stackrel{d}{\rightarrow} z \sim N(0,1)$.
\pause
{\footnotesize
\begin{eqnarray*}
Pr\{\overline{x}_n \leq x\} \pause & = & Pr\left\{ \frac{\sqrt{n}(\overline{x}_n-\mu)}{\sigma} \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \\ \pause
& = & Pr\left\{ z_n \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \pause
\approx \Phi\left( \frac{\sqrt{n}(x-\mu)}{\sigma} \right)
\end{eqnarray*}
}
\pause
Suppose $y$ is \emph{exactly} $N(\mu,\frac{\sigma^2}{n})$: \pause
{\footnotesize
\begin{eqnarray*}
Pr\{y \leq x\} \pause & = & Pr\left\{ \frac{\sqrt{n}(y-\mu)}{\sigma} \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \\ \pause
& = & Pr\left\{ z \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \pause
= \Phi\left( \frac{\sqrt{n}(x-\mu)}{\sigma} \right)
\end{eqnarray*}
} % End size
because $\frac{\sqrt{n}(y-\mu)}{\sigma}$ is \emph{exactly} standard normal.
\end{frame}

\begin{frame}
\frametitle{Asymptotic Normality}
%\framesubtitle{}
\begin{itemize}
\item We say $\overline{x}_n$ is asymptotically normal, with asymptotic mean $\mu$ and asymptotic variance $\frac{\sigma^2}{n}$.
\item Write $\overline{x}_n \stackrel{\cdot}{\sim} N(\mu, \frac{\sigma^2}{n})$.
\item In tests and confidence intervals, $\frac{\widehat{\sigma}^2}{n}$ may be used in place of $\frac{\sigma^2}{n}$, where $\widehat{\sigma}^2$ is any consistent estimator of $\sigma^2$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Asymptotic \emph{Multivariate} Normality}
%\framesubtitle{}
\begin{itemize}
\item Multivariate central limit theorem
\item Central limit theorem for vectors of MLEs
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Multivariate central limit theorem}
%\framesubtitle{}
Let $\mathbf{x}_1, \ldots, \mathbf{x}_n$ be i.i.d.~$p$-dimensional random vectors with expected value vector $\boldsymbol{\mu}$ and covariance matrix $\boldsymbol{\Sigma}$. Then
{\LARGE
\begin{displaymath}
\sqrt{n}(\overline{\mathbf{x}}_n-\boldsymbol{\mu}) \stackrel{d}{\rightarrow} \mathbf{x} \sim N_p(\mathbf{0},\boldsymbol{\Sigma})
\end{displaymath}
} % End size
\pause
\begin{itemize}
\item Say $\overline{\mathbf{x}}_n$ is asymptotically $N_p(\boldsymbol{\mu},\frac{1}{n}\boldsymbol{\Sigma})$.
\item Write $\overline{\mathbf{x}}_n \stackrel{\cdot}{\sim} N_p(\boldsymbol{\mu},\frac{1}{n}\boldsymbol{\Sigma})$.
\item The asymptotic covariance matrix of $\overline{\mathbf{x}}_n$ is $\frac{1}{n}\boldsymbol{\Sigma}$.
\item $\boldsymbol{\Sigma}$ may be estimated by the sample variance-covariance matrix $\boldsymbol{\widehat{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{x}_i-\overline{\mathbf{x}}) (\mathbf{x}_i-\overline{\mathbf{x}})^\top $.
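\item Dividing by $n-1$ instead of $n$ in $\boldsymbol{\widehat{\Sigma}}$ would make no difference asymptotically, since $\frac{n}{n-1} \rightarrow 1$.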
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Central limit theorem for vectors of MLEs}
\framesubtitle{$\boldsymbol{\theta} = (\theta_1, \ldots, \theta_k)$}
If the model is correct and under some technical conditions that always hold for the models used in this class,
{\LARGE
\begin{displaymath}
\sqrt{n}(\widehat{\boldsymbol{\theta}}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{t} \sim N_k(\mathbf{0},\mathcal{I}(\boldsymbol{\theta})^{-1}),
\end{displaymath}
} % End size
\noindent where (for the record) $\mathcal{I}(\boldsymbol{\theta})$ is the Fisher information matrix.
\begin{displaymath}
\boldsymbol{\mathcal{I}}(\boldsymbol{\theta}) =
\left[E[-\frac{\partial^2}{\partial\theta_i\partial\theta_j} \log f(y;\boldsymbol{\theta})]\right]
\pause
\end{displaymath}
See Appendix A.
\end{frame}

\begin{frame}
\frametitle{Asymptotic Multivariate Normality of the MLEs}
%\framesubtitle{}
\begin{itemize}
\item Say $\widehat{\boldsymbol{\theta}}_n$ is asymptotically $N_k(\boldsymbol{\theta},\mathbf{V}_n)$, where $\mathbf{V}_n = \frac{1}{n}\mathcal{I}(\boldsymbol{\theta})^{-1}$.
\item Write $\widehat{\boldsymbol{\theta}}_n \stackrel{\cdot}{\sim} N_k(\boldsymbol{\theta},\mathbf{V}_n)$.
\pause
\item For tests and confidence intervals replace $\mathbf{V}_n$ by either
\begin{itemize}
\item $\widehat{\mathbf{V}}_n = \frac{1}{n}\mathcal{I}(\widehat{\boldsymbol{\theta}})^{-1}$, or
\item $\widehat{\mathbf{V}}_n =$ the inverse of the Hessian of the minus log likelihood, evaluated at the MLE.
\item For numerical MLEs, the second choice is usually more convenient.
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Back to the Gamma Example}
\framesubtitle{\texttt{gsearch = optim(par=c(momalpha,mombeta), fn = gmll, \\
\hspace{15mm} method = "L-BFGS-B", lower = c(0,0), hessian=TRUE, datta=d)}}
\begin{tabular}{lc}
Output & Meaning \\ \hline
&\\
\begin{minipage}{2in}
\begin{verbatim}
$par
[1] 1.805930 3.808674
\end{verbatim}
\end{minipage}
& $\widehat{\boldsymbol{\theta}} = (\widehat{\alpha}, \widehat{\beta})$ \\
&\\
\begin{minipage}{2in}
\begin{verbatim}
$value
[1] 142.0316
\end{verbatim}
\end{minipage}
& $-\ell(\widehat{\boldsymbol{\theta}})$ \\
&\\
\begin{minipage}{2in}
\begin{verbatim}
$hessian
         [,1]      [,2]
[1,] 36.69402 13.127928
[2,] 13.12793  6.224773
\end{verbatim}
\end{minipage}
& $\mathbf{H} = \left[\frac{\partial^2 (-\ell)}
{\partial\theta_i\partial\theta_j}\right]_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}}$
\end{tabular}
\end{frame}

% herehere

\begin{frame}[fragile]
\frametitle{ }
%\framesubtitle{}
{\footnotesize % or scriptsize
% The alltt environment requires \usepackage{alltt}
\begin{alltt}
{\color{blue}> # Asymptotic variance-covariance matrix is the inverse of the
> # Fisher Information
> Vhat_n = solve(gsearch$hessian); Vhat_n }
           [,1]       [,2]
[1,]  0.1110190 -0.2341369
[2,] -0.2341369  0.6544386
{\color{blue}> # Confidence interval for alpha (true value is 2)
> thetahat }
alpha-hat  beta-hat 
 1.805930  3.808674 
{\color{blue}> se_alphahat = sqrt(Vhat_n[1,1])
> lower95 = thetahat[1] - 1.96*se_alphahat
> upper95 = thetahat[1] + 1.96*se_alphahat
> c(lower95,upper95) }
alpha-hat alpha-hat 
 1.152868  2.458992 
\end{alltt}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto.
It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/brunner/oldclass/431s23} {\footnotesize \texttt{http://www.utstat.toronto.edu/brunner/oldclass/431s23}} \end{frame} \end{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{itemize} \item \item \item \end{itemize} \end{frame} % \stackrel{c}{\mathbf{X}} \stackrel{\top}{\vphantom{r}_i} # Picture of convergence in probability, using R # Layer 1 rm(list=ls()) x = seq(from=-3,to=3,by=0.01) sigma=1/2; raise=0.25; z=1.3 Density = dnorm(x,mean=0,sd=sigma)+raise plot(x,Density,axes=F,xlab='',ylab='',pch=' ') lines(x,dnorm(x)+raise,lty=2) lines(c(-3,3),c(raise,raise),lty=1) text(x=0,y=raise-0.015,expression(theta)) text(x=-z+.20,y=raise-0.015,expression(paste("(",theta-epsilon))) text(x= z-.20,y=raise-0.015,expression(paste(theta+epsilon,")"))) lines(c(-z,-z),c(raise,raise+dnorm(-z)),lty=2) lines(c(z,z),c(raise,raise+dnorm(z)),lty=2) text(x=0,y=dnorm(0,sd=sigma)+raise,".") # Guide for cropping # Save this picture as convergence1.pdf, and proceed ######################################################################### # Layer 2 lines(x,Density,lty=1) lines(c(-z,-z),c(raise,raise+dnorm(-z,sd=sigma)),lty=1) lines(c(z,z),c(raise,raise+dnorm(z,sd=sigma)),lty=1) # Save as convergence2.pdf ######################################################################### ######################################################################### ######################################################################### # Gamma MLE rm(list=ls()) # Simulate data set.seed(3201); n = 50; alpha=2; beta=3 d = round(rgamma(50,shape=alpha, scale=beta),2); d mean(d); alpha*beta # Compare var(d); alpha*beta^2 gmll = function(theta,datta) { aa = theta[1]; bb = theta[2] nn = length(datta); sumd = sum(datta) sumlogd = sum(log(datta)) value = nn*aa*log(bb) + nn*lgamma(aa) + sumd/bb - (aa-1)*sumlogd return(value) } # End function gmll # help(optim) # MOM for starting values momalpha = mean(d)^2/var(d); momalpha mombeta = var(d)/mean(d); mombeta # Error message says: "Bounds can only be used with method # L-BFGS-B (or Brent)" gsearch = optim(par=c(momalpha,mombeta), fn = gmll, method = "L-BFGS-B", lower = c(0,0), hessian=TRUE, datta=d) gsearch thetahat = gsearch$par names(thetahat) = c("alpha-hat","beta-hat"); thetahat # Second derivative test eigen(gsearch$hessian)$values ############## Later ############## # Had thetahat = gsearch$par thetahat # Asymptotic variance-covariance matrix is the inverse of the # Fisher Information Vhat_n = solve(gsearch$hessian); Vhat_n # Confidence interval for alpha (true value is 2) thetahat se_alphahat = sqrt(Vhat_n[1,1]) lower95 = thetahat[1] - 1.96*se_alphahat upper95 = thetahat[1] + 1.96*se_alphahat c(lower95,upper95) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%