% Large sample tools for Applied Stat I
% Notes and comments are after the end of the document
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
\usepackage{euscript} % for \EuScript
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout

\title{Large sample tools\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2018}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Background Reading: Davison's \emph{Statistical models}}
%\framesubtitle{}
\begin{itemize}
\item See Section 2.2 (Pages 28-37) on convergence.
\item Section 3.3 (Pages 77-90) goes more deeply into simulation than we will. At least skim it.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Foundations}

\begin{frame}
\frametitle{Sample Space $\Omega$, $\omega \in \Omega$}
% \framesubtitle{Usually infinite in theory}
\begin{itemize}
\item $\Omega$ is a set, the underlying sample space. \pause
\item It could literally be the universe of websites from which we intend to sample. \pause
\item $\EuScript{F}$ is a class of subsets of $\Omega$. \pause
\item It could be the class of all subsets (if $\Omega$ is countable). \pause
\item There is a probability measure $\EuScript{P}$ defined on the elements of $\EuScript{F}$. \pause
\item Maybe each website is equally likely to be chosen (with replacement).
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Random variables are functions from $\Omega$ into the set of real numbers}
\pause
{\LARGE
\begin{displaymath}
Pr\{X \in B\} = Pr(\{\omega \in \Omega: X(\omega) \in B \})
\end{displaymath}
}
\end{frame}

\begin{frame}
\frametitle{Random Sample $X_1(\omega), \ldots, X_n(\omega)$} \pause
%\framesubtitle{}
\begin{itemize}
\item $T = T(X_1, \ldots, X_n)$ \pause
\item $T = T_n(\omega)$ \pause
\item Let $n \rightarrow \infty$ to see what happens for large samples
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{{\LARGE Modes of Convergence}} \pause
%\framesubtitle{}
{\LARGE
\begin{itemize}
\item Almost Sure Convergence
\item Convergence in Probability
\item Convergence in Distribution
\end{itemize}
}
\end{frame}

\begin{frame}
\frametitle{Almost Sure Convergence}
We say that $T_n$ converges \emph{almost surely} to $T$, and write $T_n \stackrel{a.s.}{\rightarrow} T$ if \pause
\begin{displaymath}
Pr\{\omega:\, \lim_{n \rightarrow \infty} T_n(\omega) = T(\omega)\}=1.
\end{displaymath}
\pause
\begin{itemize}
\item Acts like an ordinary limit, except possibly on a set of probability zero. \pause
\item All the usual rules apply. \pause
\item Called convergence with probability one or sometimes strong convergence.
\pause
\item In this course, convergence will usually be to a constant.\pause
\end{itemize}
\begin{displaymath}
Pr\{\omega:\, \lim_{n \rightarrow \infty} T_n(\omega) = c \}=1.
\end{displaymath}
\end{frame}

\section{LLN}

\begin{frame}
\frametitle{Strong Law of Large Numbers}
%\framesubtitle{}
Let $X_1, \ldots, X_n$ be independent and identically distributed with expected value $\mu$. \pause
\vspace{10mm}
{\huge
\begin{displaymath}
\overline{X}_n \stackrel{a.s.}{\rightarrow} E(X_i) = \mu
\end{displaymath}
}
\pause
The only condition required for this to hold is the existence of the expected value.
\end{frame}

\begin{frame}
\frametitle{Probability is long run relative frequency}
\pause
\begin{itemize}
\item Statistical experiment: Probability of ``success" is $\theta$. \pause
\item Carry out the experiment many times independently. \pause
\item Code the results $X_i=1$ if success, $X_i=0$ for failure, $i = 1, 2, \ldots$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Sample proportion of successes converges to the probability of success}
\pause
% \framesubtitle{}
Recall $X_i=0$ or $1$. \pause
{\Large
\begin{eqnarray*}
E(X_i) &=& \sum_{x=0}^1 x \, Pr\{X_i = x\} \\ \pause
% &&\\
&=& 0\cdot (1-\theta) + 1\cdot \theta \\ \pause
% &&\\
&=& \theta
\end{eqnarray*}
}
\pause
Relative frequency is \pause
{\Large
\begin{displaymath}
\frac{1}{n}\sum_{i=1}^n X_i \pause = \overline{X}_n \pause \stackrel{a.s.}{\rightarrow} \theta
\end{displaymath}
}
\end{frame}

\begin{frame}
\frametitle{Simulation}
\framesubtitle{Using pseudo-random number generation by computer}
\pause
\begin{itemize}
\item Estimate almost any probability that's hard to figure out \pause
\item Statistical power \pause
\item Weather model \pause
\item Performance of statistical methods \pause
\item Need confidence intervals for estimated probabilities.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Estimating power by simulation}
\pause
Example: Bernoulli random sampling. \pause
\vspace{2mm}
Recall the two test statistics for testing $H_0: \theta=\theta_0$: \pause
\begin{itemize}
\item $Z_1 = \frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\theta_0(1-\theta_0)}}$
\item $Z_2 = \frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\overline{Y}(1-\overline{Y})}}$
\end{itemize}
\pause
When $\theta \neq \theta_0$, calculating $P\{|Z_2|>z_{\alpha/2} \}$ can be challenging.
\end{frame}

\begin{frame}
\frametitle{Strategy for estimating power by simulation}
\pause
%\framesubtitle{}
\begin{itemize}
\item Generate a large number of random data sets under the alternative hypothesis. \pause
\item For each data set, test $H_0$. \pause
\item Estimated power is the proportion of times $H_0$ is rejected. \pause
\item How accurate is the estimate?
\pause
\item $\widehat{p} ~ \pm ~ z_{\alpha/2}\sqrt{\frac{\widehat{p}(1-\widehat{p})}{m}}$
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Testing $H_0: \theta = 0.50$ when true $\theta=0.60$ and $n=100$}
\framesubtitle{Power of $Z_1$ was about 0.52}
\pause
\begin{displaymath}
Z_2 = \frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\overline{Y}(1-\overline{Y})}}
\end{displaymath}
\pause
{\footnotesize % or scriptsize
\begin{verbatim}
> # Power by simulation
> set.seed(9999)
> m = 10000 # Monte Carlo sample size
> theta=0.60; theta0 = 1/2; n = 100
> Ybar = rbinom(m,size=n,prob=theta)/n # A vector of length m
> Z2 = sqrt(n)*(Ybar-theta0)/sqrt(Ybar*(1-Ybar)) # Another vector of length m
> power = length(Z2[abs(Z2)>1.96])/m; power
[1] 0.5394
\end{verbatim}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Margin of error for estimated power}
% \framesubtitle{}
\pause
Confidence interval for an estimated probability was
\begin{displaymath}
\widehat{p} \pm z_{\alpha/2}\sqrt{\frac{\widehat{p}(1-\widehat{p})}{m}}
\end{displaymath}
\pause
{\footnotesize % or scriptsize
\begin{verbatim}
# How about a 99 percent margin of error
> a = 0.005; z = qnorm(1-a)
> merror = z * sqrt(power*(1-power)/m); merror
[1] 0.0128391
> Lower = power - merror; Lower
[1] 0.5265609
> Upper = power + merror; Upper
[1] 0.5522391
\end{verbatim}
} % End size
\end{frame}

\begin{frame}
\frametitle{Recall the Change of Variables formula: Let $Y = g(X)$}
\pause
%\framesubtitle{}
{\LARGE
\begin{displaymath}
E(Y) = \int_{-\infty}^\infty y \, f_{_Y}(y) \, dy \pause = \int_{-\infty}^\infty g(x) \, f_{_X}(x) \, dx
\end{displaymath}
}
\pause
Or, for discrete random variables \pause
{\LARGE
\begin{displaymath}
E(Y) = \sum_y y \, p_{_Y}(y) = \sum_x g(x) \, p_{_X}(x)
\end{displaymath}
}
\pause
This is actually a big theorem, not a definition.
\end{frame}

\begin{frame}
\frametitle{Applying the change of variables formula}
\framesubtitle{To approximate $E[g(X)]$}
\pause
Simulate $X_1, \ldots, X_n$ from the distribution of $X$. \pause Calculate
{\LARGE
\begin{eqnarray*}
\frac{1}{n}\sum_{i=1}^n g(X_i) \pause &=& \frac{1}{n}\sum_{i=1}^n Y_i \pause \stackrel{a.s.}{\rightarrow} E(Y) \\ \\ \pause
&=& E(g(X))
\end{eqnarray*}
}
\end{frame}

\begin{frame}
\frametitle{So for example}
%\framesubtitle{}
{\LARGE
\begin{eqnarray*}
\frac{1}{n}\sum_{i=1}^n X_i^k &\stackrel{a.s.}{\rightarrow}& E(X^k) \\
&&\\ \pause
\frac{1}{n}\sum_{i=1}^n U_i^2 V_i W_i^3 &\stackrel{a.s.}{\rightarrow}& E(U^2VW^3)
\end{eqnarray*}
}
\pause
\vspace{5mm}
That is, sample moments converge almost surely to population moments.
\end{frame}

\begin{frame}
\frametitle{Approximate an integral: $\int_{-\infty}^{\infty} h(x) \, dx$}
\framesubtitle{Where $h(x)$ is a nasty function.}
\pause
Let $f(x)$ be a density with $f(x)>0$ wherever $h(x)\neq 0$. \pause
\begin{eqnarray*}
\int_{-\infty}^{\infty} h(x) \, dx & = & \int_{-\infty}^{\infty} \frac{h(x)}{f(x)} f(x) \, dx \\ \pause
& = & E\left[ \frac{h(X)}{f(X)}\right] \\ \pause
& = & E[g(X)],
\end{eqnarray*}
\pause
So
\begin{itemize}
\item Sample $X_1, \ldots, X_n$ from the distribution with density $f(x)$ \pause
\item Calculate $Y_i = g(X_i) = \frac{h(X_i)}{f(X_i)}$ for $i=1, \ldots, n$ \pause
\item Calculate $\overline{Y}_n \stackrel{a.s.}{\rightarrow} E[Y]= E[g(X)]$ \pause
\item Confidence interval for $\mu = E[Y]$ is routine (see the sketch on the next slide).
\end{itemize}
\end{frame}
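\begin{frame}[fragile]
\frametitle{Illustration: approximating an integral by simulation}
\framesubtitle{A sketch added to these notes; not one of the original examples}
As a small illustration of the recipe on the previous slide, take $h(x) = e^{-x^4}$, so the exact answer is $2\,\Gamma(5/4) \approx 1.81$, and let $f(x)$ be the standard normal density. The code is only a sketch; the variable names are arbitrary.
{\footnotesize % or scriptsize
\begin{verbatim}
# Monte Carlo approximation of an integral (illustrative sketch)
set.seed(9999)
n = 100000                            # Monte Carlo sample size
x = rnorm(n)                          # Sample from f = standard normal density
y = exp(-x^4)/dnorm(x)                # y_i = h(x_i)/f(x_i)
estimate = mean(y); estimate          # Ybar estimates the integral
merror = qnorm(0.995)*sd(y)/sqrt(n)   # 99 percent margin of error
c(estimate - merror, estimate + merror)
2*gamma(5/4)                          # Exact value, for comparison
\end{verbatim}
} % End size
\end{frame}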
\begin{frame}
\frametitle{Convergence in Probability}
We say that $T_n$ converges \emph{in probability} to $T$, \pause and write $T_n \stackrel{P}{\rightarrow} T$ \pause if for all $\epsilon>0$, \pause
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{\omega: |T_n(\omega)-T(\omega)| < \epsilon \}=1
\end{displaymath}
}
\pause
For us, convergence will usually be to a constant:
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{|T_n-c|<\epsilon \}=1
\end{displaymath}
}
\pause
Convergence in probability (say to $c$) means no matter how small the interval around $c$, for large enough $n$ (that is, for all $n$ greater than some $N$) the probability of getting that close to $c$ is as close to one as you like. \pause
We will seldom use the definition in this class.
\end{frame}

\begin{frame}
\frametitle{Weak Law of Large Numbers}
{\huge
\begin{displaymath}
\overline{X}_n \stackrel{p}{\rightarrow} \mu
\end{displaymath}
}
\pause
\begin{itemize}
\item Almost Sure Convergence implies Convergence in Probability
\item Strong Law of Large Numbers implies Weak Law of Large Numbers
\end{itemize}
\end{frame}

\section{Consistency}

\begin{frame}
\frametitle{Consistency}
\framesubtitle{$T = T(X_1, \ldots, X_n)$ is a statistic estimating a parameter $\theta$}
\pause
The statistic $T_n$ is said to be \emph{consistent} for $\theta$ if $T_n \stackrel{P}{\rightarrow} \theta$ \pause for all $\theta$ in the parameter space. \pause
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{|T_n-\theta|<\epsilon \}=1
\end{displaymath}
}
\pause
\vspace{5mm}
The statistic $T_n$ is said to be \emph{strongly consistent} for $\theta$ if $T_n \stackrel{a.s.}{\rightarrow} \theta$. \pause
\vspace{5mm}
Strong consistency implies ordinary consistency.
\end{frame}

\begin{frame}
\frametitle{Consistency is great but it's not enough.}
\pause
\begin{itemize}
\item It means that as the sample size becomes indefinitely large, you probably get as close as you like to the truth. \pause
\item It's the least we can ask. Estimators that are not consistent are completely unacceptable for most purposes.
\end{itemize}
\pause
{\LARGE
\begin{displaymath}
T_n \stackrel{a.s.}{\rightarrow} \theta \pause \Rightarrow U_n = T_n + \frac{100,000,000}{n} \pause \stackrel{a.s.}{\rightarrow} \theta
\end{displaymath}
}
\end{frame}

\begin{frame}
\frametitle{Consistency of the Sample Variance}
\pause
%{\LARGE
\begin{eqnarray*}
\widehat{\sigma}^2_n &=& \frac{1}{n}\sum_{i=1}^n (X_i-\overline{X})^2 \\ \\ \pause
&=& \frac{1}{n}\sum_{i=1}^n X_i^2 - \overline{X}^2
\end{eqnarray*}
%}
\pause
\vspace{5mm}
By SLLN, $\overline{X}_n \stackrel{a.s.}{\rightarrow}\mu$ and $\frac{1}{n}\sum_{i=1}^n X_i^2 \stackrel{a.s.}{\rightarrow} E(X^2) = \sigma^2+\mu^2$. \pause
\vspace{5mm}
Because the function $g(x,y)=x-y^2$ is continuous, \pause
\vspace{5mm}
\begin{displaymath}
\widehat{\sigma}^2_n = g\left(\frac{1}{n}\sum_{i=1}^n X_i^2,\overline{X}_n\right) \pause \stackrel{a.s.}{\rightarrow} g(\sigma^2+\mu^2,\mu) \pause = \sigma^2+\mu^2 - \mu^2 = \pause \sigma^2
\end{displaymath}
\end{frame}

\section{CLT}

\begin{frame}
\frametitle{Convergence in Distribution}
\framesubtitle{Sometimes called \emph{Weak Convergence}, or \emph{Convergence in Law}}
\pause
Denote the cumulative distribution functions of $T_1, T_2, \ldots$ by $F_1(t), F_2(t), \ldots$ respectively, and denote the cumulative distribution function of $T$ by $F(t)$.
\pause
\vspace{5mm}
We say that $T_n$ converges \emph{in distribution} to $T$, and write $T_n \stackrel{d}{\rightarrow} T$ if \pause for every point $t$ at which $F$ is continuous, \pause
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} F_n(t) = F(t)
\end{displaymath}
}
\pause
Again, we will seldom use this definition directly.
\end{frame}

\begin{frame}
\frametitle{Univariate Central Limit Theorem}
Let $X_1, \ldots, X_n$ be a random sample from a distribution with expected value $\mu$ and variance $\sigma^2$. Then \pause
{\LARGE
\begin{displaymath}
Z_n = \frac{\sqrt{n}(\overline{X}_n-\mu)}{\sigma} \stackrel{d}{\rightarrow} Z \sim N(0,1)
\end{displaymath}
}
\end{frame}

\begin{frame}
\frametitle{Connections among the Modes of Convergence}
\pause
{\LARGE
\begin{itemize}
\item $ T_n \stackrel{a.s.}{\rightarrow} T \pause \Rightarrow \pause T_n \stackrel{p}{\rightarrow} T \Rightarrow \pause T_n \stackrel{d}{\rightarrow} T $. \pause
\vspace{5mm}
\item If $a$ is a constant, \pause $ T_n \stackrel{d}{\rightarrow} a \Rightarrow T_n \stackrel{p}{\rightarrow} a$.
\end{itemize}
}
\end{frame}

\begin{frame}
\frametitle{Sometimes we say the distribution of the sample mean is approximately normal, or asymptotically normal.}
\pause
%\framesubtitle{}
\begin{itemize}
\item This is justified by the Central Limit Theorem. \pause
\item But it does \emph{not} mean that $\overline{X}_n$ converges in distribution to a normal random variable. \pause
\item The Law of Large Numbers says that $\overline{X}_n$ converges almost surely (and in probability) to a constant, $\mu$. \pause
\item So $\overline{X}_n$ converges to $\mu$ in distribution as well.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Why would we say that for large $n$, the sample mean is approximately $N(\mu,\frac{\sigma^2}{n})$?}
\pause
\vspace{5mm}
Have $Z_n = \frac{\sqrt{n}(\overline{X}_n-\mu)}{\sigma} \pause \stackrel{d}{\rightarrow} Z \sim N(0,1)$. \pause
{\footnotesize
\begin{eqnarray*}
Pr\{\overline{X}_n \leq x\} \pause & = & Pr\left\{ \frac{\sqrt{n}(\overline{X}_n-\mu)}{\sigma} \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \\ \pause
& = & Pr\left\{ Z_n \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \pause \approx \Phi\left( \frac{\sqrt{n}(x-\mu)}{\sigma} \right)
\end{eqnarray*}
}
\pause
Suppose $Y$ is \emph{exactly} $N(\mu,\frac{\sigma^2}{n})$: \pause
{\footnotesize
\begin{eqnarray*}
Pr\{Y \leq x\} \pause & = & Pr\left\{ \frac{\sqrt{n}(Y-\mu)}{\sigma} \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \\ \pause
& = & Pr\left\{ Z \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \pause = \Phi\left( \frac{\sqrt{n}(x-\mu)}{\sigma} \right)
\end{eqnarray*}
} % End size
\end{frame}

\section{Convergence of random vectors}

\begin{frame}[allowframebreaks] % Continue frame onto several slides. Pause does not seem to work.
\frametitle{Convergence of random vectors}
{\footnotesize
\begin{enumerate}
\item Definitions (All quantities in boldface are vectors in $\mathbb{R}^m$ unless otherwise stated)
\begin{enumerate}
\item[$\star$] $ \mathbf{T}_n \stackrel{a.s.}{\rightarrow} \mathbf{T}$ means $P\{\omega:\, \lim_{n \rightarrow \infty} \mathbf{T}_n(\omega) = \mathbf{T}(\omega)\}=1$.
\item[$\star$] $ \mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T}$ means $\forall \epsilon>0,\,\lim_{n \rightarrow \infty} P\{||\mathbf{T}_n-\mathbf{T}||<\epsilon \}=1$.
\item[$\star$] $ \mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T}$ means for every continuity point $\mathbf{t}$ of $F_\mathbf{T}$, $\lim_{n \rightarrow \infty}F_{\mathbf{T}_n}(\mathbf{t}) = F_\mathbf{T}(\mathbf{t})$.
\end{enumerate} \item $ \mathbf{T}_n \stackrel{a.s.}{\rightarrow} \mathbf{T} \Rightarrow \mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T} \Rightarrow \mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T} $. \item If $\mathbf{a}$ is a vector of constants, $ \mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{a} \Rightarrow \mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{a}$. \item Strong Law of Large Numbers (SLLN): Let $\mathbf{X}_1, \ldots \mathbf{X}_n$ be independent and identically distributed random vectors with finite first moment, and let $\mathbf{X}$ be a general random vector from the same distribution. Then $ \overline{\mathbf{X}}_n \stackrel{a.s.}{\rightarrow} E(\mathbf{X})$. \item Central Limit Theorem: Let $\mathbf{X}_1, \ldots, \mathbf{X}_n$ be i.i.d. random vectors with expected value vector $\boldsymbol{\mu}$ and covariance matrix $\boldsymbol{\Sigma}$. Then $\sqrt{n}(\overline{\mathbf{X}}_n-\boldsymbol{\mu})$ converges in distribution to a multivariate normal with mean \textbf{0} and covariance matrix $\boldsymbol{\Sigma}$. \framebreak \item \label{slutd} Slutsky Theorems for Convergence in Distribution: \begin{enumerate} \item \label{slutcond} If $\mathbf{T}_n \in \mathbb{R}^m$, $\mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T}$ and if $f:\,\mathbb{R}^m \rightarrow \mathbb{R}^q$ (where $q \leq m$) is continuous except possibly on a set $C$ with $P(\mathbf{T} \in C)=0$, then $f(\mathbf{T}_n) \stackrel{d}{\rightarrow} f(\mathbf{T})$. \item \label{slutdiffd} If $\mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T}$ and $(\mathbf{T}_n - \mathbf{Y}_n) \stackrel{P}{\rightarrow} 0$, then $\mathbf{Y}_n \stackrel{d}{\rightarrow} \mathbf{T}$. \item \label{slutstackd} If $\mathbf{T}_n \in \mathbb{R}^d$, $\mathbf{Y}_n \in \mathbb{R}^k$, $\mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T}$ and $\mathbf{Y}_n \stackrel{P}{\rightarrow} \mathbf{c}$, then \begin{displaymath} \left( \begin{array}{cc} \mathbf{T}_n \\ \mathbf{Y}_n \end{array} \right) \stackrel{d}{\rightarrow} \left( \begin{array}{cc} \mathbf{T} \\ \mathbf{c} \end{array} \right) \end{displaymath} \end{enumerate} \framebreak \item \label{slutp} Slutsky Theorems for Convergence in Probability: \begin{enumerate} \item \label{slutconp} If $\mathbf{T}_n \in \mathbb{R}^m$, $\mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T}$ and if $f:\,\mathbb{R}^m \rightarrow \mathbb{R}^q$ (where $q \leq m$) is continuous except possibly on a set $C$ with $P(\mathbf{T} \in C)=0$, then $f(\mathbf{T}_n) \stackrel{P}{\rightarrow} f(\mathbf{T})$. \item \label{slutdiffp} If $\mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T}$ and $(\mathbf{T}_n - \mathbf{Y}_n) \stackrel{P}{\rightarrow} 0$, then $\mathbf{Y}_n \stackrel{P}{\rightarrow} \mathbf{T}$. \item \label{slutstackp} If $\mathbf{T}_n \in \mathbb{R}^d$, $\mathbf{Y}_n \in \mathbb{R}^k$, $\mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T}$ and $\mathbf{Y}_n \stackrel{P}{\rightarrow} \mathbf{Y}$, then \begin{displaymath} \left( \begin{array}{cc} \mathbf{T}_n \\ \mathbf{Y}_n \end{array} \right) \stackrel{P}{\rightarrow} \left( \begin{array}{cc} \mathbf{T} \\ \mathbf{Y} \end{array} \right) \end{displaymath} \end{enumerate} \framebreak \item \label{delta} Delta Method (Theorem of Cram\'{e}r, Ferguson p. 45): Let $g: \mathbb{R}^d \rightarrow \mathbb{R}^k$ be such that the elements of \.{g}$(\mathbf{x}) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$ are continuous in a neighborhood of $\boldsymbol{\theta} \in \mathbb{R}^d$. 
If $\mathbf{T}_n$ is a sequence of $d$-dimensional random vectors such that $\sqrt{n}(\mathbf{T}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{T}$, then $\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mbox{\.{g}} (\boldsymbol{\theta}) \mathbf{T}$. In particular, if $\sqrt{n}(\mathbf{T}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{T} \sim N(\mathbf{0},\mathbf{\Sigma})$, then $\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N(\mathbf{0}, \mbox{\.{g}}(\boldsymbol{\theta})\mathbf{\Sigma}\mbox{\.{g}}(\boldsymbol{\theta})^\prime)$.
\end{enumerate}
}
\end{frame}

\begin{frame}
\frametitle{An application of the Slutsky Theorems}
\begin{itemize}
\item Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim}\,?(\mu,\sigma^2)$ \pause
\item By CLT, $Y_n = \sqrt{n}(\overline{X}_n-\mu) \stackrel{d}{\rightarrow} Y \sim N(0,\sigma^2)$ \pause
\item Let $\widehat{\sigma}_n$ be \emph{any} consistent estimator of $\sigma$. \pause
\item Then by \ref{slutd}.\ref{slutstackd}, $\mathbf{T}_n = \left( \begin{array}{cc} Y_n \\ \widehat{\sigma}_n \end{array} \right) \stackrel{d}{\rightarrow} \left( \begin{array}{cc} Y \\ \sigma \end{array} \right) = \mathbf{T} $ \pause
\item The function $f(x,y)=x/y$ is continuous except if $y=0$ \\ so by \ref{slutd}.\ref{slutcond}, \pause
\end{itemize}
\begin{displaymath}
f(\mathbf{T}_n) = \frac{\sqrt{n}(\overline{X}_n-\mu)}{\widehat{\sigma}_n} \pause \stackrel{d}{\rightarrow} f(\mathbf{T}) \pause = \frac{Y}{\sigma} \pause \sim N(0,1)
\end{displaymath}
\end{frame}

\section{Delta Method}

\begin{frame}
\frametitle{Univariate delta method}
In the multivariate Delta Method~\ref{delta}, the matrix $\mbox{\.{g}}(\boldsymbol{\theta})$ is a Jacobian. \pause
The univariate version of the delta method says that \pause if $\sqrt{n}\left( T_n- \theta \right) \stackrel{d}{\rightarrow} T$ \pause and $g^{\prime\prime}(x)$ is continuous in a neighbourhood of $\theta$, then \pause
\begin{displaymath}
\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} g^\prime(\theta) \, T.
\end{displaymath}
\vspace{5mm}
\pause
When using the Central Limit Theorem, \emph{especially} if there is a $\theta \neq \mu$ in the model, it's safer to write \pause
\begin{displaymath}
\sqrt{n}\left( g(\overline{X}_n)- g(\mu) \right) \stackrel{d}{\rightarrow} g^\prime(\mu) \, T,
\end{displaymath}
\pause
and then substitute for $\mu$ in terms of $\theta$.
\end{frame}
% UV delta method comes from Taylor's Theorem -- slide?

\begin{frame}
\frametitle{Example: Geometric distribution}
%\framesubtitle{}
Let $X_1, \ldots, X_n$ be a random sample from a distribution, with probability mass function $p(x|\theta) = \theta (1-\theta)^{x-1}$ for $x = 1, 2, \ldots$, where $0<\theta<1$.
\vspace{3mm}
\pause
So, $E(X_i)= \frac{1}{\theta}$ and $Var(X_i)= \frac{1-\theta}{\theta^2}$.
\vspace{3mm}
\pause
The maximum likelihood estimator of $\theta$ is $\widehat{\theta} = \frac{1}{\overline{X}_n}$. Using the Central Limit Theorem and the delta method, find the approximate large-sample distribution of $\widehat{\theta}$.
\end{frame}

\begin{frame}
\frametitle{Solution: Geometric distribution}
\framesubtitle{$\mu=\frac{1}{\theta}$ and $\sigma^2 = \frac{1-\theta}{\theta^2}$}
\begin{itemize}
\item[] CLT says $\sqrt{n}\left( \overline{X}_n- \mu \right) \stackrel{d}{\rightarrow} T \sim N(0,\frac{1-\theta}{\theta^2})$ \pause
\item[] Delta method says $\sqrt{n}\left( g(\overline{X}_n)- g(\mu) \right) \stackrel{d}{\rightarrow} g^\prime(\mu) \, T$.
\pause
\item[] $g(x) = \frac{1}{x} = x^{-1}$ \pause
\item[] $g^\prime (x) = -x^{-2}$ \pause
\item[] So, \pause
\end{itemize}
\begin{eqnarray*}
\sqrt{n}\left( g(\overline{X}_n)- g(\mu)\right) & = & \pause \sqrt{n}\left( \frac{1}{\overline{X}_n} - \frac{1}{\mu}\right) \\ \pause
& = & \sqrt{n}\left( \widehat{\theta} - \theta\right) \\ \pause
& \stackrel{d}{\rightarrow} & g^\prime(\mu) \, T = -\frac{1}{\mu^2} \, T \\ \pause
& = & -\theta^2 \, T \pause \sim N\left(0, \theta^4 \cdot\frac{1-\theta}{\theta^2} \right)
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Asymptotic distribution of $\widehat{\theta} = \frac{1}{\overline{X}_n}$}
\framesubtitle{Approximate large-sample distribution}
\pause
Have $Y_n = \sqrt{n}\left( \widehat{\theta} - \theta\right) \stackrel{\cdot}{\sim} N(0,\theta^2(1-\theta))$. \pause
\vspace{5mm}
\begin{itemize}
\item[] So $\frac{Y_n}{\sqrt{n}} = \left( \widehat{\theta} - \theta\right) \stackrel{\cdot}{\sim} N\left(0,\frac{\theta^2(1-\theta)}{n}\right)$ \pause
\item[] And $\frac{Y_n}{\sqrt{n}} + \theta \pause = \widehat{\theta} \pause \stackrel{\cdot}{\sim} N\left(\theta,\frac{\theta^2(1-\theta)}{n}\right)$ \pause
\end{itemize}
\vspace{5mm}
We'll say that $\widehat{\theta} = \frac{1}{\overline{X}_n}$ is approximately (or asymptotically) $N\left(\theta,\frac{\theta^2(1-\theta)}{n}\right)$.
\end{frame}

\begin{frame}
\frametitle{Another example of $\sqrt{n}\left( g(\overline{X}_n)- g(\mu) \right) \stackrel{d}{\rightarrow} g^\prime(\mu) \, T$}
\framesubtitle{Don't lose your head}
\pause
\begin{itemize}
\item[] Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim}\,?(\mu,\sigma^2)$ \pause
\item[] CLT says $\sqrt{n}(\overline{X}_n-\mu) \stackrel{d}{\rightarrow} T \sim N(0,\sigma^2)$ \pause
\item[] Let $g(x)=x^2$ \pause
\item[] Delta method says $\sqrt{n}\left( g(\overline{X}_n)- g(\mu) \right) \stackrel{d}{\rightarrow} g^\prime(\mu) \, T$. \pause
\item[] So $\sqrt{n}\left( \overline{X}_n^2- \mu^2 \right) \stackrel{d}{\rightarrow} 2\mu \, T \sim N(0,4\mu^2\sigma^2)$ \pause
\item[] Really? What if $\mu=0$? \pause
\item[]
\item[] If $\mu=0$ then $\sqrt{n}\left( \overline{X}_n^2- \mu^2 \right) = \sqrt{n} \, \overline{X}_n^2$ \pause $\stackrel{d}{\rightarrow} 2 \mu T$ \pause $=0$ \pause $\Rightarrow \sqrt{n} \, \overline{X}_n^2 \stackrel{p}{\rightarrow} 0$. \pause
\item[] Already know from continuous mapping that $\overline{X}_n^2 \stackrel{p}{\rightarrow} \mu^2 = 0$. \pause
\item[] Delta method reveals \emph{faster convergence}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Also \ldots}
\pause
% \framesubtitle{}
Have $\sqrt{n} \, \overline{X}_n^2 \stackrel{p}{\rightarrow} 0$. \pause
If we add another $\sqrt{n}$ \pause and if (say) $\sigma^2=1$ as well as $\mu=0$,
\vspace{7mm}
\pause
{\Large
\begin{displaymath}
n \overline{X}_n^2 = \left( \sqrt{n}(\overline{X}_n-\mu) \right)^2 \pause \stackrel{d}{\rightarrow} Z^2 \pause \sim \chi^2(1)
\end{displaymath}
} % End size
\vspace{7mm}
\pause
If $\sigma^2 \neq 1$, the target is Gamma($\alpha= \frac{1}{2}, \, \beta = 2\sigma^2$).
\end{frame}

\begin{frame}
\frametitle{The delta method comes from Taylor's Theorem}
\pause
\textbf{Taylor's Theorem}: Let the $n$th derivative $f^{(n)}$ be continuous in $[a,b]$ and differentiable in $(a,b)$, with $x$ and $x_0$ in $(a,b)$.
Then there exists a point $\xi$ between $x$ and $x_0$ such that \pause
\begin{eqnarray*}
f(x) & = & f(x_0) \;+\; f^\prime(x_0)\,(x-x_0) \;+\; \frac{f^{\prime\prime}(x_0)(x-x_0)^2}{2!} \;+\; \ldots \\
& + & \frac{f^{(n)}(x_0)(x-x_0)^n}{n!} \;+\; \frac{f^{(n+1)}(\xi)(x-x_0)^{n+1}}{(n+1)!}
\end{eqnarray*}
\pause
where $R_n = \frac{f^{(n+1)}(\xi)(x-x_0)^{n+1}}{(n+1)!}$ is called the \emph{remainder term}. \pause
If $R_n \rightarrow 0$ as $n \rightarrow \infty$, \pause the resulting infinite series is called the \emph{Taylor Series} for $f(x)$.
\end{frame}

\begin{frame}
\frametitle{Taylor's Theorem with two terms plus remainder}
\framesubtitle{Very common in applications}
\pause
Let $g(x)$ be a function for which $g^{\prime\prime}(x)$ is continuous in an open interval containing $x=\theta$. \pause Then
{\Large
\begin{displaymath}
g(x) = \pause g(\theta) + g^\prime(\theta)(x-\theta) + \frac{g^{\prime\prime}(\theta^*)(x-\theta)^2}{2!}
\end{displaymath}
\pause
} % End size
where $\theta^*$ is between $x$ and $\theta$.
\end{frame}

\begin{frame}
\frametitle{Delta method}
\framesubtitle{Using $g(x) = g(\theta) + g^\prime(\theta)(x-\theta) + \frac{1}{2}g^{\prime\prime}(\theta^*)(x-\theta)^2$}
\pause
Let $\sqrt{n}(T_n-\theta) \stackrel{d}{\rightarrow} T$ \pause so that $T_n \stackrel{p}{\rightarrow} \theta$. \pause
{\footnotesize
\begin{eqnarray*}
\sqrt{n}\left({\color{red}g(T_n)}-g(\theta)\right) \pause & = & \sqrt{n}\left( {\color{red} g(\theta) + g^\prime(\theta)(T_n-\theta) + \frac{1}{2}g^{\prime\prime}(\theta_n^*)(T_n-\theta)^2} -g(\theta)\right) \\ \pause
& = & \sqrt{n}\left(g^\prime(\theta)(T_n-\theta) + \frac{1}{2}g^{\prime\prime}(\theta_n^*)(T_n-\theta)^2 \right) \\ \pause
& = & g^\prime(\theta) \, \sqrt{n}(T_n-\theta) \\
&& ~~~+ \pause \frac{1}{2}g^{\prime\prime}(\theta_n^*) \cdot \sqrt{n}(T_n-\theta) \cdot (T_n-\theta) \\ \pause
&\stackrel{d}{\rightarrow}& \pause g^\prime(\theta) T + 0
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{A variance-stabilizing transformation} % Earlier years have an intense confidence interval.
\framesubtitle{An application of the delta method}
\pause
\begin{itemize}
\item Because the Poisson process is such a good model, count data often have approximate Poisson distributions. \pause
\item Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim}$ Poisson$(\lambda)$ \pause
\item $E(X_i)=Var(X_i)=\lambda$ \pause
\item $Z_n = \frac{\sqrt{n}(\overline{X}_n-\lambda)}{\sqrt{\overline{X}_n}} \stackrel{d}{\rightarrow} Z \sim N(0,1)$ \pause
\item Could say $\overline{X}_n \stackrel{\cdot}{\sim} N(\lambda,\lambda/n)$ \pause and $\sum_{i=1}^n X_i \stackrel{\cdot}{\sim} N(n\lambda,n\lambda)$. \pause
\item Because the sum of independent Poissons is Poisson, \pause this means Poisson-distributed variables with large $\lambda$ are approximately normal. \pause
\item For analysis with normal linear models, approximate normality is good. \pause Variance that depends on $E(Y_i)$ is not good. \pause
\item Can we fix it?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Variance-stabilizing transformation continued}
%\framesubtitle{}
\begin{itemize}
\item CLT says $\sqrt{n}(\overline{X}_n-\lambda) \stackrel{d}{\rightarrow} T \sim N(0,\lambda)$. \pause
\vspace{10mm}
\item Delta method says \pause $\sqrt{n}\left( g(\overline{X}_n)- g(\lambda) \right) \stackrel{d}{\rightarrow} g^\prime(\lambda) \, T \pause = Y \pause \sim N\left(0,g^\prime(\lambda)^2 \, \lambda\right)$ \pause
\vspace{10mm}
\item If $g^\prime(\lambda) = \frac{1}{\sqrt{\lambda}}$, then $Y \sim N(0,1)$ (simulation sketch on the next slide).
\end{itemize}
\end{frame}
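\begin{frame}[fragile]
\frametitle{Checking the variance problem (and the fix) by simulation}
\framesubtitle{An illustrative sketch added to these notes; the transformation $g$ is derived on the next two slides}
The standard deviation of $\overline{X}_n$ grows with $\lambda$, while $g(\overline{X}_n) = 2\sqrt{\overline{X}_n}$ has standard deviation roughly $1/\sqrt{n}$ for every $\lambda$. The code is only a sketch; variable names are arbitrary.
{\footnotesize % or scriptsize
\begin{verbatim}
# Illustrative sketch: Poisson(lambda) samples of size n = 100
set.seed(9999)
m = 10000; n = 100                # Monte Carlo size, sample size
for (lambda in c(1, 4, 9))
    {
    # Sum of n independent Poisson(lambda) variables is Poisson(n*lambda)
    xbar = rpois(m, n*lambda)/n   # m simulated values of Xbar_n
    cat("lambda =", lambda,
        " sd(xbar) =", sd(xbar),                       # Roughly sqrt(lambda/n)
        " sd(2*sqrt(xbar)) =", sd(2*sqrt(xbar)), "\n")  # Roughly 1/sqrt(n)
    }
\end{verbatim}
} % End size
\end{frame}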
\begin{frame}
\frametitle{An elementary differential equation: $g^\prime(x) = \frac{1}{\sqrt{x}}$}
\framesubtitle{Solve by separation of variables}
\pause
{\LARGE
\begin{eqnarray*}
& & \frac{dg}{dx} = x^{-1/2} \\ \pause
& \Rightarrow & dg = x^{-1/2} \, dx\\ \pause
& \Rightarrow & \int dg = \int x^{-1/2} \, dx\\ \pause
& \Rightarrow & g(x) = \frac{x^{1/2}}{1/2} + c = 2 x^{1/2} + c
\end{eqnarray*}
}
\end{frame}

\begin{frame}
\frametitle{We have found}
\pause
\begin{eqnarray*}
\sqrt{n}\left( g(\overline{X}_n)- g(\lambda) \right) & = & \sqrt{n}\left( 2\overline{X}_n^{1/2}- 2\lambda^{1/2} \right) \\ \pause
& \stackrel{d}{\rightarrow} & Z \sim N(0,1)
\end{eqnarray*}
\pause
So,
\begin{itemize}
\item We could say that $\sqrt{\overline{X}_n}$ is asymptotically normal, \pause with (asymptotic) mean $\sqrt{\lambda}$ and (asymptotic) variance $\frac{1}{4n}$. \pause
\item This calculation could justify a square root transformation for count data. \pause
\item Note that the transformation is increasing, so if $Y_i$ is number of visitors to a website, $\sqrt{Y_i}$ could still be called ``popularity."
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The arcsin-square root transformation}
\framesubtitle{For proportions}
\pause
Sometimes, variable values consist of proportions, one for each case. \pause
\begin{itemize}
\item For example, cases could be high schools. \pause
\item The variable of interest is the proportion of students who enroll in university the year after graduation. \pause
\item This is an example of \emph{aggregated data}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The advice you sometimes get}
\framesubtitle{Still}
\pause
When a proportion is the response variable in a regression, use the \emph{arcsin square root} transformation. \pause
\vspace{5mm}
That is, if the proportions are $P_1, \ldots, P_n$, let \pause
\begin{displaymath}
Y_i = \sin^{-1}(\sqrt{P_i})
\end{displaymath}
\pause
and use the $Y_i$ values in your regression. \pause
\vspace{5mm}
\begin{center}{\huge \textbf{Why?}}
\end{center}
\pause
\vspace{5mm}
It's a variance-stabilizing transformation (details omitted).
\end{frame}

\begin{frame}
\frametitle{That was fun, but it was all univariate.}
\pause
Because
\begin{itemize}
\item The multivariate CLT establishes convergence to a multivariate normal, \pause and
\item Vectors of MLEs are approximately multivariate normal for large samples, \pause and
\item The multivariate delta method can yield the asymptotic distribution of useful functions of the MLE vector,
\end{itemize}
\vspace{15mm}
\pause
We need to look at random vectors and the multivariate normal distribution.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf18} {\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf18}} \end{frame} \end{document} \sin^{-1}\left(\sqrt{\overline{X}_m}\right) \sin^{-1}\left(\sqrt{\theta}\right) \begin{displaymath} \sqrt{m}\left( g(\overline{X}_m)- g(\theta) \right) \stackrel{d}{\rightarrow} Y \sim N\left(0,(g^\prime(\theta))^2\theta(1-\theta)\right). \end{displaymath} } CLT says $\sqrt{n}(\overline{X}_n-\lambda) \stackrel{d}{\rightarrow} T \sim N(0,\lambda)$. $\sqrt{n}\left( g(\overline{X}_n)- g(\lambda) \right) = \sqrt{n}\left( 2\sqrt{\overline{X}_n}- 2\sqrt{\lambda} \right) \stackrel{d}{\rightarrow} Z \sim N(0,1)$ \Pr\{-z < 2\sqrt{n}\left(\sqrt{\overline{X}_n}-\sqrt{\lambda}\right) < z \} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{itemize} \item \item \item \end{itemize} \end{frame} {\LARGE \begin{displaymath} \end{displaymath} } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Power by simulation set.seed(9999) m = 10000 # Monte Carlo sample size theta=0.60; theta0 = 1/2; n = 100 Ybar = rbinom(m,size=n,prob=theta)/n # A vector of length m Z2 = sqrt(n)*(Ybar-theta0)/sqrt(Ybar*(1-Ybar)) # Another vector of length m power = length(Z2[abs(Z2>1.96)])/m; power # How about a 99 percent margin of error a = 0.005; z = qnorm(1-a) merror = z * sqrt(power*(1-power)/m); merror Lower = power - merror; Lower Upper = power + merror; Upper %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Notes and Comments: The 2012 version has multinomial. I cut this out to save time. Replaces the "hard elementary problem" with a power by simulation. There is much more detail on variance stabilizing transformations in earlier years. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%