% Large sample tools for grad SEM
% Notes and comments are after the end of the document
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
% \usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
\usepackage{euscript} % for \EuScript
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\usepackage{comment}
\setbeamertemplate{footline}[frame number]
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}} % Uncomment for a gray background in handout mode

\title{Large sample tools\footnote{See last slide for copyright information.}}
\subtitle{STA2053 Fall 2022}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Foundations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Sample Space $\Omega$, $\omega \in \Omega$}
% \framesubtitle{Usually infinite in theory}
\begin{itemize}
\item $\Omega$ is a set, the underlying sample space.
% \item It could literally be the universe of websites from which we intend to sample.
\pause
\item $\EuScript{F}$ is a class of subsets of $\Omega$.
% \item It could be the class of all subsets (if $\Omega$ is countable).
\pause
\item There is a probability measure $\EuScript{P}$ defined on the elements of $\EuScript{F}$.
% \item Maybe each website is equally likely to be chosen (with replacement).
\end{itemize}
\pause
\vspace{5mm}
\begin{center}
{\Large Probability space $(\Omega,\EuScript{F},\EuScript{P})$}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Random variables are functions from $\Omega$ into the set of real numbers}
\pause
{\LARGE
\begin{displaymath}
Pr\{X \in B\} = Pr(\{\omega \in \Omega: X(\omega) \in B \})
\end{displaymath}
}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Random Sample $X_1(\omega), \ldots, X_n(\omega)$}
\pause
\begin{itemize}
\item $T = T(X_1, \ldots, X_n)$ \pause
\item $T = T_n(\omega)$ \pause
\item Let $n \rightarrow \infty$ to see what happens for large samples. (A small numerical illustration is on the next slide.)
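\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Watching $T_n$ as $n \rightarrow \infty$}
\framesubtitle{A minimal sketch in Python; the distribution, sample sizes and seed are illustrative}
Here $T_n = \overline{X}_n$ with exponential data, so $T_n$ should settle down near $\mu = 1$:
{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(seed=9999)
x = rng.exponential(scale=1.0, size=1_000_000)  # mu = 1
for n in (10, 100, 10_000, 1_000_000):
    print(n, x[:n].mean())  # T_n = sample mean of the first n observations
\end{verbatim}
} % End size
\end{frame}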
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{{\LARGE Modes of Convergence}}
\pause
{\LARGE
\begin{itemize}
\item Almost Sure Convergence
\item Convergence in Probability
\item Convergence in Distribution
\end{itemize}
}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Almost Sure Convergence}
We say that $T_n$ converges \emph{almost surely} to $T$, and write $T_n \stackrel{a.s.}{\rightarrow} T$, if \pause
\begin{displaymath}
Pr\{\omega:\, \lim_{n \rightarrow \infty} T_n(\omega) = T(\omega)\}=1.
\end{displaymath}
\pause
\begin{itemize}
\item Acts like an ordinary limit, except possibly on a set of probability zero.
\item All the usual rules apply. \pause
\item Called convergence with probability one, or sometimes strong convergence. \pause
\item In this course, convergence will usually be to a constant:
\end{itemize}
\begin{displaymath}
Pr\{\omega:\, \lim_{n \rightarrow \infty} T_n(\omega) = c \}=1.
\end{displaymath}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{LLN}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Strong Law of Large Numbers}
Let $X_1, \ldots, X_n$ be independent and identically distributed with expected value $\mu$. \pause
\vspace{10mm}
{\huge
\begin{displaymath}
\overline{X}_n \stackrel{a.s.}{\rightarrow} E(X_i) = \mu
\end{displaymath}
}
\pause
The only condition required for this to hold is the existence of the expected value.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Probability is long run relative frequency}
\pause
\begin{itemize}
\item Statistical experiment: Probability of ``success'' is $\theta$.
\item Carry out the experiment many times independently.
\item Code the results $X_i=1$ if success, $X_i=0$ for failure, $i = 1, 2, \ldots$
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Sample proportion of successes converges to the probability of success}
\framesubtitle{Recall $X_i=0$ or $1$.}
\pause
{\Large
\begin{eqnarray*}
E(X_i) &=& \sum_{x=0}^1 x \, Pr\{X_i = x\} \\
&=& 0\cdot (1-\theta) + 1\cdot \theta \\
&=& \theta
\end{eqnarray*}
}
\pause
Relative frequency is
{\Large
\begin{displaymath}
\frac{1}{n}\sum_{i=1}^n X_i \pause = \overline{X}_n \stackrel{a.s.}{\rightarrow} \theta
\end{displaymath}
}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Simulation}
\framesubtitle{Using pseudo-random number generation by computer}
\pause
\begin{itemize}
\item Estimate almost any probability that is hard to work out analytically. \pause
\item Statistical power \pause
\item Weather models \pause
\item Performance of statistical methods
\item[] \pause
\item Tests or confidence intervals for estimated probabilities. (A sketch is on the next slide.)
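\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Simulation sketch: estimating a probability}
\framesubtitle{A minimal sketch in Python; the event, sample size and seed are illustrative}
For example, $P(|X|+|Y| > 2)$ for independent standard normals is awkward by hand, but the sample proportion converges to it almost surely:
{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(seed=9999)
n = 100_000
x = rng.standard_normal(n)
y = rng.standard_normal(n)
hits = np.abs(x) + np.abs(y) > 2               # vector of True/False
p_hat = hits.mean()                            # relative frequency -> p (SLLN)
se = np.sqrt(p_hat * (1 - p_hat) / n)          # standard error of a proportion
print(p_hat, p_hat - 1.96*se, p_hat + 1.96*se) # estimate and routine 95% CI
\end{verbatim}
} % End size
\end{frame}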
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}
\begin{center}
{\Large Back to the Law of Large Numbers}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Recall the Change of Variables formula: Let $Y = g(X)$}
\pause
{\LARGE
\begin{displaymath}
E(Y) = \int_{-\infty}^\infty y \, f_{_Y}(y) \, dy \pause = \int_{-\infty}^\infty g(x) \, f_{_X}(x) \, dx
\end{displaymath}
}
\pause
Or, for discrete random variables,
{\LARGE
\begin{displaymath}
E(Y) = \sum_y y \, p_{_Y}(y) = \sum_x g(x) \, p_{_X}(x)
\end{displaymath}
}
\pause
This is actually a big theorem, not a definition.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Applying the change of variables formula}
\framesubtitle{To approximate $E[g(X)]$}
\pause
Simulate $X_1, \ldots, X_n$ from the distribution of $X$. \pause Calculate
{\LARGE
\begin{eqnarray*}
\frac{1}{n}\sum_{i=1}^n g(X_i) \pause &=& \frac{1}{n}\sum_{i=1}^n Y_i \pause \stackrel{a.s.}{\rightarrow} E(Y) \\ \\ \pause
&=& E(g(X))
\end{eqnarray*}
}
\end{frame}

\begin{frame}
\frametitle{So for example}
{\LARGE
\begin{eqnarray*}
\frac{1}{n}\sum_{i=1}^n X_i^k &\stackrel{a.s.}{\rightarrow}& E(X^k) \\ &&\\ \pause
\frac{1}{n}\sum_{i=1}^n U_i^2 V_i W_i^3 &\stackrel{a.s.}{\rightarrow}& E(U^2VW^3)
\end{eqnarray*}
}
\pause
\vspace{5mm}
That is, sample moments converge almost surely to population moments.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Approximate an integral: $\int_{-\infty}^{\infty} h(x) \, dx$}
\framesubtitle{Where $h(x)$ is a nasty function.}
\pause
Let $f(x)$ be a density with $f(x)>0$ wherever $h(x)\neq 0$. \pause
\begin{eqnarray*}
\int_{-\infty}^{\infty} h(x) \, dx \pause & = & \int_{-\infty}^{\infty} \frac{h(x)}{f(x)} f(x) \, dx \\ \pause
& = & E\left[ \frac{h(X)}{f(X)}\right] \\ \pause
& = & E[g(X)]
\end{eqnarray*}
\pause
So
\begin{itemize}
\item Sample $X_1, \ldots, X_n$ from the distribution with density $f(x)$. \pause
\item Calculate $Y_i = g(X_i) = \frac{h(X_i)}{f(X_i)}$ for $i=1, \ldots, n$. \pause
\item Then $\overline{Y}_n \stackrel{a.s.}{\rightarrow} E[Y]= E[g(X)]$. \pause
\item A confidence interval for $\mu = E[Y]$ is routine. (A sketch appears after the next slide.)
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Convergence in Probability}
We say that $T_n$ converges \emph{in probability} to $T$, and write $T_n \stackrel{P}{\rightarrow} T$, if for all $\epsilon>0$, \pause
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{\omega: |T_n(\omega)-T(\omega)| < \epsilon \}=1
\end{displaymath}
}
\pause
For us, convergence will usually be to a constant:
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{|T_n-c|<\epsilon \}=1
\end{displaymath}
}
\pause
Convergence in probability to $c$ means that no matter how small the interval around $c$, the probability that $T_n$ lands in that interval can be made as close to one as you like by taking $n$ large enough. \pause
We will seldom use the definition in this class.
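\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Sketch: Monte Carlo integration}
\framesubtitle{A minimal Python sketch of the integral trick; the choices of $h$, $f$, $n$ and the seed are illustrative}
Take $h(x) = e^{-x^2}\cos x$ and let $f$ be the standard normal density. The exact answer, $\sqrt{\pi}\, e^{-1/4} \approx 1.3803$, is available for comparison.
{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(seed=9999)
n = 100_000
x = rng.standard_normal(n)                   # X ~ f = N(0,1)
h = np.exp(-x**2) * np.cos(x)
f = np.exp(-x**2 / 2) / np.sqrt(2 * np.pi)   # standard normal density
y = h / f                                    # Y_i = h(X_i)/f(X_i)
est = y.mean()                               # -> E[Y] = the integral (SLLN)
se = y.std(ddof=1) / np.sqrt(n)
print(est, est - 1.96 * se, est + 1.96 * se) # estimate and 95% CI
\end{verbatim}
} % End size
\end{frame}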
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Weak Law of Large Numbers}
Again let $X_1, \ldots, X_n$ be independent and identically distributed with expected value $\mu$. Then
{\huge
\begin{displaymath}
\overline{X}_n \stackrel{p}{\rightarrow} \mu
\end{displaymath}
}
\pause
\begin{itemize}
\item Almost Sure Convergence implies Convergence in Probability.
\item So the Strong Law of Large Numbers implies the Weak Law of Large Numbers.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Consistency}

\begin{frame}
\frametitle{Consistency}
\framesubtitle{$T = T(X_1, \ldots, X_n)$ is a statistic estimating a parameter $\theta$}
\pause
The statistic $T_n$ is said to be \emph{consistent} for $\theta$ if $T_n \stackrel{P}{\rightarrow} \theta$ \pause for all $\theta$ in the parameter space.
\pause
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} P\{|T_n-\theta|<\epsilon \}=1
\end{displaymath}
}
\pause
\vspace{5mm}
The statistic $T_n$ is said to be \emph{strongly consistent} for $\theta$ if $T_n \stackrel{a.s.}{\rightarrow} \theta$.
\pause
\vspace{5mm}
Strong consistency implies ordinary consistency.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Consistency is great but it's not enough.}
\pause
{\LARGE
\begin{displaymath}
T_n \stackrel{a.s.}{\rightarrow} \theta \pause \Rightarrow U_n = T_n + \frac{100{,}000{,}000}{n} \pause \stackrel{a.s.}{\rightarrow} \theta
\end{displaymath}
}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Consistency of the Sample Variance}
\pause
\begin{eqnarray*}
\widehat{\sigma}^2_n &=& \frac{1}{n}\sum_{i=1}^n (X_i-\overline{X}_n)^2 \\ \\ \pause
&=& \frac{1}{n}\sum_{i=1}^n X_i^2 - \overline{X}_n^2
\end{eqnarray*}
\pause
\vspace{5mm}
By the SLLN, $\overline{X}_n \stackrel{a.s.}{\rightarrow}\mu$ and $\frac{1}{n}\sum_{i=1}^n X_i^2 \stackrel{a.s.}{\rightarrow} E(X^2) = \sigma^2+\mu^2$.
\pause
\vspace{5mm}
Because the function $g(x,y)=x-y^2$ is continuous,
\pause
\vspace{5mm}
\begin{displaymath}
\widehat{\sigma}^2_n = g\left(\frac{1}{n}\sum_{i=1}^n X_i^2,\overline{X}_n\right) \pause \stackrel{a.s.}{\rightarrow} g(\sigma^2+\mu^2,\mu) \pause = \sigma^2+\mu^2 - \mu^2 = \pause \sigma^2
\end{displaymath}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{CLT}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Convergence in Distribution}
\framesubtitle{Sometimes called \emph{Weak Convergence}, or \emph{Convergence in Law}}
\pause
Denote the cumulative distribution functions of $T_1, T_2, \ldots$ by $F_1(t), F_2(t), \ldots$ respectively, and denote the cumulative distribution function of $T$ by $F(t)$.
\pause
\vspace{5mm}
We say that $T_n$ converges \emph{in distribution} to $T$, and write $T_n \stackrel{d}{\rightarrow} T$, if \pause for every point $t$ at which $F$ is continuous, \pause
{\LARGE
\begin{displaymath}
\lim_{n \rightarrow \infty} F_n(t) = F(t)
\end{displaymath}
}
\pause
Again, we will seldom use this definition directly.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Univariate Central Limit Theorem}
Let $X_1, \ldots, X_n$ be a random sample from a distribution with expected value $\mu$ and variance $\sigma^2$.
Then \pause
{\LARGE
\begin{displaymath}
Z_n = \frac{\sqrt{n}(\overline{X}_n-\mu)}{\sigma} \stackrel{d}{\rightarrow} Z \sim N(0,1)
\end{displaymath}
}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Connections among the Modes of Convergence}
\pause
{\LARGE
\begin{itemize}
\item $ T_n \stackrel{a.s.}{\rightarrow} T \Rightarrow T_n \stackrel{p}{\rightarrow} T \Rightarrow T_n \stackrel{d}{\rightarrow} T $. \pause
\vspace{5mm}
\item If $a$ is a constant, $ T_n \stackrel{d}{\rightarrow} a \Rightarrow T_n \stackrel{p}{\rightarrow} a$.
\end{itemize}
}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Sometimes we say the distribution of the sample mean is approximately normal, or asymptotically normal.}
\pause
\begin{itemize}
\item This is justified by the Central Limit Theorem. \pause
\item But it does \emph{not} mean that $\overline{X}_n$ converges in distribution to a normal random variable. \pause
\item The Law of Large Numbers says that $\overline{X}_n$ converges almost surely (and in probability) to a constant, $\mu$. \pause
\item So $\overline{X}_n$ converges to $\mu$ in distribution as well.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Why would we say that for large $n$, the sample mean is approximately $N(\mu,\frac{\sigma^2}{n})$?}
\pause
\vspace{5mm}
Have $Z_n = \frac{\sqrt{n}(\overline{X}_n-\mu)}{\sigma} \pause \stackrel{d}{\rightarrow} Z \sim N(0,1)$. \pause
{\footnotesize
\begin{eqnarray*}
Pr\{\overline{X}_n \leq x\} \pause & = & Pr\left\{ \frac{\sqrt{n}(\overline{X}_n-\mu)}{\sigma} \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \\ \pause
& = & Pr\left\{ Z_n \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \pause \approx \Phi\left( \frac{\sqrt{n}(x-\mu)}{\sigma} \right)
\end{eqnarray*}
} \pause
Suppose $Y$ is \emph{exactly} $N(\mu,\frac{\sigma^2}{n})$, so that $\frac{\sqrt{n}(Y-\mu)}{\sigma} = Z \sim N(0,1)$ exactly: \pause
{\footnotesize
\begin{eqnarray*}
Pr\{Y \leq x\} \pause & = & Pr\left\{ \frac{\sqrt{n}(Y-\mu)}{\sigma} \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \\ \pause
& = & Pr\left\{ Z \leq \frac{\sqrt{n}(x-\mu)}{\sigma}\right\} \pause = \Phi\left( \frac{\sqrt{n}(x-\mu)}{\sigma} \right)
\end{eqnarray*}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Convergence of random vectors}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[allowframebreaks] % Continue frame onto several slides. Pause does not seem to work.
\frametitle{Convergence of random vectors}
{\footnotesize
\begin{enumerate}
\item Definitions (All quantities in boldface are vectors in $\mathbb{R}^m$ unless otherwise stated.)
\begin{enumerate}
\item[$\star$] $ \mathbf{T}_n \stackrel{a.s.}{\rightarrow} \mathbf{T}$ means $P\{\omega:\, \lim_{n \rightarrow \infty} \mathbf{T}_n(\omega) = \mathbf{T}(\omega)\}=1$.
\item[$\star$] $ \mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T}$ means $\forall \epsilon>0,\,\lim_{n \rightarrow \infty} P\{||\mathbf{T}_n-\mathbf{T}||<\epsilon \}=1$.
\item[$\star$] $ \mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T}$ means for every continuity point $\mathbf{t}$ of $F_\mathbf{T}$, $\lim_{n \rightarrow \infty}F_{\mathbf{T}_n}(\mathbf{t}) = F_\mathbf{T}(\mathbf{t})$.
\end{enumerate}
\item $ \mathbf{T}_n \stackrel{a.s.}{\rightarrow} \mathbf{T} \Rightarrow \mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T} \Rightarrow \mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T} $.
\item \label{dsop} If $\mathbf{a}$ is a vector of constants, $ \mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{a} \Rightarrow \mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{a}$.
\item Strong Law of Large Numbers (SLLN): Let $\mathbf{X}_1, \ldots, \mathbf{X}_n$ be independent and identically distributed random vectors with finite first moment, and let $\mathbf{X}$ be a general random vector from the same distribution. Then $ \overline{\mathbf{X}}_n \stackrel{a.s.}{\rightarrow} E(\mathbf{X})$.
\item Central Limit Theorem: Let $\mathbf{X}_1, \ldots, \mathbf{X}_n$ be i.i.d. random vectors with expected value vector $\boldsymbol{\mu}$ and covariance matrix $\boldsymbol{\Sigma}$. Then $\sqrt{n}(\overline{\mathbf{X}}_n-\boldsymbol{\mu})$ converges in distribution to a multivariate normal with mean $\mathbf{0}$ and covariance matrix $\boldsymbol{\Sigma}$.
\framebreak
\item \label{slutd} Slutsky Theorems for Convergence in Distribution:
\begin{enumerate}
\item \label{slutcond} If $\mathbf{T}_n \in \mathbb{R}^m$, $\mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T}$ and if $f:\,\mathbb{R}^m \rightarrow \mathbb{R}^q$ (where $q \leq m$) is continuous except possibly on a set $C$ with $P(\mathbf{T} \in C)=0$, then $f(\mathbf{T}_n) \stackrel{d}{\rightarrow} f(\mathbf{T})$.
\item \label{slutdiffd} If $\mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T}$ and $(\mathbf{T}_n - \mathbf{Y}_n) \stackrel{P}{\rightarrow} 0$, then $\mathbf{Y}_n \stackrel{d}{\rightarrow} \mathbf{T}$.
\item \label{slutstackd} If $\mathbf{T}_n \in \mathbb{R}^d$, $\mathbf{Y}_n \in \mathbb{R}^k$, $\mathbf{T}_n \stackrel{d}{\rightarrow} \mathbf{T}$ and $\mathbf{Y}_n \stackrel{P}{\rightarrow} \mathbf{c}$, then
\begin{displaymath}
\left( \begin{array}{c} \mathbf{T}_n \\ \mathbf{Y}_n \end{array} \right) \stackrel{d}{\rightarrow}
\left( \begin{array}{c} \mathbf{T} \\ \mathbf{c} \end{array} \right)
\end{displaymath}
\end{enumerate}
\framebreak
\item \label{slutp} Slutsky Theorems for Convergence in Probability:
\begin{enumerate}
\item \label{slutconp} If $\mathbf{T}_n \in \mathbb{R}^m$, $\mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T}$ and if $f:\,\mathbb{R}^m \rightarrow \mathbb{R}^q$ (where $q \leq m$) is continuous except possibly on a set $C$ with $P(\mathbf{T} \in C)=0$, then $f(\mathbf{T}_n) \stackrel{P}{\rightarrow} f(\mathbf{T})$.
\item \label{slutdiffp} If $\mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T}$ and $(\mathbf{T}_n - \mathbf{Y}_n) \stackrel{P}{\rightarrow} 0$, then $\mathbf{Y}_n \stackrel{P}{\rightarrow} \mathbf{T}$.
\item \label{slutstackp} If $\mathbf{T}_n \in \mathbb{R}^d$, $\mathbf{Y}_n \in \mathbb{R}^k$, $\mathbf{T}_n \stackrel{P}{\rightarrow} \mathbf{T}$ and $\mathbf{Y}_n \stackrel{P}{\rightarrow} \mathbf{Y}$, then
\begin{displaymath}
\left( \begin{array}{c} \mathbf{T}_n \\ \mathbf{Y}_n \end{array} \right) \stackrel{P}{\rightarrow}
\left( \begin{array}{c} \mathbf{T} \\ \mathbf{Y} \end{array} \right)
\end{displaymath}
\end{enumerate}
\framebreak
\item \label{delta} Delta Method (Theorem of Cram\'{e}r, Ferguson p. 45): Let $g: \mathbb{R}^d \rightarrow \mathbb{R}^k$ be such that the elements of \.{g}$(\mathbf{x}) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$ are continuous in a neighborhood of $\boldsymbol{\theta} \in \mathbb{R}^d$.
If $\mathbf{T}_n$ is a sequence of $d$-dimensional random vectors such that $\sqrt{n}(\mathbf{T}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{T}$, then $\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mbox{\.{g}} (\boldsymbol{\theta}) \mathbf{T}$. In particular, if $\sqrt{n}(\mathbf{T}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{T} \sim N(\mathbf{0},\mathbf{\Sigma})$, then $\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N(\mathbf{0}, \mbox{\.{g}}(\boldsymbol{\theta})\mathbf{\Sigma}\mbox{\.{g}}(\boldsymbol{\theta}) ^\prime)$. \end{enumerate} } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{An application of the Slutsky Theorems} \begin{itemize} \item Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim}\,?(\mu,\sigma^2)$ \pause \item By CLT, $Y_n = \sqrt{n}(\overline{X}_n-\mu) \stackrel{d}{\rightarrow} Y \sim N(0,\sigma^2)$ \pause \item Let $\widehat{\sigma}_n$ be \emph{any} consistent estimator of $\sigma$. \pause \item Then by \ref{slutd}.\ref{slutstackd}, $\mathbf{T}_n = \left( \begin{array}{cc} Y_n \\ \widehat{\sigma}_n \end{array} \right) \stackrel{d}{\rightarrow} \left( \begin{array}{cc} Y \\ \sigma \end{array} \right) = \mathbf{T} $ \pause \item The function $f(x,y)=x/y$ is continuous except if $y=0$ \\ so by \ref{slutd}.\ref{slutcond}, \pause \end{itemize} \begin{displaymath} f(\mathbf{T}_n) = \frac{\sqrt{n}(\overline{X}_n-\mu)}{\widehat{\sigma}_n} \pause \stackrel{d}{\rightarrow} f(\mathbf{T}) \pause = \frac{Y}{\sigma} \pause \sim N(0,1) \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Another application: Asymptotic normality of the sample variance} \begin{itemize} \item Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim}\,?(\mu,\sigma^2)$, and $\widehat{\sigma}^2_n = \frac{1}{n}\sum_{i=1}^n(X_i-\overline{X}_n)^2$. \pause \item Want to show $\sqrt{n}\left( \widehat{\sigma}^2_n -\sigma^2\right)$ converges to a normal. \pause \item Substitute $\mu$ for $\overline{X}_n$? Look at $\frac{1}{n}\sum_{i=1}^n(X_i-\mu)^2$? \pause \item If so, it's easy. \begin{itemize} \item Let $Y_i = (X_i-\mu)^2$ \pause \item $E(Y_i) = \sigma^2$ \pause \item $Var(Y_i) = E(Y_i^2) - \left(E(Y_i)\right)^2 \pause = E(X_i-\mu)^4-\sigma^4 \pause = \sigma^2_y$. \pause \item $\overline{Y}_n = \frac{1}{n}\sum_{i=1}^n(X_i-\mu)^2$ \pause \item By CLT, $\sqrt{n}\left( \overline{Y}_n -\sigma^2 \right) \stackrel{d}{\rightarrow} Y \sim N(0,\sigma^2_y)$. 
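\end{itemize}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{A quick simulation check before the proof}
\framesubtitle{A Python sketch, not a proof; the Exponential(1) example, $n$, number of replications and seed are illustrative}
For Exponential(1) data, $\mu = \sigma^2 = 1$ and $E(X_i-\mu)^4 = 9$, so $\sigma^2_y = 9 - 1 = 8$:
{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(seed=9999)
n, reps = 500, 10_000
x = rng.exponential(scale=1.0, size=(reps, n))
s2hat = x.var(axis=1)            # divides by n, matching sigma-hat^2
z = np.sqrt(n) * (s2hat - 1.0)   # sqrt(n)(sigma-hat^2 - sigma^2)
print(z.mean(), z.var())         # should be near 0 and sigma2_y = 8
\end{verbatim}
} % End size
\end{frame}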
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Show $\sqrt{n}\left( \widehat{\sigma}^2_n - \sigma^2 \right) - \sqrt{n}\left( \overline{Y}_n -\sigma^2 \right) \stackrel{p}{\rightarrow} 0$}
\framesubtitle{See \ref{slutd}.\ref{slutdiffd}}
\pause
\begin{eqnarray*}
\widehat{\sigma}^2_n & = & \frac{1}{n}\sum_{i=1}^n(X_i-\overline{X}_n)^2 \\
& = & \frac{1}{n}\sum_{i=1}^n(X_i {\color{red} -\mu+\mu } -\overline{X}_n)^2 \\ \pause
& = & \frac{1}{n}\sum_{i=1}^n \left[(X_i-\mu)^2 + 2(X_i-\mu)(\mu-\overline{X}_n) + (\mu-\overline{X}_n)^2 \right] \\ \pause
& = & \frac{1}{n}\sum_{i=1}^n(X_i-\mu)^2 + 2(\mu-\overline{X}_n)(\overline{X}_n-\mu) + (\overline{X}_n-\mu)^2 \\ \pause
& = & \frac{1}{n}\sum_{i=1}^n(X_i-\mu)^2 - (\overline{X}_n-\mu)^2 \\ \pause
& = & \overline{Y}_n - (\overline{X}_n-\mu)^2
\end{eqnarray*}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Using $\widehat{\sigma}^2_n = \overline{Y}_n - (\overline{X}_n-\mu)^2$}
{\footnotesize
\begin{eqnarray*}
\sqrt{n}\left( \overline{Y}_n - \sigma^2 \right) - \sqrt{n}\left( \widehat{\sigma}^2_n -\sigma^2 \right)
& = & \pause \sqrt{n}\left( \overline{Y}_n - {\color{red}\widehat{\sigma}^2_n} \right) \\ \pause
& = & \sqrt{n}\left( \overline{Y}_n - {\color{red}(\overline{Y}_n - (\overline{X}_n-\mu)^2)} \right) \\ \pause
& = & \sqrt{n}\left( \overline{X}_n-\mu\right)^2 \\ \pause
& = & \sqrt{n}(\overline{X}_n-\mu) \,\cdot\, (\overline{X}_n-\mu)
\end{eqnarray*}
\begin{itemize}
\item The first term goes in distribution to $X \sim N(0,\sigma^2)$ by the CLT. \pause
\item The second term goes to zero in probability by the LLN. \pause
\item $\left(\begin{array}{c} \sqrt{n}(\overline{X}_n-\mu) \\ \overline{X}_n-\mu \end{array}\right) \stackrel{d}{\rightarrow} \left(\begin{array}{c} X \\ 0 \end{array}\right)$ by \ref{slutd}.\ref{slutstackd}. \pause
\item By continuous mapping \ref{slutd}.\ref{slutcond}, $\sqrt{n}(\overline{X}_n-\mu) \,\cdot\, (\overline{X}_n-\mu) \stackrel{d}{\rightarrow} X\cdot 0 = 0$. \pause
\item Convergence in distribution to a constant implies convergence in probability (Rule~\ref{dsop})\pause, so the difference converges in probability to zero, and the result follows by \ref{slutd}.\ref{slutdiffd}. \hspace{4mm} $\blacksquare$
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The Result}
\begin{itemize}
\item Because the difference between $\sqrt{n}\left( \widehat{\sigma}^2_n -\sigma^2 \right)$ and $\sqrt{n}\left( \frac{1}{n}\sum_{i=1}^n(X_i-\mu)^2 -\sigma^2 \right)$ goes to zero in probability, they converge in distribution to the same target. \pause
\item The target is $N(0,\sigma^2_y)$, where \pause
\item[] $\sigma^2_y = E(X_i-\mu)^4-\sigma^4$.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Delta Method}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Univariate delta method}
In the multivariate Delta Method~\ref{delta}, the matrix $\mbox{\.{g}}(\boldsymbol{\theta})$ is a Jacobian.
The univariate version of the delta method says that \pause
\vspace{3mm}
if $\sqrt{n}\left( T_n- \theta \right) \stackrel{d}{\rightarrow} T$ and $g^{\prime\prime}(x)$ is continuous in a neighbourhood of $\theta$, then
\begin{displaymath}
\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} g^\prime(\theta) \, T.
\end{displaymath}
\vspace{5mm} \pause
When using the Central Limit Theorem, \emph{especially} if there is a $\theta \neq \mu$ in the model, it's safer to write
\begin{displaymath}
\sqrt{n}\left( g(\overline{X}_n)- g(\mu) \right) \stackrel{d}{\rightarrow} g^\prime(\mu) \, T
\end{displaymath}
and then substitute for $\mu$ in terms of $\theta$.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Delta Method Example}
\begin{itemize}
\item[] $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} \mbox{Poisson}(\lambda)$ \pause
\item[] $E(X_i)=Var(X_i) = \lambda$ \pause
\item[] $\frac{\sqrt{n}(\overline{X}_n-\lambda)}{\sqrt{\overline{X}_n}} \stackrel{d}{\rightarrow} Z_1 \sim N(0,1)$ \pause
\item[] Confidence interval $\left(\overline{X}_n - z_{\alpha/2} \sqrt{\frac{\overline{X}_n}{n}}~,~ \overline{X}_n + z_{\alpha/2} \sqrt{\frac{\overline{X}_n}{n}}\right)$ \pause
\item[]
\item[] Maybe we can do better.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Delta Method says $ \sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} g^\prime(\theta) \, T$}
\begin{itemize}
\item[] $\sqrt{n}(\overline{X}_n-\lambda) \stackrel{d}{\rightarrow} X \sim N(0,\lambda)$. \pause
\item[] $ \sqrt{n}\left( g(\overline{X}_n)- g(\lambda) \right) \pause \stackrel{d}{\rightarrow} g^\prime(\lambda) \, X \sim N(0,g^\prime(\lambda)^2\lambda)$ \pause
\item Choose $g$ to make the variance not depend on $\lambda$. \pause
\item How about $g(\lambda) = 2\sqrt{\lambda}$? \pause
\item[] $g^\prime(\lambda) = 2 \, \frac{1}{2}\lambda^{-1/2} = \pause \frac{1}{\sqrt{\lambda}}$. \pause
\item Variance of the target is $g^\prime(\lambda)^2\lambda = 1$. \pause
\item[] So,
\end{itemize}
\begin{equation*}
\sqrt{n}\left(2\sqrt{\overline{X}_n}-2\sqrt{\lambda}\right) \stackrel{d}{\rightarrow} Z_2 \sim N(0,1).
\end{equation*}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$\sqrt{n}\left(2\sqrt{\overline{X}_n}-2\sqrt{\lambda}\right) \stackrel{d}{\rightarrow} Z_2 \sim N(0,1)$}
\framesubtitle{Taking $\alpha = 0.05$, so $z_{\alpha/2} = 1.96$}
\begin{eqnarray*}
0.95 & \approx & P\left\{ -z_{\alpha/2} < \sqrt{n}\left(2\sqrt{\overline{X}_n}-2\sqrt{\lambda}\right) < z_{\alpha/2} \right\} \\ \pause
& = & P\left\{ -\frac{z_{\alpha/2}}{2\sqrt{n}} < \sqrt{\overline{X}_n}-\sqrt{\lambda} < \frac{z_{\alpha/2}}{2\sqrt{n}} \right\} \\ \pause
& = & P\left\{ \sqrt{\overline{X}_n}-\frac{z_{\alpha/2}}{2\sqrt{n}} < \sqrt{\lambda} < \sqrt{\overline{X}_n}+\frac{z_{\alpha/2}}{2\sqrt{n}} \right\} \\ \pause
& = & P\left\{ \left(\sqrt{\overline{X}_n}-\frac{z_{\alpha/2}}{2\sqrt{n}}\right)^2 < \lambda < \left(\sqrt{\overline{X}_n}+\frac{z_{\alpha/2}}{2\sqrt{n}}\right)^2 \right\}.
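\end{eqnarray*}
\pause
Compare $P\left\{\overline{X}_n - z_{\alpha/2} \sqrt{\frac{\overline{X}_n}{n}} < \lambda < \overline{X}_n + z_{\alpha/2} \sqrt{\frac{\overline{X}_n}{n}}\right\}$
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Sketch: comparing the two intervals by simulation}
\framesubtitle{A Python sketch; $\lambda$, $n$, the number of replications and the seed are illustrative}
Estimated coverage of the plain interval and the variance-stabilized interval:
{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(seed=9999)
lam, n, reps, z = 1.0, 30, 100_000, 1.96
xbar = rng.poisson(lam, size=(reps, n)).mean(axis=1)
half = z * np.sqrt(xbar / n)             # plain CLT interval
plain = np.mean((xbar - half < lam) & (lam < xbar + half))
lo = (np.sqrt(xbar) - z / (2 * np.sqrt(n))) ** 2
hi = (np.sqrt(xbar) + z / (2 * np.sqrt(n))) ** 2
stab = np.mean((lo < lam) & (lam < hi))  # stabilized interval
print(plain, stab)                       # estimated coverage of each
\end{verbatim}
} % End size
With settings like these, the stabilized interval typically comes closer to 0.95.
\end{frame}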
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The delta method comes from Taylor's Theorem}
\pause
\textbf{Taylor's Theorem}: Let the $n$th derivative $f^{(n)}$ be continuous in $[a,b]$ and differentiable in $(a,b)$, with $x$ and $x_0$ in $(a,b)$. Then there exists a point $\xi$ between $x$ and $x_0$ such that \pause
\begin{eqnarray*}
f(x) & = & f(x_0) \;+\; f^\prime(x_0)\,(x-x_0) \;+\; \frac{f^{\prime\prime}(x_0)(x-x_0)^2}{2!} \;+\; \ldots \\
& + & \frac{f^{(n)}(x_0)(x-x_0)^n}{n!} \;+\; \frac{f^{(n+1)}(\xi)(x-x_0)^{n+1}}{(n+1)!}
\end{eqnarray*}
\pause
where $R_n = \frac{f^{(n+1)}(\xi)(x-x_0)^{n+1}}{(n+1)!}$ is called the \emph{remainder term}. \pause
If $R_n \rightarrow 0$ as $n \rightarrow \infty$, \pause the resulting infinite series is called the \emph{Taylor Series} for $f(x)$.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Taylor's Theorem with two terms plus remainder}
\framesubtitle{Very common in applications}
\pause
Let $g(x)$ be a function for which $g^{\prime\prime}(x)$ is continuous in an open interval containing $x=\theta$. \pause Then
{\Large
\begin{displaymath}
g(x) = \pause g(\theta) + g^\prime(\theta)(x-\theta) + \frac{g^{\prime\prime}(\theta^*)(x-\theta)^2}{2!}
\end{displaymath}
\pause
} % End size
where $\theta^*$ is between $x$ and $\theta$.
\end{frame}

\begin{frame}
\frametitle{Delta method}
\framesubtitle{Using $g(x) = g(\theta) + g^\prime(\theta)(x-\theta) + \frac{1}{2}g^{\prime\prime}(\theta^*)(x-\theta)^2$}
\pause
Let $\sqrt{n}(T_n-\theta) \stackrel{d}{\rightarrow} T$, \pause so that $T_n \stackrel{p}{\rightarrow} \theta$. \pause
{\footnotesize
\begin{eqnarray*}
\sqrt{n}\left({\color{red}g(T_n)}-g(\theta)\right) \pause
& = & \sqrt{n}\left( {\color{red} g(\theta) + g^\prime(\theta)(T_n-\theta) + \frac{1}{2}g^{\prime\prime}(\theta_n^*)(T_n-\theta)^2} -g(\theta)\right) \\ \pause
& = & \sqrt{n}\left(g^\prime(\theta)(T_n-\theta) + \frac{1}{2}g^{\prime\prime}(\theta_n^*)(T_n-\theta)^2 \right) \\ \pause
& = & g^\prime(\theta) \, \sqrt{n}(T_n-\theta) \\
&& ~~~+ \pause \frac{1}{2}g^{\prime\prime}(\theta_n^*) \cdot \sqrt{n}(T_n-\theta) \cdot (T_n-\theta) \\ \pause
&\stackrel{d}{\rightarrow}& \pause g^\prime(\theta) T + 0
\end{eqnarray*}
} % End size
\end{frame}

% Need a homely example
\begin{frame}
\frametitle{That was fun, but it was all univariate.}
\begin{itemize}
\item The multivariate CLT establishes convergence to a multivariate normal.
\item Vectors of MLEs are approximately multivariate normal for large samples.
\item The multivariate delta method can yield the asymptotic distribution of useful functions of the MLE vector.
\end{itemize}
\vspace{15mm}
We need to look at random vectors and the multivariate normal distribution.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}.
Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/brunner/oldclass/2053f22} {\small\texttt{http://www.utstat.toronto.edu/brunner/oldclass/2053f22}} \end{frame} \end{document} \sin^{-1}\left(\sqrt{\overline{X}_m}\right) \sin^{-1}\left(\sqrt{\theta}\right) \begin{displaymath} \sqrt{m}\left( g(\overline{X}_m)- g(\theta) \right) \stackrel{d}{\rightarrow} Y \sim N\left(0,(g^\prime(\theta))^2\theta(1-\theta)\right). \end{displaymath} } CLT says $\sqrt{n}(\overline{X}_n-\lambda) \stackrel{d}{\rightarrow} T \sim N(0,\lambda)$. $\sqrt{n}\left( g(\overline{X}_n)- g(\lambda) \right) = \sqrt{n}\left( 2\sqrt{\overline{X}_n}- 2\sqrt{\lambda} \right) \stackrel{d}{\rightarrow} Z \sim N(0,1)$ \Pr\{-z < 2\sqrt{n}\left(\sqrt{\overline{X}_n}-\sqrt{\lambda}\right) < z \} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{itemize} \item \item \item \end{itemize} \end{frame} {\LARGE \begin{displaymath} \end{displaymath} }