% Improved in 2016: Percentile method and symmetry, but cut MV delta method
% \documentclass[serif]{beamer} % Get Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
%\usepackage{beamerarticle}
%\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=red]{hyperref} % For live Web links with href in article mode
%\usepackage{amsmath} % For \binom{n}{y}
%\usepackage{graphicx} % To include pdf files!
%\usepackage{fullpage}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: fairly thin, but still swallows some material at the bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{The Bootstrap\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2017}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

% Background reading slide deleted (saved after end of document). Put actual Efron references.

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Sampling distributions}

\begin{frame}
\frametitle{Sampling distributions}
\pause
%\framesubtitle{}
\begin{itemize}
\item Let $\mathbf{x} = (X_1, \ldots, X_n)$ be a random sample from some distribution $F$. \pause
\item $T=T(\mathbf{x})$ is a statistic (could be a vector of statistics). \pause
\item Need to know about the distribution of $T$. \pause
\item Sometimes it's not easy, even asymptotically.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Sampling distribution of $T$: The elementary version}
\framesubtitle{For example $T = \overline{X}$}
\pause
\begin{itemize}
\item Sample repeatedly from this population (pretend). \pause
\item For each sample, calculate $T$. \pause
\item Make a relative frequency histogram of the $T$ values you observe. \pause
\item As the number of samples becomes very large, the histogram approximates the distribution of $T$.
\end{itemize}
\end{frame}

\section{Bootstrap}

\begin{frame}
\frametitle{What is a bootstrap?}
\framesubtitle{Pull yourself up by your bootstraps}
\begin{center}
\includegraphics[width=2.5in]{Dr_Martens,_black,_old.jpg}
\end{center}
{\scriptsize This photograph was taken by Tarquin. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. For more information, see the entry at the
\href{http://commons.wikimedia.org/wiki/File:Dr_Martens,_black,_old.jpg}{wikimedia site}.
} % End size
\end{frame}

\begin{frame}
\frametitle{The (statistical) Bootstrap}
\framesubtitle{Bradley Efron, 1979}
\pause
\begin{itemize}
\item Select a random sample from the population. \pause
\item If the sample size is large, the sample is similar to the population. \pause
\item Sample repeatedly from the sample. This is called \emph{resampling}. \pause
\item Sample from the sample? Think of putting the sample data values in a jar \ldots \pause
\item Calculate the statistic for every bootstrap sample. \pause
\item A histogram of the resulting values approximates the shape of the sampling distribution of the statistic (see the R sketch on the next slide).
\end{itemize}
\end{frame}
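% Added illustration (not in the original deck): a minimal R sketch of the
% resampling recipe above. The exponential data and the sample median are
% arbitrary stand-ins for "some data" and "some statistic."
\begin{frame}[fragile]
\frametitle{The recipe in R}
\framesubtitle{A minimal sketch; the data and the statistic are stand-ins}
{\footnotesize
\begin{verbatim}
x = rexp(50)          # Stand-in data: any numeric vector will do
B = 5000              # Number of bootstrap samples
Tstar = numeric(B)
for(j in 1:B)
    {
    xstar = sample(x, size=length(x), replace=TRUE) # The jar
    Tstar[j] = median(xstar)  # T = the sample median, say
    }
hist(Tstar)  # Approximates the sampling distribution of T
\end{verbatim}
} % End size
\end{frame}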
\begin{frame}
\frametitle{Notation}
%\framesubtitle{}
\begin{itemize}
\item Let $\mathbf{x} = (X_1, \ldots, X_n)$ be a random sample from some distribution $F$. \pause
\item $T=T(\mathbf{x})$ is a statistic (could be a vector of statistics). \pause
\item Form a ``bootstrap sample'' $\mathbf{x}^*$ by sampling $n$ values from $\mathbf{x}$ \emph{with replacement}. \pause
\item Repeat this process $B$ times, obtaining $\mathbf{x}^*_1, \ldots, \mathbf{x}^*_B$. \pause
\item Calculate the statistic for each bootstrap sample, obtaining $T^*_1, \ldots, T^*_B$. \pause
\item Relative frequencies of $T^*_1, \ldots, T^*_B$ approximate the sampling distribution of $T$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Why does it work?}
\pause
\begin{displaymath}
\widehat{F}(x) = \frac{1}{n}\sum_{i=1}^n I\{X_i \leq x\} \pause
\stackrel{a.s.}{\rightarrow} E(I\{X_i \leq x \}) \pause = F(x)
\end{displaymath}
\pause
\begin{itemize}
\item Resampling from $\mathbf{x}$ with replacement is the same as simulating a random variable whose distribution is the empirical distribution function $\widehat{F}(x)$. \pause
\item Suppose the distribution function of $T$ is a nice smooth function of $F$. \pause
\item Then as $n\rightarrow\infty$ and $B\rightarrow\infty$, bootstrap sample moments and quantiles of $T^*_1, \ldots, T^*_B$ converge to the corresponding moments and quantiles of the distribution of $T$. \pause
\item If the distribution of $\mathbf{x}$ is discrete and supported on a finite number of points, the technical issues are minor.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Quantile Bootstrap Confidence Intervals}
\pause
% \framesubtitle{}
\begin{itemize}
\item Suppose $T_n$ is a consistent estimator of $g(\theta)$, \pause
\item and the distribution of $T_n$ is approximately symmetric around $g(\theta)$. \pause
\item Then the lower $(1-\alpha)100\%$ confidence limit for $g(\theta)$ is the $\alpha/2$ sample quantile of $T^*_1, \ldots, T^*_B$, \pause and the upper limit is the $1-\alpha/2$ sample quantile. \pause
\item For example, the 95\% confidence interval ranges from the 2.5th to the 97.5th percentile \pause of $T^*_1, \ldots, T^*_B$ (see the R sketch on the next slide).
\end{itemize}
\end{frame}
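% Added illustration (not in the original deck): computing the quantile
% interval from the bootstrap values, continuing the sketch above.
\begin{frame}[fragile]
\frametitle{Quantile interval in R}
\framesubtitle{A sketch, assuming \texttt{Tstar} holds $T^*_1, \ldots, T^*_B$ as above}
{\footnotesize
\begin{verbatim}
alpha = 0.05
quantile(Tstar, probs = c(alpha/2, 1-alpha/2))
\end{verbatim}
} % End size
\vspace{3mm}
The two sample quantiles are the lower and upper confidence limits.
\end{frame}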
\begin{frame}
\frametitle{Symmetry}
\framesubtitle{A requirement that is often ignored}
\begin{center}
\includegraphics[width=4.5in]{Symmetric}
\end{center}
\pause
\vspace{5mm}
The distribution of $T$ symmetric about $\theta$ means for all $d>0$,
$P\{T>\theta + d \} = P\{T<\theta - d \}$.
\end{frame}

% # Raise it up so the symbols will be inside the plotting area.
% x = seq(from=-4,to=4,by=0.01); y = dnorm(x)+1
% plot(x,y,type='l',bty='n',xaxt='n',yaxt='n',xlab=' ',ylab=' ',ylim=c(0,2))
% lines(c(-4,4),c(1,1)); lines(c(0,0),c(1,dnorm(0)+1))
% lines(c(-1.5,-1.5),c(1,dnorm(-1.5)+1)); lines(c(1.5,1.5),c(1,dnorm(1.5)+1))
% text(0,.95,expression(theta))
% text(-1.5,.95,expression(theta-d)); text(1.5,.95,expression(theta+d))

\begin{frame}
\frametitle{Why Symmetry?}
% \framesubtitle{}
\begin{center}
\includegraphics[width=3in]{Symmetric}
\end{center}
\pause
\begin{itemize}
\item The distribution of $T$ symmetric about $\theta$ means for all $d>0$, $P\{T>\theta + d \} = P\{T<\theta - d \}$. \pause
\item Select $d$ so that the probability equals $\alpha/2$. \pause
\end{itemize}
\begin{eqnarray*}
1-\alpha & = & P\{\theta-d < T < \theta+d \} \\ \pause
         & = & P\{T-d < \theta < T+d \} \pause
\end{eqnarray*}
Need to estimate $d$.
\end{frame}

\begin{frame}
\frametitle{Estimating $d$}
\framesubtitle{There are two natural estimates}
\begin{center}
\includegraphics[width=4in]{Symmetric}
\end{center}
\begin{displaymath}
1-\alpha = P\{\theta-d < T < \theta+d \} \pause
         = P\{Q_{\alpha/2} < T < Q_{1-\alpha/2} \}
\end{displaymath}
\pause
\begin{eqnarray*}
T-\widehat{d}_1 = \widehat{Q}_{\alpha/2} & \Rightarrow & \pause \widehat{d}_1 = T - \widehat{Q}_{\alpha/2} \\ \pause
T+\widehat{d}_2 = \widehat{Q}_{1-\alpha/2} & \Rightarrow & \pause \widehat{d}_2 = \widehat{Q}_{1-\alpha/2} - T \pause
\end{eqnarray*}
I would average them:
\begin{displaymath}
\widehat{d} = \frac{1}{2}(\widehat{d}_1+\widehat{d}_2) \pause
            = \frac{1}{2}(\widehat{Q}_{1-\alpha/2}-\widehat{Q}_{\alpha/2})
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{$1-\alpha = P\{T-d < \theta < T+d \}$}
\framesubtitle{Plug in an estimate of $d$}
\pause
\begin{itemize}
\item $\widehat{d}_1 = T - \widehat{Q}_{\alpha/2}$
\item $\widehat{d}_2 = \widehat{Q}_{1-\alpha/2} - T$
\item $\widehat{d} = \frac{1}{2}(\widehat{d}_1+\widehat{d}_2)$
\end{itemize}
\pause
\vspace{3mm}
Using $\widehat{d}_1$ on the left yields \pause
\begin{displaymath}
T - \widehat{d}_1 \pause = T - (T - \widehat{Q}_{\alpha/2}) \pause = \widehat{Q}_{\alpha/2}
\end{displaymath}
\pause
Using $\widehat{d}_2$ on the right yields \pause
\begin{displaymath}
T + \widehat{d}_2 \pause = T + (\widehat{Q}_{1-\alpha/2} - T) \pause = \widehat{Q}_{1-\alpha/2} \pause ,
\end{displaymath}
which is the quantile confidence interval.
\end{frame}

\begin{frame}
\frametitle{Maybe more reasonable: $T \pm \widehat{d}$}
\framesubtitle{But this is just me}
\begin{center}
\includegraphics[width=4in]{Symmetric}
\end{center}
\pause
\vspace{3mm}
where
\begin{itemize}
\item $\widehat{d}_1 = T - \widehat{Q}_{\alpha/2}$
\item $\widehat{d}_2 = \widehat{Q}_{1-\alpha/2} - T$
\item $\widehat{d} = \frac{1}{2}(\widehat{d}_1+\widehat{d}_2)$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Justifying the Assumption of Symmetry}
\pause
%\framesubtitle{}
\begin{itemize}
\item Smooth functions of asymptotic normals are asymptotically normal. \pause
\item This includes functions of sample moments and MLEs. \pause
\item Delta method: \pause
\begin{itemize}
\item[] $\sqrt{n}\left( T_n- \theta \right) \stackrel{d}{\rightarrow} T \sim N(0,\sigma^2)$ means $T_n$ is asymptotically normal. \pause
\item[] $\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} Y \sim N\left(0,g^\prime(\theta)^2 \, \sigma^2\right)$ means $g(T_n)$ is asymptotically normal too. \pause
\end{itemize}
\item Univariate and multivariate versions.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Can use asymptotic normality directly}
\pause
%\framesubtitle{}
Suppose $T$ is asymptotically normal. \pause
\begin{itemize}
\item Sample standard deviation of $T^*_1, \ldots, T^*_B$ is a good standard error. \pause
\item Confidence interval is $T \pm 1.96 \, SE$. \pause
\item If $T$ is a vector, the sample variance-covariance matrix of $T^*_1, \ldots, T^*_B$ is useful.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example}
Let $Y_1, \ldots, Y_n$ be a random sample from an unknown distribution with expected value $\mu$ and variance $\sigma^2$. Give a point estimate and a 95\% confidence interval for the coefficient of variation $\frac{\sigma}{\mu}$. \pause
\begin{itemize}
\item Point estimate is $T=S/\overline{Y}$. \pause
\item If $\mu \neq 0$ then $T$ is asymptotically normal, and therefore approximately symmetric. \pause
\item Resample from the data urn $n$ times with replacement, and calculate $T^*_1$. \pause
\item Repeat $B$ times, yielding $T^*_1, \ldots, T^*_B$. \pause
\item Percentile confidence interval for $\frac{\sigma}{\mu}$ is $(\widehat{Q}_{\alpha/2},\widehat{Q}_{1-\alpha/2})$. \pause
\item Alternatively, since $T$ is approximately normal, \pause calculate $\widehat{\sigma}_T = \sqrt{\frac{1}{B-1}\sum_{i=1}^B(T^*_i-\overline{T}^*)^2}$. \pause
\item And a 95\% confidence interval is $T \pm 1.96 \, \widehat{\sigma}_T$ (see the R sketch on the next slide).
\end{itemize}
\end{frame}
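% Added illustration (not in the original deck): the coefficient of
% variation example in R, assuming the data are in a numeric vector y.
% Shows the percentile interval and the normal-theory interval side by side.
\begin{frame}[fragile]
\frametitle{The example in R}
\framesubtitle{A sketch, assuming the data are in a numeric vector \texttt{y}}
{\footnotesize
\begin{verbatim}
n = length(y); B = 5000
T = sd(y)/mean(y)                 # Point estimate of sigma/mu
Tstar = numeric(B)
for(j in 1:B)
    {
    ystar = sample(y, size=n, replace=TRUE)
    Tstar[j] = sd(ystar)/mean(ystar)
    }
quantile(Tstar, c(0.025, 0.975))  # Percentile interval
SE = sd(Tstar)                    # sigma-hat sub T
c(T - 1.96*SE, T + 1.96*SE)       # Normal-theory interval
\end{verbatim}
} % End size
\end{frame}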
\section{Distribution-free regression example}

\begin{frame}
\frametitle{Example: Distribution-free regression}
\pause
\framesubtitle{}
Independently for $i=1, \ldots, n$, let
\begin{displaymath}
Y_i = \beta_0 + \beta_1 X_i + \epsilon_i,
\end{displaymath}
\pause
where
\begin{itemize}
\item $X_i$ and $\epsilon_i$ come from unknown distributions, \pause
\item $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2$, \pause
\item $X_i$ and $\epsilon_i$ are independent. \pause
\item Moments of $X_i$ will be denoted $E(X)$, $E(X^2)$, etc. \pause
\end{itemize}
\vspace{10mm}
Observable data consist of the pairs $(X_1,Y_1), \ldots, (X_n,Y_n)$.
\end{frame}

\begin{frame}
\frametitle{Estimation}
%\framesubtitle{}
Estimate $\beta_0$ and $\beta_1$ as usual by \pause
\begin{eqnarray*}
\widehat{\beta}_1 & = & \frac{\sum_{i=1}^n(X_i-\overline{X})(Y_i-\overline{Y})}{\sum_{i=1}^n(X_i-\overline{X})^2} \\ \\
 & = & \frac{\sum_{i=1}^n X_i Y_i - n \overline{X} \, \overline{Y}}{\sum_{i=1}^n X_i^2 - n \overline{X}^2} \mbox{ and} \\ \\ \\
\widehat{\beta}_0 & = & \overline{Y} - \widehat{\beta}_1 \overline{X}
\end{eqnarray*}
\pause
\begin{itemize}
\item Consistency follows from the Law of Large Numbers and continuous mapping. \pause
\item Looks like $\widehat{\beta}_0$ and $\widehat{\beta}_1$ are asymptotically normal. \pause
\item Use this to get tests and confidence intervals.
\end{itemize}
\end{frame}

% Omitted some multivariate delta method material. It's after the \end{document}.

\begin{frame}
\frametitle{Bootstrap approach: All by computer}
\pause
%\framesubtitle{}
\begin{itemize}
\item Earlier discussion implies $\widehat{\boldsymbol{\beta}}$ is asymptotically multivariate normal. \pause
\item Say $\widehat{\boldsymbol{\beta}} \stackrel{.}{\sim} N_p(\boldsymbol{\beta},\mathbf{V})$. \pause
\item All we need is a good $\widehat{\mathbf{V}}$. \pause
\item Put data vectors $\mathbf{d}_i = (\mathbf{x}_i,Y_i)$ in a jar. \pause
\item Sample $n$ vectors with replacement, yielding $\mathbf{D}_1^*$. Fit the regression model, obtaining $\widehat{\boldsymbol{\beta}}^*_1$. \pause
\item Repeat $B$ times. This yields $\widehat{\boldsymbol{\beta}}^*_1, \ldots, \widehat{\boldsymbol{\beta}}^*_B$. \pause
\item The sample covariance matrix of $\widehat{\boldsymbol{\beta}}^*_1, \ldots, \widehat{\boldsymbol{\beta}}^*_B$ is $\widehat{\mathbf{V}}$. \pause
\item Under $H_0: \mathbf{L}\boldsymbol{\beta} = \mathbf{h}$, \pause
\begin{displaymath}
(\mathbf{L}\widehat{\boldsymbol{\beta}}-\mathbf{h})^\top
(\mathbf{L}\widehat{\mathbf{V}}\mathbf{L}^\top)^{-1}
(\mathbf{L}\widehat{\boldsymbol{\beta}}-\mathbf{h})
\stackrel{.}{\sim} \chi^2(r)
\end{displaymath}
(see the R sketch on the next slide).
\end{itemize}
\end{frame}
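% Added illustration (not in the original deck): the row-resampling
% bootstrap for regression. The data frame dat with columns y, x1, x2,
% the hypothesis matrix L (r rows), and the vector h are assumed, not given.
\begin{frame}[fragile]
\frametitle{Bootstrap $\widehat{\mathbf{V}}$ and the Wald test in R}
\framesubtitle{A sketch, assuming a data frame \texttt{dat} and $H_0: \mathbf{L}\boldsymbol{\beta} = \mathbf{h}$}
{\footnotesize
\begin{verbatim}
n = nrow(dat); B = 2000
betastar = matrix(0, B, 3)   # p = 3 here: intercept and two slopes
for(j in 1:B)
    {
    Dstar = dat[sample(1:n, size=n, replace=TRUE), ] # Resample rows
    betastar[j,] = coef(lm(y ~ x1 + x2, data=Dstar))
    }
Vhat = var(betastar)         # Sample covariance of the beta-hat-stars
betahat = coef(lm(y ~ x1 + x2, data=dat))
W = t(L %*% betahat - h) %*% solve(L %*% Vhat %*% t(L)) %*%
    (L %*% betahat - h)
1 - pchisq(W, df = nrow(L))  # Approximate p-value: chi-squared(r)
\end{verbatim}
} % End size
\end{frame}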
\begin{frame}
\frametitle{Remark}
%\framesubtitle{}
This is not a typical bootstrap regression. \pause
\begin{itemize}
\item Usually people fit a model and then bootstrap the residuals, not the whole data vector. \pause
\item Bootstrapping the residuals applies to conditional regression (conditional on $\mathbf{X}=\mathbf{x}$). \pause
\item Our regression model is unconditional. \pause
\item The large-sample arguments are simpler in the unconditional case.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf17}{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf17}}
\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Set up the problem systematically}
\pause
%\framesubtitle{}
Denoting the ``data vector'' for case $i$ by $\mathbf{D}_i$, \pause
{\small
\begin{displaymath}
\mathbf{D}_i = \left( \begin{array}{c} X_i \\ X^2_i \\ Y_i \\ X_i Y_i \end{array} \right), \pause
\mbox{ and~~ }
\overline{\mathbf{D}}_n = \left( \begin{array}{l}
\frac{1}{n}\sum_{i=1}^n X_i \\\\
\frac{1}{n}\sum_{i=1}^n X^2_i \\\\
\frac{1}{n}\sum_{i=1}^n Y_i \\\\
\frac{1}{n}\sum_{i=1}^n X_i Y_i
\end{array} \right) \pause
\stackrel{a.s.}{\rightarrow}
\left( \begin{array}{l} E(X) \\\\ E(X^2) \\\\ E(Y) \\\\ E(XY) \end{array} \right) = \boldsymbol{\mu}
\end{displaymath} \pause
} % End size
Then $\left( \begin{array}{c} \widehat{\beta}_0 \\ \widehat{\beta}_1 \end{array} \right) = g(\overline{\mathbf{D}}_n) \stackrel{a.s.}{\rightarrow} g(\boldsymbol{\mu}) = \left( \begin{array}{c} \beta_0 \\ \beta_1 \end{array} \right)$
\end{frame}

\begin{frame}
\frametitle{What would we do next}
\framesubtitle{To use the Central Limit Theorem and Delta method?}
CLT says $\sqrt{n}(\overline{\mathbf{D}}_n-\boldsymbol{\mu}) \stackrel{d}{\rightarrow} \mathbf{T} \sim N\left(\mathbf{0}, \boldsymbol{\Sigma} \right)$. Delta method says
$\sqrt{n}(g(\overline{\mathbf{D}}_n)-g(\boldsymbol{\mu})) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N(\mathbf{0}, \mbox{\.{g}}(\boldsymbol{\mu})\boldsymbol{\Sigma}\mbox{\.{g}}(\boldsymbol{\mu})^\prime)$, where
{\small
\begin{displaymath}
\mathbf{D}_i = \left( \begin{array}{c} X_i \\ X^2_i \\ Y_i \\ X_i Y_i \end{array} \right)
\end{displaymath}
}
\begin{itemize}
\item Calculate $\boldsymbol{\Sigma}(\boldsymbol{\mu}) = V(\mathbf{D}_i)$ by hand.
\item Calculate \.{g}$(\boldsymbol{\mu}) = \left[ \frac{\partial g_i}{\partial \mu_j} \right]$ by hand.
\item Estimate $\boldsymbol{\Sigma}$ using sample moments, by computer.
\item Calculate the estimated asymptotic covariance matrix $\frac{1}{n}\mbox{\.{g}}(\overline{\textbf{D}}_n) \widehat{\boldsymbol{\Sigma}}_n \mbox{\.{g}}(\overline{\textbf{D}}_n)^\prime$ by computer (see the R sketch on the next slide).
\item Use that in confidence intervals and asymptotic tests.
\end{itemize}
\end{frame}
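% Added illustration (not in the original deck): the delta method
% computation by computer, assuming data vectors x and y. The Jacobian
% g-dot is approximated numerically here rather than derived by hand.
\begin{frame}[fragile]
\frametitle{The delta method computation in R}
\framesubtitle{A sketch, assuming data vectors \texttt{x} and \texttt{y}; numerical Jacobian}
{\footnotesize
\begin{verbatim}
g = function(m)  # m = (E(X), E(X^2), E(Y), E(XY))
    {
    b1 = (m[4] - m[1]*m[3]) / (m[2] - m[1]^2)
    c(m[3] - b1*m[1], b1)             # (beta0, beta1)
    }
D = cbind(x, x^2, y, x*y); n = length(x)
Dbar = colMeans(D); Sigmahat = var(D) # Sample moments
gdot = matrix(0, 2, 4); h = 1e-6      # Forward differences
for(j in 1:4)
    {
    e = rep(0, 4); e[j] = h
    gdot[,j] = (g(Dbar + e) - g(Dbar)) / h
    }
Vhat = gdot %*% Sigmahat %*% t(gdot) / n  # Asymptotic covariance
\end{verbatim}
} % End size
\end{frame}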
\begin{frame}
\frametitle{It's a lot of work.}
%\framesubtitle{}
\begin{itemize}
\item Most problems have more than one explanatory variable.
\item You could easily make a ``little'' mistake.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Background Reading}
%\framesubtitle{It may be a little bit helpful.}
\begin{itemize}
\item Davison's \emph{Statistical Models} has almost nothing.
\item The best we can do for now is the \href{http://en.wikipedia.org}{Wikipedia} article under \href{http://en.wikipedia.org/wiki/Bootstrapping_(statistics)}{Bootstrapping (statistics)}.
\end{itemize}
\end{frame}