% \documentclass[serif]{beamer} % Get Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
%\usepackage{beamerarticle}
%\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=red]{hyperref} % For live Web links with href in article mode
%\usepackage{amsmath} % For \binom{n}{y}
%\usepackage{graphicx} % To include pdf files!
%\usepackage{fullpage}
\usepackage{alltt}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
%\usetheme{Frankfurt} % Blue default: Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usetheme{AnnArbor} % CambridgeUS % I'm using this one (yellow) just to be different from Dehan.
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation> % A bare \mode is incomplete; presentation is the intended mode spec

\title{The Bootstrap\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2023}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Sampling distributions}

\begin{frame}
\frametitle{Sampling distributions}
%\framesubtitle{}
\begin{itemize}
\item Let $\mathbf{x} = (X_1, \ldots, X_n)$ be a random sample from some distribution $F$.
\item $t=t(\mathbf{x})$ is a statistic (could be a vector of statistics).
\item Need to know about the distribution of $t$.
\item Sometimes it's not easy, even asymptotically.
\end{itemize} \end{frame} \begin{frame} \frametitle{Sampling distribution of $t$: The elementary version} \framesubtitle{For example $t = \overline{X}$} \pause \begin{itemize} \item Sample repeatedly from this population (pretend). \item For each sample, calculate $t$. \item Make a relative frequency histogram of the $t$ values you observe. \item As the number of samples becomes very large, the histogram approximates the distribution of $t$. \end{itemize} \end{frame} \section{Bootstrap} \begin{frame} \frametitle{Bootstrap?} \framesubtitle{Pull yourself up by your bootstraps} \begin{center} \includegraphics[width=2.5in]{Dr_Martens,_black,_old.jpg} \end{center} {\scriptsize This photograph was taken by Tarquin. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. For more information, see the entry at the \href{http://commons.wikimedia.org/wiki/File:Dr_Martens,_black,_old.jpg} {wikimedia site}. } % End size \end{frame} \begin{frame} \frametitle{The (statistical) Bootstrap} \framesubtitle{Bradley Efron, 1979} \pause \begin{itemize} \item Select a random sample from the population. \item If the sample size is large, the sample is similar to the population. \pause \item Sample repeatedly \emph{from the sample.} This is called \emph{resampling}. \pause \item Sample from the sample? Think of putting the sample data values in a jar \ldots \pause \item Calculate the statistic for every bootstrap sample. \item A histogram of the resulting values approximates the shape of the sampling distribution of the statistic. \end{itemize} \end{frame} \begin{frame} \frametitle{Notation} %\framesubtitle{} \begin{itemize} \item Let $\mathbf{x} = (X_1, \ldots, X_n)$ be a random sample from some distribution $F$. \item $t=t(\mathbf{x})$ is a statistic (could be a vector of statistics). 
\pause
\item Form a ``bootstrap sample'' $\mathbf{x}^*$ by sampling $n$ values from $\mathbf{x}$ \emph{with replacement}.
\pause
\item Repeat this process $B$ times, obtaining $\mathbf{x}^*_1, \ldots, \mathbf{x}^*_B$.
\pause
\item Calculate the statistic for each bootstrap sample, obtaining $t^*_1, \ldots, t^*_B$.
\pause
\item Relative frequencies of $t^*_1, \ldots, t^*_B$ approximate the sampling distribution of $t$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Why does it work?}
\framesubtitle{Empirical distribution function}
\begin{displaymath}
\widehat{F}(x) = \frac{1}{n}\sum_{i=1}^nI\{X_i \leq x\} \pause
\stackrel{p}{\rightarrow} E(I\{X_i \leq x \}) \pause = F(x)
\end{displaymath}
\pause
\begin{itemize}
\item Resampling from $\mathbf{x}$ with replacement is the same as simulating a random variable whose distribution is the empirical distribution function $\widehat{F}(x)$.
\pause
\item Suppose the distribution function of $t$ is a nice smooth function of $F$.
\pause
\item Then as $n\rightarrow\infty$ and $B\rightarrow\infty$, bootstrap sample moments and quantiles of $t^*_1, \ldots, t^*_B$ converge to the corresponding moments and quantiles of the unknown distribution of $t$.
\pause
\item If the distribution of $\mathbf{x}$ is discrete and supported on a finite number of points, the technical issues are minor.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Main Application for This Course}
\framesubtitle{Skipping quantile bootstrap confidence intervals and many other interesting things}
\begin{itemize}
\item $t = \widehat{\boldsymbol{\theta}}_n$.
\pause
\item Even when the data are non-normal and the model is wrong, $\widehat{\boldsymbol{\theta}}_n$ is asymptotically normal and converges to a definite target, provided the MLE is unique.
\pause
\item For the models that appear in this class, if the model is correct (except for the distribution) and the parameters are identifiable, $\widehat{\boldsymbol{\theta}}_n$ is consistent as well as asymptotically normal.
\pause
\item The only problem is that the variances and covariances in $\mathbf{V}_n = \frac{1}{n}\mathcal{I}(\boldsymbol{\theta})$ may be wrong.
\item Need a different asymptotic covariance matrix (sometimes).
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Bootstrap the covariance matrix of $\widehat{\boldsymbol{\theta}}_n$}
%\framesubtitle{}
\begin{itemize}
\item Asymptotic distribution is multivariate normal.
\item Centered on the right thing.
\item The only other thing we need to know about the distribution of $\widehat{\boldsymbol{\theta}}_n$ is its covariance matrix.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Procedure}
%\framesubtitle{}
\begin{itemize}
\item The data ``jar'' contains not balls with single numbers, but strings of beads with a vector of observed values $\mathbf{d}_i$ written on them. Data values for a case stay together.
\pause
\item Select $n$ strings of beads with replacement, obtaining $\mathbf{x}^*_1$.
\pause
\item Do this $B$ times. Now you have $\mathbf{x}^*_1, \ldots, \mathbf{x}^*_B$.
\item Calculate $\widehat{\boldsymbol{\theta}}^*_1, \ldots, \widehat{\boldsymbol{\theta}}^*_B$.
\pause
\item You have a lot of information about the multivariate distribution of $\widehat{\boldsymbol{\theta}}_n$, but all you care about is the covariance matrix.
\pause
\item If there are $m$ parameters, you have a $B \times m$ matrix of numbers, with one column for each parameter in the model.
\item Calculate the sample covariance matrix for the data (using \texttt{var}).
\item This is the new $\widehat{\mathbf{V}}_n$.
\item Use it for Wald tests and $z$-tests.
\pause
\item All this applies to MOM as well as MLE.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Sometimes it's Unnecessary}
% \framesubtitle{}
\begin{itemize}
\item Linear structural equation models have a lot of robustness to the multivariate normal assumption.
\item When it fails, it's usually for data with ``excess kurtosis'' (heavy tails).
\item And even then, not necessarily for all parameters.
\pause
\item Trouble arises when the variance of the sample variance is involved.
\begin{displaymath}
Var\left( \frac{1}{n} \sum_{i=1}^n(x_i-\overline{x}_n)^2 \right)
\end{displaymath}
Fourth moments of the normal distribution will be too small, leading to an under-estimate.
\pause
\item For the double measurement design, standard errors of the regression coefficients are robust to normality.
\end{itemize}
\end{frame}

\section{Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Example: Double measurement}
%\framesubtitle{}
\begin{center} % Path diagram: Had to fiddle with this!
\begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin
\put(197,000){$X$} \put(202,4){\circle{20}}
\put(157,50){\framebox{$W_1$}} \put(232,50){\framebox{$W_2$}}
\put(197,15){\vector(-1,1){25}} % X -> W1
\put(209,15){\vector(1,1){25}} % X -> W2
\put(161,95){$e_1$} % x = V2+4
\put(165,90){\vector(0,-1){25}} % e1 -> W1
\put(236,95){$e_2$} % x = V3+4
\put(240,90){\vector(0,-1){25}} % e2 -> W2
\end{picture}
\end{center}
% \vspace{3mm}
\begin{align*}
W_1 & = X + e_1 \\
W_2 & = X + e_2,
\end{align*}
where $E(X)=\mu$, $Var(X)=\phi$, $E(e_1)=E(e_2)=0$, $Var(e_1)= \omega_1$, $Var(e_2)=\omega_2$, and $X$, $e_1$ and $e_2$ are all independent.
\end{frame}

\begin{frame}
\frametitle{Equivalent measurements?}
%\framesubtitle{}
\begin{center} % Path diagram: Had to fiddle with this!
\begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin \put(197,000){$X$} \put(202,4){\circle{20}} \put(157,50){\framebox{$W_1$}} \put(232,50){\framebox{$W_2$}} \put(197,15){\vector(-1,1){25}} % X -> W1 \put(209,15){\vector(1,1){25}} % X -> W2 \put(161,95){$e_1$} % x = V2+4 \put(165,90){\vector(0,-1){25}} % e1 -> W1 \put(236,95){$e_2$} % x = V3+4 \put(240,90){\vector(0,-1){25}} % e2 -> W2 \end{picture} \end{center} If $\omega_1 = Var(e_1)$ and $\omega_2 = Var(e_2)$ are equal, $W_1$ and $W_2$ are \emph{equivalent measurements}, and $Corr(W_1,W_2) = \frac{\phi}{\phi+\omega}$, the reliability. \end{frame} % HW: If unequal, can you still estimate the two different reliabilities? \begin{frame}[fragile] \frametitle{Simulate from the $t$ Distribution: Heavy-tailed } \framesubtitle{$Var(t) = \nu/(\nu-2)$, so with $\nu=3$, $Var(t)=3$} {\footnotesize % or scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> rm(list=ls()) > # Parameter values and sample size > phi = 7; omega1 = 3; omega2 = 3 > rel1 = round(phi/(phi+omega1),3); rel2 = round(phi/(phi+omega2),3) > c(rel1,rel2) # Reliabilities } [1] 0.7 0.7 {\color{blue}> n = 1500 > # Simulate from t distribution -- heavy tails > # Var(t) = nu/(nu-2) > set.seed(9999) > x = sqrt(phi) * rt(n,3)/sqrt(3) > e1 = sqrt(omega1) * rt(n,3)/sqrt(3); e2 = sqrt(omega2) * rt(n,3)/sqrt(3) > w1 = x + e1; w2 = x + e2 > ww = cbind(w1,w2) > vcovW = var(ww) * (n-1)/n; vcovW # Divide by n to get MLEs } w1 w2 w1 10.120663 6.727376 w2 6.727376 9.347715 \end{alltt} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Normal Theory Fit with \texttt{lavaan}} %\framesubtitle{} {\scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> # install.packages("lavaan", dependencies = TRUE) # Only need to do this once > library(lavaan) } {\color{red}This is lavaan 0.6-11 lavaan is FREE software! Please report any bugs. 
} {\color{blue}> # Normal theory with lavaan > mod = "x =~ 1.0*w1 + 1.0*w2 + x ~~ phi*x; w1 ~~ omega1*w1; w2 ~~ omega2*w2 + vardiff := omega1-omega2 + " > fit = lavaan(mod, data=ww) > # summary(fit) > parameterEstimates(fit) } lhs op rhs label est se z pvalue ci.lower ci.upper 1 x =~ w1 1.000 0.000 NA NA 1.000 1.000 2 x =~ w2 1.000 0.000 NA NA 1.000 1.000 3 x ~~ x phi 6.727 0.305 22.031 0.000 6.129 7.326 4 w1 ~~ w1 omega1 3.393 0.220 15.448 0.000 2.963 3.824 5 w2 ~~ w2 omega2 2.620 0.205 12.778 0.000 2.218 3.022 6 vardiff := omega1-omega2 vardiff 0.773 0.364 2.124 0.034 0.060 1.486 {\color{blue}> thetahat = coef(fit); thetahat } phi omega1 omega2 6.727 3.393 2.620 \end{alltt} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Bootstrap} %\framesubtitle{} {\footnotesize % or scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> # Bootstrap the "hard" way > # n = dim(ww)[1] is not needed > jar = 1:n; B = 1000 > tstar = matrix(NA,B,3) # Rows will hold theta-hat values > colnames(tstar) = names(coef(fit)) > for(j in 1:B) + \{ + rowz = sample(jar,size=n,replace=TRUE) + xstar = ww[rowz,] + fitstar = lavaan(mod, data=xstar) + tstar[j,] = coef(fitstar) + \} # Next bootstrap sample \pause > head(tstar) } phi omega1 omega2 [1,] 6.969279 4.360700 2.182922 [2,] 6.324895 4.075226 2.259924 [3,] 6.607809 3.034017 2.047602 [4,] 6.931564 3.314822 3.254835 [5,] 6.157233 3.992400 2.434781 [6,] 8.465813 3.019230 2.719412 \end{alltt} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Sampling Distribution of $\widehat{\omega_1} - \widehat{\omega_2}$} %\framesubtitle{} {\scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> vdiff = tstar[,2] - tstar[,3] # Vector of omega1hat - omega2hat values > hist(vdiff) } \end{alltt} \begin{center} \includegraphics[width=1.9in]{vdiffHist} \end{center} \pause \begin{alltt} {\color{blue}> shapiro.test(vdiff) # Test of normality } Shapiro-Wilk normality test 
data: vdiff W = 0.99873, p-value = 0.7097 \end{alltt} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Standard error of $\widehat{\omega_1} - \widehat{\omega_2}$} %\framesubtitle{} {\scriptsize % The alltt environment requires \usepackage{alltt} \begin{alltt} {\color{blue}> var(vdiff) } [1] 0.2889961 {\color{blue}> bootse = sqrt(var(vdiff)) > bootse # Compare normal theory estimate of 0.364 } [1] 0.5375836 \pause {\color{blue}> z = (thetahat[2]-thetahat[3])/bootse; z # Compare z = 2.124 } omega1 1.437819 \pause {\color{blue}> # Now bootstrap with lavaan: The easy way > fitB = lavaan(mod, data=ww, se = "bootstrap") \pause > parameterEstimates(fitB) } lhs op rhs label est se z pvalue ci.lower ci.upper 1 x =~ w1 1.000 0.000 NA NA 1.000 1.000 2 x =~ w2 1.000 0.000 NA NA 1.000 1.000 3 x ~~ x phi 6.727 0.922 7.295 0.000 5.255 8.734 4 w1 ~~ w1 omega1 3.393 0.386 8.781 0.000 2.685 4.213 5 w2 ~~ w2 omega2 2.620 0.353 7.419 0.000 1.996 3.390 6 vardiff := omega1-omega2 vardiff 0.773 0.525 1.473 0.141 -0.192 1.833 \end{alltt} } % End size \end{frame} \begin{frame} \frametitle{Advantages and Disadvantages} \framesubtitle{Of bootstrapping the normal MLEs} Advantages \begin{itemize} \item No assumptions about the distribution of the data. \item Works for \emph{any} linear structural equation model provided the observed data have finite fourth moments. \item It's easy. \pause \end{itemize} Disadvantages \begin{itemize} \item It might take a minute or two. \item The answer is slightly different every time. \item You need the raw data. 
\pause
\end{itemize}
\begin{displaymath}
L(\boldsymbol{\mu,\Sigma}) = |\boldsymbol{\Sigma}|^{-n/2} (2\pi)^{-np/2}
\exp\left\{ -\frac{n}{2}\left( \operatorname{tr}(\boldsymbol{\widehat{\Sigma}\Sigma}^{-1}) +
(\overline{\mathbf{d}}-\boldsymbol{\mu})^\top \boldsymbol{\Sigma}^{-1}
(\overline{\mathbf{d}}-\boldsymbol{\mu}) \right) \right\}
\end{displaymath}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/brunner/oldclass/431s23}
{\footnotesize \texttt{http://www.utstat.toronto.edu/brunner/oldclass/431s23}}

\end{frame}

\end{document}

# R code for the slides in this set.
# Testing differences between error variances, to see
# if we have equivalent measurements.
# Simulate double-measurement data W1 = X + e1, W2 = X + e2 with a
# heavy-tailed (scaled t with 3 df) latent variable and errors, fit the
# model by normal-theory ML with lavaan, then bootstrap the parameter
# estimates to get an honest standard error for omega1 - omega2.
rm(list=ls())
# Parameter values and sample size
phi = 7; omega1 = 3; omega2 = 3
rel1 = round(phi/(phi+omega1),3); rel2 = round(phi/(phi+omega2),3)
c(rel1,rel2) # Reliabilities
n = 1500
# Simulate from t distribution -- heavy tails
# Var(t) = nu/(nu-2)
set.seed(9999)
# Dividing the t(3) draws by sqrt(3) standardizes them to variance one,
# so x has variance phi and e1, e2 have variances omega1, omega2.
x = sqrt(phi) * rt(n,3)/sqrt(3)
e1 = sqrt(omega1) * rt(n,3)/sqrt(3); e2 = sqrt(omega2) * rt(n,3)/sqrt(3)
w1 = x + e1; w2 = x + e2
ww = cbind(w1,w2) # n x 2 data matrix of the two measurements
vcovW = var(ww) * (n-1)/n; vcovW # Divide by n to get MLEs
# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)
# Normal theory with lavaan
# Model: both loadings fixed to one; vardiff is a defined quantity whose
# z-test and confidence interval lavaan reports along with the free parameters.
mod = "x =~ 1.0*w1 + 1.0*w2
x ~~ phi*x; w1 ~~ omega1*w1; w2 ~~ omega2*w2
vardiff := omega1-omega2
"
fit = lavaan(mod, data=ww)
# summary(fit)
parameterEstimates(fit)
thetahat = coef(fit); thetahat
# Bootstrap the "hard" way
# n = dim(ww)[1] is not needed
jar = 1:n; B = 1000
tstar = matrix(NA,B,3) # Rows will hold theta-hat values
colnames(tstar) = names(coef(fit))
# Each pass resamples whole rows (cases) with replacement and re-fits,
# so tstar accumulates B bootstrap estimates of (phi, omega1, omega2).
for(j in 1:B)
   {
   rowz = sample(jar,size=n,replace=TRUE)
   xstar = ww[rowz,]
   fitstar = lavaan(mod, data=xstar)
   tstar[j,] = coef(fitstar)
   } # Next bootstrap sample
head(tstar)
vdiff = tstar[,2] - tstar[,3] # Vector of omega1hat - omega2hat values
hist(vdiff)
shapiro.test(vdiff) # Test of normality
var(vdiff)
bootse = sqrt(var(vdiff)) # Bootstrap standard error of the difference
bootse # Compare normal theory estimate of 0.364
z = (thetahat[2]-thetahat[3])/bootse; z # Compare z = 2.124
# Now bootstrap with lavaan: The easy way
fitB = lavaan(mod, data=ww, se = "bootstrap")
parameterEstimates(fitB)