% \documentclass[serif]{beamer} % Get Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
% To create handout using article mode: Comment above and uncomment below (2 places)
%\documentclass[12pt]{article}
%\usepackage{beamerarticle}
%\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=red]{hyperref} % For live Web links with href in article mode
%\usepackage{amsmath} % For \binom{n}{y}
%\usepackage{graphicx} % To include pdf files!
%\usepackage{fullpage}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout

\title{Variance-stabilizing Transformations and Weighted Least Squares\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2016}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Unequal Variance}

\begin{frame}
\frametitle{Unequal Variance}
\framesubtitle{Can you say ``heteroscedasticity''?}
\begin{center}
\includegraphics[width=3in]{breakline}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Why is unequal variance a problem?}
\framesubtitle{Not just because the model is wrong -- let's be more specific.}
\pause
\begin{itemize}
\item Normal distribution theory depends on cancelling $\sigma^2$ in numerator and denominator. \pause
\item There is some robustness. \pause Tests have approximately the right Type I error probability when the number of observations at each combination of $x$ values is large and roughly equal. \pause
\item $\widehat{\boldsymbol{\beta}}$ is still unbiased, but no longer minimum variance. \pause
\item Intuitively, observations where the variance is smaller should count more. \pause
\item If the variance depends on $x$, prediction intervals should be wider for $x$ values with larger variance.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Two solutions} \pause
%\framesubtitle{}
\begin{itemize}
\item Variance-stabilizing transformations: \pause If the variance depends on $E(Y_i)$, transform the response variable. \pause
\item Weighted least squares: \pause If the variances are proportional to known constants, transform both $\mathbf{X}$ and $\mathbf{y}$.
\end{itemize}
\end{frame}

\section{Delta Method}

\begin{frame}
\frametitle{The Delta Method} \pause
The univariate version of the delta method says that if
\begin{displaymath}
\sqrt{n}\left( T_n- \theta \right) \stackrel{d}{\rightarrow} T
\end{displaymath} \pause
then
\begin{displaymath}
\sqrt{n}\left( g(T_n)- g(\theta) \right) \pause \stackrel{d}{\rightarrow} g^\prime(\theta) \, T.
\end{displaymath} \pause
If $T\sim N(0,\sigma^2)$, it says
\vspace{3mm} \pause
\begin{displaymath}
\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} \pause Y \sim N\left(0,g^\prime(\theta)^2 \, \sigma^2\right).
\end{displaymath}
\end{frame}
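% Added illustration (not in the original deck): a minimal worked instance of
% the delta method. Here X-bar, mu and sigma^2 are generic symbols, not tied
% to any particular example elsewhere in the slides.
\begin{frame}
\frametitle{A quick illustration: $g(x) = x^2$}
\framesubtitle{An added sketch, using only the statement above}
\pause
Suppose the Central Limit Theorem gives
\begin{displaymath}
\sqrt{n}\left( \overline{X}_n - \mu \right) \stackrel{d}{\rightarrow} T \sim N(0,\sigma^2).
\end{displaymath}
\pause
Take $g(x) = x^2$, so that $g^\prime(\mu) = 2\mu$. \pause The delta method then says
\begin{displaymath}
\sqrt{n}\left( \overline{X}_n^2 - \mu^2 \right) \stackrel{d}{\rightarrow} 2\mu \, T \sim N\left(0, \, 4\mu^2\sigma^2\right),
\end{displaymath}
\pause
provided $\mu \neq 0$; if $\mu = 0$ the limit is degenerate at zero.
\end{frame}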
\begin{frame}
\frametitle{Taylor's Theorem}
\framesubtitle{Basis of the Delta Method}
\pause
For the function $g(x)$, let the $n$th derivative $g^{(n)}$ be continuous in $[a,b]$ and differentiable in $(a,b)$, with $x$ and $x_0$ in $(a,b)$. \pause Then there exists a point $\xi$ between $x$ and $x_0$ \pause such that \pause
{\footnotesize
\begin{eqnarray*}
g(x) & = & \pause g(x_0) \;+\; g^\prime(x_0)\,(x-x_0) \;+\; \frac{g^{\prime\prime}(x_0)(x-x_0)^2}{2!} \;+\; \ldots \\ \pause
& + & \frac{g^{(n)}(x_0)(x-x_0)^n}{n!} \;+\; \frac{g^{(n+1)}(\xi)(x-x_0)^{n+1}}{(n+1)!} ,
\end{eqnarray*} \pause
} % End size
where $R_n = \frac{g^{(n+1)}(\xi)(x-x_0)^{n+1}}{(n+1)!}$ is called the \emph{remainder term}.

\vspace{2mm} \pause
If $R_n \rightarrow 0$ as $n \rightarrow \infty$, the resulting infinite series is called the \emph{Taylor Series} for $g(x)$.
% There are other forms for the remainder term (with a different value of $\xi$) that sometimes prove useful.
\end{frame}

\begin{frame}
\frametitle{Two terms of a Taylor Series}
\framesubtitle{Plus remainder}
{\Large
\begin{displaymath}
g(x) = g(x_0) \,+\, g^\prime(x_0)\,(x-x_0) \,+\, \frac{g^{\prime\prime}(\xi)(x-x_0)^2}{2!}
\end{displaymath}
} % End size
\end{frame}

\begin{frame}
\frametitle{Proof of the Delta Method}
\framesubtitle{Using $g(x) = g(x_0) + g^\prime(x_0)\,(x-x_0) + \frac{g^{\prime\prime}(\xi)(x-x_0)^2}{2!}$}
\pause
Suppose $\sqrt{n}\left( T_n-\theta \right) \stackrel{d}{\rightarrow} T$. \pause Then expanding $g(x)$ about $\theta$, \pause
{\footnotesize
\begin{eqnarray*}
\sqrt{n}\left( {\color{red}g(T_n)}- g(\theta) \right) \pause
& = & \sqrt{n}\left( { \color{red}g(\theta) + g^\prime(\theta)\,(T_n-\theta) + \frac{g^{\prime\prime}(\xi_n)(T_n-\theta)^2}{2!} } - g(\theta) \right) \\ \pause
& = & g^\prime(\theta)\, {\color{blue}\sqrt{n}\left( T_n- \theta \right) } \: + \\ \pause
& & \hspace{5mm} \frac{1}{2} g^{\prime\prime}(\xi_n) \,\cdot\, \sqrt{n}\left( T_n- \theta \right) \,\cdot\, (T_n-\theta) \\ \pause
& & \hspace{8mm} p\downarrow \hspace{15mm} d\downarrow \hspace{10mm} p\downarrow \\
& & \hspace{6mm} \frac{1}{2} g^{\prime\prime}(\theta) \hspace{13mm} T \hspace{13mm} 0 \\ \pause
& \stackrel{d}{\rightarrow} & g^\prime(\theta) T \pause \hspace{2mm}+\hspace{15mm} 0
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{A variance-stabilizing transformation}
\framesubtitle{An application of the delta method}
\pause
\begin{itemize}
\item Because the Poisson process is such a good model, count data often have approximate Poisson distributions. \pause
\item Let $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim}$ Poisson$(\lambda)$. \pause
\item $E(X_i)=Var(X_i)=\lambda$ \pause
\item CLT says $\sqrt{n}(\overline{X}_n-\lambda) \stackrel{d}{\rightarrow} T \sim N(0,\lambda)$. \pause
\item Delta method says $\sqrt{n}\left( g(\overline{X}_n)- g(\lambda) \right) \stackrel{d}{\rightarrow} g^\prime(\lambda) \, T \pause = Y \sim N\left(0,g^\prime(\lambda)^2 \, \lambda\right)$ \pause
\item If $g^\prime(\lambda) = \frac{1}{\sqrt{\lambda}}$, \pause then $Y \sim N(0,1)$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{An elementary differential equation: $g^\prime(x) = \frac{1}{\sqrt{x}}$}
\framesubtitle{Solve by separation of variables}
\pause
{\LARGE
\begin{eqnarray*}
& & \frac{dg}{dx} = x^{-1/2} \\ \pause
& \Rightarrow & dg = x^{-1/2} \, dx\\ \pause
& \Rightarrow & \int dg = \int x^{-1/2} \, dx\\ \pause
& \Rightarrow & g(x) = \frac{x^{1/2}}{1/2} + c = 2 x^{1/2} + c
\end{eqnarray*}
}
\end{frame}

\begin{frame}
\frametitle{We have found}
\begin{eqnarray*}
\sqrt{n}\left( g(\overline{X}_n)- g(\lambda) \right) & = & \sqrt{n}\left( 2\overline{X}_n^{1/2}- 2\lambda^{1/2} \right) \\ \pause
& \stackrel{d}{\rightarrow} & Z \sim N(0,1)
\end{eqnarray*}
\pause
\begin{itemize}
\item We could say that $\sqrt{\overline{X}_n}$ is asymptotically normal, \pause with mean $\sqrt{\lambda}$ and variance $\frac{1}{4n}$. \pause
\item This is because $\overline{X}_n^{1/2} \approx \frac{Z}{2\sqrt{n}}+\sqrt{\lambda}$ for large $n$. \pause
\item Notice that the variance no longer depends on $\lambda$. \pause
\item This calculation could justify a square root transformation for count data. \pause
\item Because if $\overline{X}_n$ is asymptotically normal, so is $\sum_{i=1}^n X_i$. \pause
\item And the sum of independent Poissons is Poisson.
\end{itemize}
\end{frame}
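% Added sketch (not in the original deck): one way to use the stabilized scale.
% Here z is the 1 - alpha/2 quantile of the standard normal and 1 - alpha is a
% generic confidence level.
\begin{frame}
\frametitle{What the stable variance buys you}
\framesubtitle{An added sketch: an approximate confidence interval for $\lambda$}
\pause
Since $2\sqrt{n}\left(\sqrt{\overline{X}_n}-\sqrt{\lambda}\right) \stackrel{d}{\rightarrow} Z \sim N(0,1)$, \pause for large $n$
\begin{displaymath}
\Pr\left\{-z < 2\sqrt{n}\left(\sqrt{\overline{X}_n}-\sqrt{\lambda}\right) < z \right\} \approx 1-\alpha,
\end{displaymath}
where $z$ is the $1-\frac{\alpha}{2}$ quantile of the standard normal. \pause
\vspace{3mm}

Re-arranging the inequality, $\sqrt{\overline{X}_n} \pm \frac{z}{2\sqrt{n}}$ is an approximate confidence interval for $\sqrt{\lambda}$. \pause Squaring the endpoints (when the lower one is non-negative) gives an approximate confidence interval for $\lambda$, \pause with no separate variance to estimate.
\end{frame}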
\begin{frame}
\frametitle{Sometimes it can be pretty loose}
\framesubtitle{Just drop the remainder term in $g(x) = g(x_0) + g^\prime(x_0)(x-x_0) + R $}
\pause
If $Var(X) = \sigma^2$, then \pause
\begin{eqnarray*}
Var({\color{red}g(X)}) & \approx & Var\left( {\color{red}g(x_0) + g^\prime(x_0)(X-x_0)} \right) \\ \pause
& = & Var\left( g^\prime(x_0)X \right) \\ \pause
& = & g^\prime(x_0)^2Var(X) \\ \pause
& = & g^\prime(x_0)^2\sigma^2 \pause
\end{eqnarray*}
Call it ``linearization.'' \pause
\vspace{4mm}

The approximation $g(x) \approx g(x_0) + g^\prime(x_0)(x-x_0)$ is good for $x$ close to $x_0$.
\end{frame}
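% Added illustration (not in the original deck): a standard instance of the
% linearization approximation. Here X, mu, sigma^2 and k are generic symbols.
\begin{frame}
\frametitle{A loose example: taking logs}
\framesubtitle{An added illustration of linearization}
\pause
Suppose $E(X) = \mu > 0$ and $Var(X) = \sigma^2$. \pause With $g(x) = \log x$ and $x_0 = \mu$,
\begin{displaymath}
Var(\log X) \approx g^\prime(\mu)^2 \sigma^2 = \frac{\sigma^2}{\mu^2}.
\end{displaymath}
\pause
So if the standard deviation is roughly proportional to the mean, \pause say $\sigma \approx k\mu$, \pause then $Var(\log X) \approx k^2$, which does not depend on $\mu$. \pause This kind of rough calculation is one informal justification for taking logs of the response variable.
\end{frame}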
\begin{frame}
\frametitle{The arcsin-square root transformation for proportions}
\framesubtitle{This is careful again.}
\pause
Sometimes, variable values consist of proportions, one for each case.
\pause
\begin{itemize}
\item For example, cases could be hospitals. \pause
\item The variable of interest is the proportion of patients who came down with something \emph{unrelated} to their reason for admission -- hospital-acquired infection. \pause
\item This is an example of \emph{aggregated data}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The advice you often get}
\pause
When a proportion is the response variable in a regression, use the \emph{arcsin square root} transformation. \pause
\vspace{5mm}

That is, if the proportions are $P_1, \ldots, P_n$, \pause let
\begin{displaymath}
Y_i = 2\sin^{-1}(\sqrt{P_i})
\end{displaymath}
\pause
and use the $Y_i$ values in your regression.
\vspace{5mm}
\pause
\begin{center}{\huge \textbf{Why?}}
\end{center}
\end{frame}

\begin{frame}
\frametitle{It's a variance-stabilizing transformation.}
\pause
\begin{itemize}
\item The proportions are little sample means: $P_i = \frac{1}{m}\sum_{j=1}^m X_{i,j}$
\item Drop the $i$ for now. \pause
\item $X_1, \ldots, X_m$ may not be independent, but let's pretend. \pause
\item $P = \overline{X}_m$ \pause
\item Approximately, $\overline{X}_m \sim N\left(\theta,\frac{\theta(1-\theta)}{m}\right)$ \pause
\item Normality is good. \pause
\item Variance that depends on the mean $\theta$ is not so good.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Apply the delta method}
\pause
Central Limit Theorem says \pause
{\Large
\begin{displaymath}
\sqrt{m}(\overline{X}_m-\theta) \stackrel{d}{\rightarrow} T \sim N\left(0,\theta(1-\theta)\right)
\end{displaymath}
} \pause
Delta method says \pause
{\Large
\begin{displaymath}
\sqrt{m}\left( g(\overline{X}_m)- g(\theta) \right) \stackrel{d}{\rightarrow} Y \sim N\left(0,g^\prime(\theta)^2 \,\theta(1-\theta)\right).
\end{displaymath}
} \pause
Want a function $g(x)$ with \pause
{\Large
\begin{displaymath}
g^\prime(x) = \frac{1}{\sqrt{x(1-x)}}
\end{displaymath}
} \pause
Try $g(x) = 2\sin^{-1}\left(\sqrt{x}\right)$.
\end{frame}

\begin{frame}
\frametitle{Chain rule to get $\frac{d}{dx}\sin^{-1}\left(\sqrt{x}\right)$}
\pause
``Recall'' that $\frac{d}{dx}\sin^{-1}(x) = \frac{1}{\sqrt{1-x^2}}$. \pause Then,
\vspace{5mm}
\begin{eqnarray*}
\frac{d}{dx}2\sin^{-1}\left(\sqrt{x}\right) \pause & = & 2 \frac{1}{\sqrt{1-\sqrt{x}^2}} \pause \,\cdot\, \frac{1}{2} x^{-1/2} \\ \pause
& = & \frac{1}{\sqrt{x(1-x)}} \pause \mbox{ for } 0 < x < 1. \\
\end{eqnarray*}
\pause
\vspace{5mm}

Conclusion:
\begin{displaymath}
\sqrt{m}\left( 2\sin^{-1}\sqrt{\overline{X}_m} - 2\sin^{-1}\sqrt{\theta}\right) \stackrel{d}{\rightarrow} Y \sim N(0,1)
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{So the arcsin-square root transformation stabilizes the variance}
\framesubtitle{Because $ \sqrt{m}\left( 2\sin^{-1}\sqrt{\overline{X}_m} - 2\sin^{-1}\sqrt{\theta}\right) \stackrel{d}{\rightarrow} Y \sim N(0,1)$}
\pause
\begin{itemize}
\item If we want to do a regression on aggregated data, the point we have reached is that approximately, \pause
{\Large
\begin{displaymath}
Y_i \sim N\left( 2\sin^{-1}\sqrt{\theta_i} \, , \, \frac{1}{m_i} \right) \pause
\end{displaymath}
}
\item The variance no longer depends on the probability that the proportion estimates. \pause
\item $Y$ is meaningful because the function $g(x)$ is increasing. \pause
\item But the variance still depends on the number of patients in the hospital.
\end{itemize}
% There are so many good questions about this.
% At least the transformation is an increasing function. Call it "chance" of the event.
% Weighted least squares. I wonder how much it matters.
% For predictions, unwrap the functions
% In a regression, do you really need to estimate the variance with MSE? For a small number of cases but proportions based on a lot of instances, this could matter.
% Wonder how harmful the within-case iid assumption really is.
\end{frame}

\section{Weighted Least Squares}

\begin{frame}
\frametitle{Weighted Least Squares}
\pause
%\framesubtitle{}
\begin{itemize}
\item Suppose that the variances of $Y_1, \ldots, Y_n$ are unequal, but proportional to known constants. \pause
\item Aggregated data fit this pattern. Means are usually based on different sample sizes. \pause
\item Generalize it: \pause In the regression model $\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$, \pause
\item[] $cov(\boldsymbol{\epsilon}) = \sigma^2 \mathbf{V}$, \pause with $\mathbf{V}$ a \emph{known} symmetric positive definite matrix.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Transform the Data}
\pause
\framesubtitle{}
Have $\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}$ with $cov(\boldsymbol{\epsilon}) = \sigma^2 \mathbf{V}$.

\vspace{5mm} \pause
\renewcommand{\arraystretch}{1.5}
\begin{displaymath}
\begin{array}{cccccc}
& \mathbf{y} &=& \mathbf{X} \boldsymbol{\beta} &+& \boldsymbol{\epsilon} \\ \pause
\Rightarrow & \mathbf{V}^{-1/2}\mathbf{y} &=& \mathbf{V}^{-1/2}\mathbf{X} \boldsymbol{\beta} &+& \mathbf{V}^{-1/2}\boldsymbol{\epsilon} \\ \pause
& \mathbf{y}^* &=& \mathbf{X}^* \boldsymbol{\beta} &+& \boldsymbol{\epsilon}^* \\
\end{array}
\end{displaymath}
\pause
\renewcommand{\arraystretch}{1.0}
So that
\begin{itemize}
\item $cov(\boldsymbol{\epsilon}^*) = \sigma^2 \mathbf{I}_n$ \pause
\item Note that the transformed model has the same $\boldsymbol{\beta}$.
\end{itemize}
\end{frame}
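% Added sketch (not in the original deck): ordinary least squares on the
% transformed model yields the weighted least squares formulas quoted on the
% next slide.
\begin{frame}
\frametitle{Least squares on the transformed model}
\framesubtitle{An added sketch}
\pause
Because $\mathbf{V}^{-1/2}$ is symmetric, $\mathbf{X}^{*\top} = \mathbf{X}^\top \mathbf{V}^{-1/2}$. \pause Ordinary least squares on $\mathbf{y}^* = \mathbf{X}^* \boldsymbol{\beta} + \boldsymbol{\epsilon}^*$ gives
\begin{eqnarray*}
\widehat{\boldsymbol{\beta}}_{wls} & = & (\mathbf{X}^{*\top}\mathbf{X}^*)^{-1} \mathbf{X}^{*\top} \mathbf{y}^* \\ \pause
& = & (\mathbf{X}^\top \mathbf{V}^{-1/2}\mathbf{V}^{-1/2} \mathbf{X})^{-1} \, \mathbf{X}^\top \mathbf{V}^{-1/2} \, \mathbf{V}^{-1/2} \mathbf{y} \\ \pause
& = & (\mathbf{X}^\top\mathbf{V}^{-1}\mathbf{X})^{-1} \mathbf{X}^\top \mathbf{V}^{-1} \mathbf{y},
\end{eqnarray*}
\pause
and since $cov(\boldsymbol{\epsilon}^*) = \sigma^2 \mathbf{I}_n$, \pause
\begin{displaymath}
cov\left(\widehat{\boldsymbol{\beta}}_{wls}\right) = \sigma^2 (\mathbf{X}^\top\mathbf{V}^{-1}\mathbf{X})^{-1}.
\end{displaymath}
\end{frame}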
\begin{frame}
\frametitle{You don't have to literally transform the data}
\framesubtitle{Just transform the estimates, tests and intervals}
\pause
{\footnotesize
\begin{itemize}
\item $\widehat{\boldsymbol{\beta}}_{wls} = (\mathbf{X}^\top\mathbf{V}^{-1}\mathbf{X})^{-1} \mathbf{X}^\top \mathbf{V}^{-1} \mathbf{y}$ \pause and so on. \pause
\item The most common case is where the variances are proportional to known constants and the errors are independent. \pause That is, $\mathbf{V}$ is diagonal. \pause
\item Most software will allow you to supply the diagonal elements of $\mathbf{V}^{-1}$. \pause
\item These are called the ``weights.'' \pause
\item In the case of aggregated data where $Var(Y_i) = \frac{\sigma^2}{m_i}$, the weights are just $m_1, \ldots, m_n$. \pause
\item In \texttt{help(lm)}, R's help says \pause
\vspace{2mm}
%\begin{quote}
Non-NULL weights can be used to indicate that different observations have different variances (with the values in weights being inversely proportional to the variances); or equivalently, when the elements of \texttt{weights} are positive integers $w_i$, that each response $y_i$ is the mean of $w_i$ unit-weight observations (including the case that there are $w_i$ observations equal to $y_i$ and the data have been summarized).
%\end{quote}
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{Sometimes weighted least squares is used loosely}
\framesubtitle{Is this an abuse?}
\pause
\begin{itemize}
\item Residual plots suggest that the variance might be proportional to $x_{ij}$. \pause
\item So pretend it's known, \pause and use weights $\frac{1}{x_{1j}}, \ldots, \frac{1}{x_{nj}}$. \pause
\item This has been studied. The Wikipedia article has references.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf16}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf16}}

\end{frame}

\end{document}