% Large sample tools for Applied Stat I
% Notes and comments are after the end of the document
\documentclass[serif]{beamer} % Serif for Computer Modern math font.
% \documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout

\title{Omitted Variables\footnote{See last slide for copyright information.}}
\subtitle{STA302 Fall 2013}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{The fixed $x$ regression model}
$Y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i, \mbox{ with } \epsilon_i \sim N(0,\sigma^2)$

\vspace{15mm}
Think of the model as \emph{conditional} given $\mathbf{X}_i = \mathbf{x}_i$.
\end{frame}

\begin{frame}
\frametitle{Independence of $\epsilon_i$ and $\mathbf{X}_i$}
\begin{itemize}
\item The statement $\epsilon_i \sim N(0,\sigma^2)$ is a statement about the \emph{conditional} distribution of $\epsilon_i$ given $\mathbf{X}_i = \mathbf{x}_i$.
\item It says the density of $\epsilon_i$ given $\mathbf{X}_i = \mathbf{x}_i$ does not depend on $\mathbf{x}_i$.
\item For convenience, assume $\mathbf{X}_i$ has a density.
\end{itemize}
% \vspace{5mm}
\begin{eqnarray*}
& & f_{\epsilon|\mathbf{x}}(\epsilon|\mathbf{x}) = f_{\epsilon}(\epsilon) \\
& \Rightarrow & \frac{f_{\epsilon,\mathbf{x}}(\epsilon,\mathbf{x})}{f_{\mathbf{x}}(\mathbf{x})} = f_{\epsilon}(\epsilon) \\
& \Rightarrow & f_{\epsilon,\mathbf{x}}(\epsilon,\mathbf{x}) = f_{\mathbf{x}}(\mathbf{x}) f_{\epsilon}(\epsilon)
\end{eqnarray*}
Independence!
\end{frame}

\begin{frame}
\frametitle{The fixed $x$ regression model}
\framesubtitle{$Y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i, \mbox{ with }\epsilon_i \sim N(0,\sigma^2)$}

\vspace{5mm}
\begin{itemize}
\item If viewed as conditional on $\mathbf{X}_i = \mathbf{x}_i$, this model implies independence of $\epsilon_i$ and $\mathbf{X}_i$.
\item What is $\epsilon_i$? \emph{Everything else} that affects $Y_i$.
\item So the usual model says that if the independent variables are random, they have \emph{zero covariance} with all other variables that are related to $Y_i$, but do not appear in the model.
\item For observational data, this assumption is almost always violated.
\item Does it matter?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example}
Suppose that the variables $X_2$ and $X_3$ have an impact on $Y$ and are correlated with $X_1$, but they are not part of the data set. The values of the dependent variable are generated as follows:
\begin{displaymath}
Y_i = \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \beta_3 X_{i,3} + \epsilon_i,
\end{displaymath}
independently for $i= 1, \ldots, n$, where $\epsilon_i \sim N(0,\sigma^2)$.
The independent variables are random, with expected value and variance-covariance matrix
\begin{displaymath}
E\left[ \begin{array}{c} X_{i,1} \\ X_{i,2} \\ X_{i,3} \end{array} \right]
= \left[ \begin{array}{c} \mu_1 \\ \mu_2 \\ \mu_3 \end{array} \right]
\mbox{ ~and~ }
V\left[ \begin{array}{c} X_{i,1} \\ X_{i,2} \\ X_{i,3} \end{array} \right]
= \left[ \begin{array}{rrr}
\phi_{11} & \phi_{12} & \phi_{13} \\
 & \phi_{22} & \phi_{23} \\
 & & \phi_{33}
\end{array} \right],
\end{displaymath}
where $\epsilon_i$ is independent of $X_{i,1}$, $X_{i,2}$ and $X_{i,3}$.
\end{frame}

\begin{frame}
\frametitle{Absorb $X_2$ and $X_3$}
\begin{columns} % Use Beamer's columns to make narrower margins!
\column{1.1\textwidth}
Since $X_2$ and $X_3$ are not observed, they are absorbed by the intercept and error term.
{\small
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \beta_3 X_{i,3} + \epsilon_i \\
 &=& (\beta_0 + \beta_2\mu_2 + \beta_3\mu_3) + \beta_1 X_{i,1} + (\beta_2 X_{i,2} + \beta_3 X_{i,3} - \beta_2\mu_2 - \beta_3\mu_3 + \epsilon_i) \\
 &=& \beta^\prime_0 + \beta_1 X_{i,1} + \epsilon^\prime_i.
\end{eqnarray*}
} % End size
And,
\begin{displaymath}
Cov(X_{i,1},\epsilon^\prime_i) = \beta_2\phi_{12} + \beta_3\phi_{13} \neq 0
\end{displaymath}
\end{columns}
\end{frame}

\begin{frame}
\frametitle{The ``True'' Model}
\framesubtitle{Almost always true for observational data}
{\LARGE
\begin{displaymath}
Y_i = \beta_0 + \beta_1 X_i + \epsilon_i,
\end{displaymath}
} % End Size

\vspace{5mm}
where $E(X_i)=\mu_x$, $Var(X_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(X_i,\epsilon_i)=c$.

\vspace{5mm}
Under this model,
\begin{displaymath}
\sigma_{xy} = Cov(X_i,Y_i) = Cov(X_i,\beta_0 + \beta_1 X_i + \epsilon_i) = \beta_1 \sigma^2_x + c
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Estimate $\beta_1$ as usual}
\begin{eqnarray*}
\widehat{\beta}_1 &=& \frac{\sum_{i=1}^n(X_i-\overline{X})(Y_i-\overline{Y})} {\sum_{i=1}^n(X_i-\overline{X})^2} \\
 &=& \frac{\frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})(Y_i-\overline{Y})} {\frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})^2} \\
 &=& \frac{\widehat{\sigma}_{xy}}{\widehat{\sigma}^2_x} \\
 &\approx& \frac{\sigma_{xy}}{\sigma^2_x} \mbox{ for large } n\\
 &=& \frac{\beta_1 \sigma^2_x + c}{\sigma^2_x} \\
 &=& \beta_1 + \frac{c}{\sigma^2_x}
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{$\widehat{\beta}_1 \approx \beta_1 + \frac{c}{\sigma^2_x}$}
\begin{itemize}
\item $\widehat{\beta}_1$ is biased, even as $n\rightarrow\infty$.
\item It's inconsistent.
\item It could be almost anything, depending on the value of $c$, the covariance between $X_i$ and $\epsilon_i$.
\item The only time $\widehat{\beta}_1$ behaves properly is when $c=0$.
\item Probability of Type I error goes almost surely to one.
\item What if $\beta_1 < 0$ but $\beta_1 + \frac{c}{\sigma^2_x} > 0$?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{All this applies to multiple regression}
\framesubtitle{Of course}
\emph{When a regression model fails to include all the independent variables that contribute to the dependent variable, and those omitted independent variables have non-zero covariance with variables that are in the model, the regression coefficients are biased and inconsistent}.
\end{frame}

\begin{frame}
\frametitle{Correlation-Causation}
\begin{itemize}
\item The problem of omitted variables is the technical version of the correlation-causation issue.
\item The omitted variables are ``confounding'' variables.
\item With random assignment and good procedure, $x$ and $\epsilon$ have zero covariance.
\item But random assignment is not always possible.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/~brunner/oldclass/302f13} {\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/302f13}}
\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Example}
\end{frame}

\begin{frame}
\frametitle{}
% \framesubtitle{}
\begin{itemize}
\item
\item
\item
\end{itemize}
\end{frame}

{\LARGE
\begin{displaymath}
\end{displaymath}
} % End Size

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%