% Omitted variables for STA302
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout

\title{Omitted Variables and Instrumental Variables\footnote{See last slide for copyright information.}}
\subtitle{STA302 Fall 2017}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\section{Omitted Variables}

\begin{frame}
\frametitle{The fixed $x$ regression model}

$y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_k x_{i,k} + \epsilon_i, \mbox{ with } \epsilon_i \sim N(0,\sigma^2)$
\pause
\vspace{15mm}

Think of the model as \emph{conditional} given $\mathbf{X}_i = \mathbf{x}_i$.
\end{frame}

\begin{frame}
\frametitle{Independence of $\epsilon_i$ and $\mathbf{X}_i$}
\pause
\begin{itemize}
\item The statement $\epsilon_i \sim N(0,\sigma^2)$ is a statement about the \emph{conditional} distribution of $\epsilon_i$ given $\mathbf{X}_i$. \pause
\item It says the density of $\epsilon_i$ given $\mathbf{X}_i$ does not depend on $\mathbf{X}_i$. \pause
\item For convenience, assume $\mathbf{X}_i$ has a density. \pause
\end{itemize}
% \vspace{5mm}
\begin{eqnarray*}
& & f_{\epsilon|\mathbf{x}}(\epsilon|\mathbf{x}) = f_{\epsilon}(\epsilon) \\ \pause
& \Rightarrow & \frac{f_{\epsilon,\mathbf{x}}(\epsilon,\mathbf{x})}{f_{\mathbf{x}}(\mathbf{x})} = f_{\epsilon}(\epsilon) \\ \pause
& \Rightarrow & f_{\epsilon,\mathbf{x}}(\epsilon,\mathbf{x}) = f_{\mathbf{x}}(\mathbf{x}) \, f_{\epsilon}(\epsilon)
\end{eqnarray*}
\pause
Independence!
\end{frame}

\begin{frame}
\frametitle{The fixed $x$ regression model}

\begin{displaymath}
y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_k x_{i,k} + \epsilon_i, \mbox{ with } \epsilon_i \sim N(0,\sigma^2)
\end{displaymath}
\pause
\vspace{5mm}
\begin{itemize}
\item If viewed as conditional on $\mathbf{x}_i$, this model implies independence of $\epsilon_i$ and $\mathbf{x}_i$, because the conditional distribution of $\epsilon_i$ given $\mathbf{x}_i$ does not depend on $\mathbf{x}_i$. \pause
\item What is $\epsilon_i$? \emph{Everything else} that affects $y_i$. \pause
\item So the usual model says that if the independent variables are random, they have \emph{zero covariance} with all other variables that are related to $y_i$ but are not included in the model. \pause
\item For observational data (no random assignment), this assumption is almost always violated. \pause
\item Does it matter?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example}

Suppose that the variables $x_2$ and $x_3$ have an impact on $y$ and are correlated with $x_1$, but they are not part of the data set.
\pause
The values of the dependent variable are generated as follows:
\pause
\begin{displaymath}
y_i = \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2} + \beta_3 x_{i,3} + \epsilon_i,
\end{displaymath}
independently for $i=1, \ldots, n$, where $\epsilon_i \sim N(0,\sigma^2)$.
\pause
The independent variables are random, with expected value and variance-covariance matrix
\pause
\begin{displaymath}
E\left( \begin{array}{c} x_{i,1} \\ x_{i,2} \\ x_{i,3} \end{array} \right) =
\left( \begin{array}{c} \mu_1 \\ \mu_2 \\ \mu_3 \end{array} \right)
\mbox{ ~and~ }
Cov\left( \begin{array}{c} x_{i,1} \\ x_{i,2} \\ x_{i,3} \end{array} \right) =
\left( \begin{array}{rrr} \phi_{11} & \phi_{12} & \phi_{13} \\
                          & \phi_{22} & \phi_{23} \\
                          &           & \phi_{33} \end{array} \right),
\end{displaymath}
\pause
where $\epsilon_i$ is statistically independent of $x_{i,1}$, $x_{i,2}$ and $x_{i,3}$.
\end{frame}

\begin{frame}
\frametitle{Absorb $x_2$ and $x_3$}
\begin{columns} % Use Beamer's columns to make narrower margins!
\column{1.1\textwidth}
Since $x_2$ and $x_3$ are not observed, they are absorbed by the intercept and error term.
\pause
{\small
\begin{eqnarray*}
y_i &=& \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2} + \beta_3 x_{i,3} + \epsilon_i \\ \pause
    &=& (\beta_0 + \beta_2\mu_2 + \beta_3\mu_3) + \beta_1 x_{i,1} +
        (\beta_2 x_{i,2} + \beta_3 x_{i,3} - \beta_2\mu_2 - \beta_3\mu_3 + \epsilon_i) \\ \pause
    &=& \beta^*_0 + \beta_1 x_{i,1} + \epsilon^*_i.
\end{eqnarray*}
} % End size
\pause
And, since $Cov(x_{i,1},\epsilon_i)=0$,
\begin{displaymath}
Cov(x_{i,1},\epsilon^*_i) = Cov(x_{i,1}, \, \beta_2 x_{i,2} + \beta_3 x_{i,3} + \epsilon_i)
= \beta_2\phi_{12} + \beta_3\phi_{13} \neq 0
\end{displaymath}
\end{columns}
\end{frame}

\begin{frame}
\frametitle{The ``True'' Model}
\framesubtitle{Almost always closer to the truth than the usual model, for observational data}
\pause
{\LARGE
\begin{displaymath}
y_i = \beta_0 + \beta_1 x_i + \epsilon_i,
\end{displaymath}
} % End Size
\vspace{5mm} \pause
where $E(x_i)=\mu_x$, $Var(x_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(x_i,\epsilon_i)=c$.
\vspace{5mm} \pause

Under this model,
\begin{displaymath}
\sigma_{xy} = Cov(x_i,y_i) = Cov(x_i,\beta_0 + \beta_1 x_i + \epsilon_i) = \beta_1 \sigma^2_x + c
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Estimate $\beta_1$ as usual}
\framesubtitle{Recalling $Cov(x_i,\epsilon_i)=c$}
\begin{eqnarray*}
b_1 &=& \frac{\sum_{i=1}^n(x_i-\overline{x})(y_i-\overline{y})}
             {\sum_{i=1}^n(x_i-\overline{x})^2} \\ \pause
    &=& \frac{\frac{1}{n}\sum_{i=1}^n(x_i-\overline{x})(y_i-\overline{y})}
             {\frac{1}{n}\sum_{i=1}^n(x_i-\overline{x})^2} \\ \pause
    &=& \frac{\widehat{\sigma}_{xy}}{\widehat{\sigma}^2_x} \\ \pause
    & \rightarrow & \frac{\sigma_{xy}}{\sigma^2_x} \mbox{~~ as } n \rightarrow \infty \\ \pause
    &=& \frac{\beta_1 \sigma^2_x + c}{\sigma^2_x} \\ \pause
    &=& \beta_1 + \frac{c}{\sigma^2_x} \pause \neq \beta_1
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{$b_1 \rightarrow \beta_1 + \frac{c}{\sigma^2_x}$}
\pause
\begin{itemize}
\item $b_1$ is inconsistent\pause, meaning it approaches the wrong target as $n \rightarrow \infty$. \pause
\item It could be almost anything, depending on the value of $c$, the covariance between $x_i$ and $\epsilon_i$. \pause
\item The only time $b_1$ behaves properly is when $c=0$. \pause
\item If $\beta_1=0$ but $c \neq 0$, test $H_0: \beta_1=0$\pause, and the probability of Type I error goes to one as $n \rightarrow \infty$. \pause
\item What if $\beta_1 < 0$ but $\beta_1 + \frac{c}{\sigma^2_x} > 0$\pause, and you test $H_0: \beta_1=0$?
\end{itemize}
\end{frame}
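% Added illustration: a minimal simulation sketch of the limit above, assuming
% Python with numpy is available (R would do just as well). The parameter
% values, seed and variable names below are invented only to make the point.
\begin{frame}[fragile]
\frametitle{A simulation sketch of $b_1 \rightarrow \beta_1 + \frac{c}{\sigma^2_x}$}
\framesubtitle{Illustrative numbers only}

With $\beta_1=-1$, $\beta_2=\beta_3=2$, $\phi_{11}=1$ and $\phi_{12}=\phi_{13}=\frac{1}{2}$, the limit is $\beta_1 + \frac{c}{\sigma^2_x} = -1 + 2 = 1$: the wrong sign.
\pause
{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(2017)
n = 100_000                        # large n: b1 is near its limit
beta = np.array([1.0, -1.0, 2.0, 2.0])    # beta0, ..., beta3
Phi = np.array([[1.0, 0.5, 0.5],          # cov of (x1, x2, x3)
                [0.5, 1.0, 0.3],
                [0.5, 0.3, 1.0]])
X = rng.multivariate_normal([0, 0, 0], Phi, size=n)
y = beta[0] + X @ beta[1:] + rng.normal(size=n)
x1 = X[:, 0]
b1 = np.cov(x1, y)[0, 1] / np.var(x1, ddof=1)  # y on x1 only
c = beta[2]*Phi[0, 1] + beta[3]*Phi[0, 2]      # Cov(x1, epsilon*)
print(b1, beta[1] + c/Phi[0, 0])               # both close to +1
\end{verbatim}
} % End size
\end{frame}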
\begin{frame}
\frametitle{All this applies to multiple regression}
\framesubtitle{Of course}
\pause
\emph{When a regression model fails to include all the independent variables that contribute to the dependent variable, and those omitted independent variables have non-zero covariance with variables that are in the model, the regression coefficients are biased and inconsistent}.
\end{frame}

\begin{frame}
\frametitle{Correlation-Causation}
\begin{itemize}
\item The problem of omitted variables is the technical version of the correlation-causation issue. \pause
\item The omitted variables are ``confounding'' variables. \pause
\item With random assignment and good procedure, $x$ and $\epsilon$ have zero covariance. \pause
\item But random assignment is not always possible. \pause
\item Most applications of regression to observational data provide very poor information about the regression coefficients. \pause
\item Is bad information better than no information at all? \pause
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{How about another estimation method?}
\framesubtitle{Other than ordinary least squares}
\pause
\begin{itemize}
\item Can \emph{any} other method be successful? \pause
\item This is a very practical question, \pause because almost all regressions with observed (as opposed to manipulated) independent variables have the disease.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{For simplicity, assume normality}
\framesubtitle{$y_i = \beta_0 + \beta_1 x_i + \epsilon_i$}
\pause
\begin{itemize}
\item Assume $(x_i,\epsilon_i)$ are bivariate normal. \pause
\item This makes $(x_i,y_i)$ bivariate normal. \pause
\item $(x_1,y_1), \ldots, (x_n,y_n) \stackrel{i.i.d.}{\sim} N_2(\mathbf{m},\mathbf{V})$, \pause where
\end{itemize}
\begin{displaymath}
\mathbf{m} = \left( \begin{array}{c} m_1 \\ m_2 \end{array} \right) =
\left( \begin{array}{c} \mu_x \\ \beta_0+\beta_1\mu_x \end{array} \right)
\end{displaymath}
\pause
and
\begin{displaymath}
V = \left( \begin{array}{c c} v_{11} & v_{12} \\ & v_{22} \end{array} \right) =
\left( \begin{array}{c c} \sigma^2_x & \beta_1\sigma^2_x+c \\
                          & \beta_1^2\sigma^2_x + 2 \beta_1c + \sigma^2_\epsilon \end{array} \right).
\end{displaymath}
\pause
\begin{itemize}
\item All you can ever learn from the data are the approximate values of $\mathbf{m}$ and $V$. \pause
\item Even if you knew $\mathbf{m}$ and $V$ exactly, could you know $\beta_1$?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Five equations in six unknowns}

The parameter is $\theta = (\mu_x, \sigma^2_x, \sigma^2_\epsilon, c, \beta_0, \beta_1)$. \pause
The distribution of the data is determined by \pause
\vspace{2mm}
{\footnotesize
\begin{displaymath}
\left( \begin{array}{c} m_1 \\ m_2 \end{array} \right) =
\left( \begin{array}{c} \mu_x \\ \beta_0+\beta_1\mu_x \end{array} \right)
~~\mbox{ and }~~
\left( \begin{array}{c c} v_{11} & v_{12} \\ & v_{22} \end{array} \right) =
\left( \begin{array}{c c} \sigma^2_x & \beta_1\sigma^2_x+c \\
                          & \beta_1^2\sigma^2_x + 2 \beta_1c + \sigma^2_\epsilon \end{array} \right)
\end{displaymath}
\pause
} % End size
\begin{itemize}
\item $\mu_x=m_1$ and $\sigma^2_x=v_{11}$. \pause
\item The remaining 3 equations in 4 unknowns have infinitely many solutions. \pause
\item So infinitely many sets of parameter values yield the \emph{same probability distribution of the sample data}. \pause
\item So how could you decide which one is correct based on the sample data? \pause
\item The problem is fatal if all you have is this data set. \pause
\item Ultimately the solution is better data -- \emph{different} data.
\end{itemize}
\end{frame}
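% Added illustration: a concrete pair of parameter vectors that produce exactly
% the same mean vector and covariance matrix. The numbers are chosen only to
% make the arithmetic easy.
\begin{frame}
\frametitle{Two different worlds, same distribution}
\framesubtitle{A numerical illustration of the problem}
\pause
For example, let $\mu_x=0$, $\beta_0=0$ and $\sigma^2_x=1$, and compare
\begin{displaymath}
\theta_1: ~ \beta_1=1, ~ c=0, ~ \sigma^2_\epsilon=1
~~~~ \mbox{versus} ~~~~
\theta_2: ~ \beta_1=0, ~ c=1, ~ \sigma^2_\epsilon=2.
\end{displaymath}
\pause
Both yield
\begin{displaymath}
\mathbf{m} = \left( \begin{array}{c} 0 \\ 0 \end{array} \right)
~~\mbox{ and }~~
V = \left( \begin{array}{c c} 1 & 1 \\ & 2 \end{array} \right).
\end{displaymath}
\pause
\begin{itemize}
\item Under $\theta_1$, $x$ really affects $y$ and there is no confounding. \pause
\item Under $\theta_2$, $x$ has no effect at all; the association with $y$ comes entirely from the omitted variables. \pause
\item No sample size, however large, can distinguish these two worlds.
\end{itemize}
\end{frame}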
\section{Instrumental Variables}

\begin{frame}
\frametitle{Instrumental Variables (Wright, 1928)}
\framesubtitle{A partial solution}
\pause
\begin{itemize}
\item An instrumental variable is a variable that is correlated with an explanatory variable, but is not correlated with any error terms and has no direct effect on the response variable. \pause
\item Usually, the instrumental variable \emph{influences} the explanatory variable. \pause
\item An instrumental variable is often not the main focus of attention; it's just a tool.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A Simple Example}

What is the contribution of income to credit card debt?
\pause
\begin{displaymath}
y_i = \beta_0 + \beta_1 x_i + \epsilon_i,
\end{displaymath}
\pause
where $E(x_i)=\mu_x$, $Var(x_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(x_i,\epsilon_i)=c$.
\end{frame}

\begin{frame}
\frametitle{A path diagram}
\pause
\begin{displaymath}
y_i = \alpha + \beta x_i + \epsilon_i,
\end{displaymath}
where $E(x_i)=\mu$, $Var(x_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(x_i,\epsilon_i)=c$.
\pause
\begin{center}
\includegraphics[width=3in]{OmittedPath}
\end{center}
\pause
The least squares estimate of $\beta$ is inconsistent, and so is every other possible estimate. \pause If the data are normal.
\end{frame}

\begin{frame}
\frametitle{Add an instrumental variable}
\framesubtitle{$x$ is income, $y$ is credit card debt.}
\pause
Focus the study on real estate agents in many cities. \pause
Include the median price of a resale home, $w_i$. \pause
\begin{eqnarray*}
x_i & = & \alpha_1 + \beta_1w_i +\epsilon_{i1} \\ \pause
y_i & = & \alpha_2 + \beta_2x_i +\epsilon_{i2} \pause
\end{eqnarray*}
\begin{center}
\includegraphics[width=4in]{InstruVar}
\end{center}
Main interest is in $\beta_2$.
\end{frame}

\begin{frame}
\frametitle{Base estimation and inference on the covariance matrix}
\framesubtitle{of $(w_i,x_i,y_i)$: Call it $V = [v_{ij}]$}
\pause
From $x_i = \alpha_1 + \beta_1w_i +\epsilon_{i1}$ and $y_i = \alpha_2 + \beta_2x_i +\epsilon_{i2}$, where $w_i$ is independent of $\epsilon_{i1}$ and $\epsilon_{i2}$, $Var(\epsilon_{i1})=\sigma^2_1$, $Var(\epsilon_{i2})=\sigma^2_2$ and $Cov(\epsilon_{i1},\epsilon_{i2})=c$, \pause
\vspace{5mm}

{\LARGE $V =$}
\renewcommand{\arraystretch}{1.5}
\begin{tabular}{|c|ccc|}
\hline
    & $w$ & $x$ & $y$ \\ \hline
$w$ & $\sigma^2_w$ & $\beta_1\sigma^2_w$ & $\beta_1\beta_2\sigma^2_w$ \\
$x$ &  & $\beta_1^2\sigma^2_w+\sigma^2_1$ & $\beta_2(\beta_1^2\sigma^2_w+\sigma^2_1)+c$ \\
$y$ &  &  & $\beta_1^2\beta_2^2\sigma^2_w + \beta_2^2\sigma^2_1 + 2\beta_2c + \sigma^2_2$ \\
\hline
\end{tabular}
\pause
\renewcommand{\arraystretch}{1.0}
\vspace{2mm}
\begin{displaymath}
\beta_2 = \frac{v_{13}}{v_{12}}
\end{displaymath}
\pause
The remaining 5 equations in 5 unknowns have unique solutions too.
\end{frame}

\begin{frame}
\frametitle{A close look}
%\framesubtitle{}

The $v_{ij}$ are elements of the covariance matrix of the observable data.
\pause
\begin{displaymath}
\beta_2 = \frac{v_{13}}{v_{12}} \pause
        = \frac{\beta_1\beta_2\sigma^2_w}{\beta_1\sigma^2_w} \pause
        = \frac{Cov(W,Y)}{Cov(W,X)}
\end{displaymath}
\pause
\begin{itemize}
\item $\widehat{v}_{ij}$ are sample variances and covariances. \pause
\item $\widehat{v}_{ij} \stackrel{a.s.}{\rightarrow} v_{ij}$. \pause
\item It is safe to assume $\beta_1 \neq 0$\pause, because it's the connection between real estate prices and the income of real estate agents. \pause
\item $\frac{\widehat{v}_{13}}{\widehat{v}_{12}}$ is a (strongly) consistent estimate of $\beta_2$. \pause
\item $H_0: \beta_2=0$ is true if and only if $v_{13}=0$. \pause
\item Test $H_0: v_{13} = 0$ by standard methods.
\end{itemize}
\end{frame}
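% Added illustration: a minimal simulation sketch of the instrumental variable
% ratio, assuming Python with numpy (R would do just as well). The parameter
% values and seed are invented for illustration only.
\begin{frame}[fragile]
\frametitle{A simulation sketch of $\widehat{v}_{13}/\widehat{v}_{12}$}
\framesubtitle{Illustrative numbers only}

{\footnotesize
\begin{verbatim}
import numpy as np
rng = np.random.default_rng(2017)
n = 100_000
beta1, beta2 = 1.0, 0.5            # w -> x and x -> y
w = rng.normal(size=n)
# Correlated error terms: c = Cov(eps1, eps2) = 0.8
eps = rng.multivariate_normal([0, 0],
          [[1.0, 0.8], [0.8, 1.0]], size=n)
x = 1.0 + beta1*w + eps[:, 0]
y = 2.0 + beta2*x + eps[:, 1]
naive = np.cov(x, y)[0, 1] / np.var(x, ddof=1)
iv = np.cov(w, y)[0, 1] / np.cov(w, x)[0, 1]
print(naive, iv)       # about 0.9 and 0.5
\end{verbatim}
} % End size
\pause
The naive slope of $y$ on $x$ goes to $\beta_2 + \frac{c}{Var(x_i)} = 0.9$; the instrumental variable ratio goes to $\beta_2 = 0.5$.
\end{frame}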
% Long hairy 2016 section on measurement error omitted in 2017.

\begin{frame}
\frametitle{Comments}
%\framesubtitle{}
\begin{itemize}
\item Good instrumental variables are not easy to find. \pause
% BMI and popularity (study in adopted children)
% Twins raised apart.
\item They will not just happen to be in the data set, except by a miracle. \pause
\item They really have to come from another universe, but still have a strong and clear effect. \pause
\item Wright's original example was tax policy for cooking oil. \pause
\item Econometricians are good at this. \pause
\item Time series applications are common. \pause
\item Instrumental variables can help with measurement error in the explanatory variables too.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/302f17}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/302f17}}

\end{frame}

\end{document}