% Omitted variables for SEM course. Based on a slide show from 2101f16, which continues with instrumental variables, going into measurement error too.
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout

\title{Omitted Variables\footnote{See last slide for copyright information.}}
\subtitle{STA431 Winter/Spring 2017}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Omitted Variables}

\begin{frame}
\frametitle{A Practical Data Analysis Problem}

When more explanatory variables are added to a regression model \pause
and these additional explanatory variables are correlated with explanatory variables already in the model \pause
(as they usually are in an observational study), \pause
\begin{itemize}
\item Statistical significance can appear when it was not present originally. \pause
\item Statistical significance that was originally present can disappear. \pause
\item Even the signs of the $\widehat{\beta}$s can change, reversing the interpretation of how their variables are related to the response variable.
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{An extreme, artificial example}
\framesubtitle{To make a point}
\pause

Suppose that in a certain population, the correlation between age and strength is $r=-0.93$. \pause

\begin{center}
\includegraphics[width=2.8in]{Chimps}
\end{center}

\end{frame}

\begin{frame}
\frametitle{The fixed $x$ regression model}

\begin{displaymath}
Y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i, \mbox{ with } \epsilon_i \sim N(0,\sigma^2)
\end{displaymath} \pause

\vspace{5mm}

\begin{itemize}
\item If viewed as conditional on $\mathbf{X}_i = \mathbf{x}_i$, this model implies independence of $\epsilon_i$ and $\mathbf{X}_i$, \pause because the conditional distribution of $\epsilon_i$ given $\mathbf{X}_i = \mathbf{x}_i$ does not depend on $\mathbf{x}_i$. \pause
\item What is $\epsilon_i$? \emph{Everything else} that affects $Y_i$. \pause
\item So the usual model says that if the explanatory variables are random, they have \emph{zero covariance} with all other variables that are related to $Y_i$, but are not included in the model. \pause
\item For observational data (no random assignment), this assumption is almost always violated. \pause
\item Does it matter?
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{Example: $Y_i = \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \beta_3 X_{i,3} + \epsilon_i$}
\framesubtitle{As usual, the explanatory variables are random.}
\pause

Suppose that the variables $X_2$ and $X_3$ affect $Y$ and are correlated with $X_1$, but they are not part of the data set.
\pause \begin{center} \includegraphics[width=3in]{OmittedPath1} \end{center} \end{frame} \begin{frame} \frametitle{Statement of the model} \framesubtitle{The explanatory variables $X_2$ and $X_3$ affect $Y$ and are correlated with $X_1$, but they are not part of the data set.} \pause The values of the response variable are generated as follows: \pause \begin{displaymath} Y_i = \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \beta_3 X_{i,3} + \epsilon_i, \end{displaymath} independently for $i= 1, \ldots, n$, where $\epsilon_i \sim N(0,\sigma^2)$. \pause The explanatory variables are random, with expected value and variance-covariance matrix \pause \begin{displaymath} E\left( \begin{array}{c} X_{i,1} \\ X_{i,2} \\ X_{i,3} \end{array} \right) = \left( \begin{array}{c} \mu_1 \\ \mu_2 \\ \mu_3 \end{array} \right) \mbox{ ~and~ } \pause cov\left( \begin{array}{c} X_{i,1} \\ X_{i,2} \\ X_{i,3} \end{array} \right) = \left( \begin{array}{rrr} \phi_{11} & \phi_{12} & \phi_{13} \\ & \phi_{22} & \phi_{23} \\ & & \phi_{33} \end{array} \right), \end{displaymath} \pause where $\epsilon_i$ is independent of $X_{i,1}$, $X_{i,2}$ and $X_{i,3}$. \pause Values of the variables $X_{i,2}$ and $X_{i,3}$ are latent, and are not included in the data set. \end{frame} \begin{frame} \frametitle{Absorb $X_2$ and $X_3$} \begin{columns} % Use Beamer's columns to make narrower margins! \column{1.1\textwidth} Since $X_2$ and $X_3$ are not observed, they are absorbed by the intercept and error term. \pause {\small \begin{eqnarray*} Y_i &=& \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \beta_3 X_{i,3} + \epsilon_i \\ \pause &=& (\beta_0 + \beta_2\mu_2 + \beta_3\mu_3) + \beta_1 X_{i,1} + (\beta_2 X_{i,2} + \beta_3 X_{i,3} - \beta_2\mu_2 - \beta_3\mu_3 + \epsilon_i) \\ \pause &=& \beta^\prime_0 + \beta_1 X_{i,1} + \epsilon^\prime_i. \end{eqnarray*} } % End size \pause And, \begin{displaymath} Cov(X_{i,1},\epsilon^\prime_i) = \beta_2\phi_{12} + \beta_3\phi_{13} \neq 0 \end{displaymath} % Homework \end{columns} \end{frame} \begin{frame} \frametitle{The ``True" Model} \framesubtitle{Almost always closer to the truth than the usual model, for observational data} \pause {\LARGE \begin{displaymath} Y_i = \beta_0 + \beta_1 X_i + \epsilon_i, \end{displaymath} \pause } % End Size \vspace{5mm} where $E(X_i)=\mu_x$, $Var(X_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(X_i,\epsilon_i)=c$. \vspace{5mm} \pause Under this model, \begin{displaymath} \sigma_{xy} = Cov(X_i,Y_i) = Cov(X_i,\beta_0 + \beta_1 X_i + \epsilon_i) = \beta_1 \sigma^2_x + c \end{displaymath} % Homework \end{frame} \begin{frame} \frametitle{Estimate $\beta_1$ as usual with least squares} \framesubtitle{Recall $Cov(X_i,Y_i) = \sigma_{xy} = \beta_1 \sigma^2_x + c$} \pause \begin{eqnarray*} \widehat{\beta}_1 &=& \frac{\sum_{i=1}^n(X_i-\overline{X})(Y_i-\overline{Y})} {\sum_{i=1}^n(X_i-\overline{X})^2} \\ \pause &=& \frac{\frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})(Y_i-\overline{Y})} {\frac{1}{n}\sum_{i=1}^n(X_i-\overline{X})^2} \\ \pause &=& \frac{\widehat{\sigma}_{xy}}{\widehat{\sigma}^2_x} \\ \pause &\stackrel{p}{\rightarrow}& \frac{\sigma_{xy}}{\sigma^2_x}\\ \pause &=& \frac{\beta_1 \sigma^2_x + c}{\sigma^2_x} \\ \pause &=& \beta_1 + \frac{c}{\sigma^2_x} \end{eqnarray*} \end{frame} \begin{frame} \frametitle{$\widehat{\beta}_1 \stackrel{p}{\rightarrow} \beta_1 + \frac{c}{\sigma^2_x}$} \framesubtitle{It converges to the wrong thing.} \pause \begin{itemize} \item $\widehat{\beta}_1$ is inconsistent. 
\pause
\item For large samples it could be almost anything, depending on the value of $c$, the covariance between $X_i$ and $\epsilon_i$. \pause
\item Small sample estimates could be accurate, but only by chance. \pause
\item The only time $\widehat{\beta}_1$ behaves properly is when $c=0$. \pause
\item Test $H_0: \beta_1=0$ when it is true: if $c \neq 0$, the probability of making a Type I error goes to one as $n \rightarrow \infty$.
% \pause
% \item What if $\beta_1 < 0$ but $\beta_1 + \frac{c}{\sigma^2_x} > 0$, \pause
% and you test $H_0: \beta_1=0$?
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{All this applies to multiple regression}
\framesubtitle{Of course}
\pause

\emph{When a regression model fails to include all the explanatory variables that contribute to the response variable, and those omitted explanatory variables have non-zero covariance with variables that are in the model, the usual estimates of the regression coefficients are inconsistent. \pause Estimation and inference are almost guaranteed to be misleading, especially for large samples.}

\end{frame}

\begin{frame}
\frametitle{Correlation-Causation}

\begin{itemize}
\item The problem of omitted variables is a technical aspect of the correlation-causation issue. \pause
\item The omitted variables are ``confounding'' variables. \pause
\item With random assignment and good procedure, $x$ and $\epsilon$ have zero covariance. \pause
\item But random assignment is not always possible. \pause
\item Most applications of regression to observational data provide very poor information about the regression coefficients. \pause
\item Is bad information better than no information at all?
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{How about another estimation method?}
\framesubtitle{Other than ordinary least squares}
\pause

\begin{itemize}
\item Can \emph{any} other method be successful? \pause
\item This is a very practical question, \pause because almost all regressions with observational data have the disease.
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{For simplicity, assume normality}
\framesubtitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$}
\pause

\begin{itemize}
\item Assume $(X_i,\epsilon_i)$ are bivariate normal. \pause
\item This makes $(X_i,Y_i)$ bivariate normal. \pause
\item $(X_1,Y_1), \ldots, (X_n,Y_n) \stackrel{i.i.d.}{\sim} N_2(\boldsymbol{\mu},\boldsymbol{\Sigma})$, \pause where
\end{itemize}
\begin{displaymath}
\boldsymbol{\mu} = \left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right)
= \left( \begin{array}{c} \mu_x \\ \beta_0+\beta_1\mu_x \end{array} \right)
\end{displaymath} \pause
and
\begin{displaymath}
\boldsymbol{\Sigma} = \left( \begin{array}{c c} \sigma_{11} & \sigma_{12} \\ & \sigma_{22} \end{array} \right)
= \left( \begin{array}{c c} \sigma^2_x & \beta_1\sigma^2_x+c \\ & \beta_1^2\sigma^2_x + 2 \beta_1c + \sigma^2_\epsilon \end{array} \right).
\end{displaymath} \pause % Homework
\begin{itemize}
\item All you can ever learn from the data are the approximate values of $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$. \pause
\item Even if you knew $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ exactly, could you know $\beta_1$?
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{Five equations in six unknowns}

The parameter is $\theta = (\mu_x, \sigma^2_x, \sigma^2_\epsilon, c, \beta_0, \beta_1)$.
\pause The distribution of the data is determined by \pause \vspace{2mm} {\footnotesize \begin{displaymath} \left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) = \left( \begin{array}{c} \mu_x \\ \beta_0+\beta_1\mu_x \end{array} \right) ~~\mbox{ and }~~ \left( \begin{array}{c c} \sigma_{11} & \sigma_{12} \\ & \sigma_{22} \end{array} \right) = \left( \begin{array}{c c} \sigma^2_x & \beta_1\sigma^2_x+c \\ & \beta_1^2\sigma^2_x + 2 \beta_1c + \sigma^2_\epsilon \end{array} \right) \end{displaymath} \pause } % End size \begin{itemize} \item $\mu_x=\mu_1$ and $\sigma^2_x=\sigma_{11}$. \pause \item The remaining 3 equations in 4 unknowns have infinitely many solutions. \pause \item So infinitely many sets of parameter values yield the \emph{same distribution of the sample data}. \pause \item This is serious trouble -- lack of parameter identifiability. \pause \item \emph{Definition}: If a parameter is a function of the distribution of the observable data, it is said to be \emph{identifiable}. \end{itemize} \end{frame} \begin{frame} \frametitle{Showing identifiability} %\framesubtitle{} \emph{Definition}: If a parameter is a function of the distribution of the observable data, it is said to be identifiable. \begin{itemize} \item How could a parameter be a function of a distribution? \pause \item $D \sim F_\theta$ and $\theta = g(F_\theta)$ \pause \item Usually $g$ is defined in terms of moments. \pause \item Example: $F_\theta(x) = 1-e^{-\theta x}$ \pause and $f_\theta(x) = \theta e^{-\theta x}$. \pause \end{itemize} \begin{eqnarray*} f_\theta(x) & = & \frac{d}{dx} \, F_\theta(x) \\ \pause E(X) & = & \int_0^\infty x f_\theta(x) \, dx \pause = \frac{1}{\theta} \\ \pause \theta & = & \frac{1}{E(X)} \pause \end{eqnarray*} Sometimes people use moment-generating functions or characteristic functions instead of just moments. \end{frame} \begin{frame} \frametitle{Showing identifiability is like Method of Moments Estimation} \pause %\framesubtitle{} \begin{itemize} \item The distribution of the data is always a function of the parameters. \pause \item The moments are always a function of the distribution of the data. \pause \item If the parameters can be expressed as a function of the moments, \pause \begin{itemize} \item Put hats on to obtain MOM estimates, or observe that \pause \item The parameter is a function of the distribution, and is identifiable. \end{itemize} \end{itemize} \end{frame} \begin{frame} \frametitle{Back to the five equations in six unknowns} \framesubtitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$} \pause $\mathbf{D}_i = \left( \begin{array}{c} X_i \\ Y_i \end{array} \right) \sim N_2(\boldsymbol{\mu},\boldsymbol{\Sigma})$, where \pause \vspace{2mm} \begin{displaymath} \begin{array}{ccccl} \boldsymbol{\mu} & = & \left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right) & = & \left( \begin{array}{c} \mu_x \\ \beta_0+\beta_1\mu_x \end{array} \right) \\ \pause &&&& \\ \boldsymbol{\Sigma} & = & \left( \begin{array}{c c} \sigma_{11} & \sigma_{12} \\ \cdot & \sigma_{22} \end{array} \right) \pause & = & \left( \begin{array}{c c} \sigma^2_x & \beta_1\sigma^2_x+c \\ \cdot & \beta_1^2\sigma^2_x + 2 \beta_1c + \sigma^2_\epsilon \end{array} \right) \end{array} \pause \end{displaymath} \vspace{5mm} We have expressed the moments in terms of the parameters, \pause but we can't solve for $\theta = (\mu_x, \sigma^2_x, \sigma^2_\epsilon, c, \beta_0, \beta_1)$. 
\end{frame}

\begin{frame}
\frametitle{Skipping the High School algebra}
\framesubtitle{$\theta = (\mu_x, \sigma^2_x, \sigma^2_\epsilon, c, \beta_0, \beta_1)$}
\pause

\begin{itemize}
\item For \emph{any} given $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, all the points in a one-dimensional subset of the 6-dimensional parameter space yield that same $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$, and hence the same distribution of the sample data. \pause
\item In that subset, values of $\beta_1$ range from $-\infty$ to $\infty$, \pause so $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ could have been produced by \emph{any} value of $\beta_1$. \pause
\item There is no way to distinguish between the possible values of $\beta_1$ based on sample data. \pause
\item The problem is fatal, \pause if all you can observe is $X$ and $Y$. \pause
\item See text for details.
% There was originally a slide entitled "Details for the record" but it was needlessly complicated and actually wrong in one place. It is cleaner to parameterize in terms of c.
\end{itemize}

\end{frame}

\section{Instrumental Variables}

\begin{frame}
\frametitle{Instrumental Variables (Wright, 1928)}
\framesubtitle{A partial solution}
\pause

\begin{itemize}
\item An instrumental variable is a variable that is correlated with an explanatory variable, but is not correlated with any error terms and has no direct connection to the response variable. \pause
\item In Econometrics, the instrumental variable usually \emph{influences} the explanatory variable. \pause
\item An instrumental variable is often not the main focus of attention; it's just a tool.
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{A Simple Example}

What is the contribution of income to credit card debt? \pause
\begin{displaymath}
Y_i = \beta_0 + \beta_1 X_i + \epsilon_i,
\end{displaymath} \pause
where $E(X_i)=\mu_x$, $Var(X_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(X_i,\epsilon_i)=c$.

\end{frame}

\begin{frame}
\frametitle{A path diagram}
\pause

\begin{displaymath}
Y_i = \alpha + \beta X_i + \epsilon_i,
\end{displaymath}
\vspace{1mm}
where $E(X_i)=\mu$, $Var(X_i)=\sigma^2_x$, $E(\epsilon_i)=0$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $Cov(X_i,\epsilon_i)=c$. \pause
\begin{center}
\includegraphics[width=3in]{OmittedPath2}
\end{center} \pause
The least squares estimate of $\beta$ is inconsistent, and so is every other possible estimate. \pause This is strictly true if the data are normal.

\end{frame}

\begin{frame}
\frametitle{Add an instrumental variable}
\pause

Definition: An instrumental variable for an explanatory variable is another random variable that has non-zero covariance with the explanatory variable, and \emph{no direct connection with any other variable in the model.} \pause

\vspace{5mm}
% The standard notation for instrumental variables is Z. I'm fixing this in the text, and should do it here too. Use graphics files from the text.

Focus the study on real estate agents in many cities. \pause Include median price of resale home. \pause
\begin{itemize}
\item $X$ is income.
\item $Y$ is credit card debt.
\item $W$ is median price of resale home.
\end{itemize} \pause
\begin{eqnarray*}
X_i & = & \alpha_1 + \beta_1W_i +\epsilon_{i1} \\ \pause
Y_i & = & \alpha_2 + \beta_2X_i +\epsilon_{i2}
\end{eqnarray*}

\end{frame}

\begin{frame}
\frametitle{Picture}
\framesubtitle{$W_i$ is median price of resale home, $X_i$ is income, $Y_i$ is credit card debt.}
\pause

\begin{eqnarray*}
X_i & = & \alpha_1 + \beta_1W_i +\epsilon_{i1} \\ \pause
Y_i & = & \alpha_2 + \beta_2X_i +\epsilon_{i2} \pause
\end{eqnarray*}
\begin{center}
\includegraphics[width=4in]{InstruVar}
\end{center} \pause
Main interest is in $\beta_2$.

\end{frame}

\begin{frame}
\frametitle{Calculate the covariance matrix} % Homework
\framesubtitle{of the observable data $(W_i,X_i,Y_i)$: Call it $\boldsymbol{\Sigma} = [\sigma_{ij}]$}
\pause

From $X_i = \alpha_1 + \beta_1W_i +\epsilon_{i1}$ and $Y_i = \alpha_2 + \beta_2X_i +\epsilon_{i2}$, with $Var(W_i)=\sigma^2_w$, $Var(\epsilon_{ij})=\sigma^2_j$ and $Cov(\epsilon_{i1},\epsilon_{i2})=c$, \pause

\vspace{5mm}

{\LARGE $\boldsymbol{\Sigma} =$}
\renewcommand{\arraystretch}{1.5}
\begin{tabular}{|c|ccc|} \hline
 & $W$ & $X$ & $Y$ \\ \hline
$W$ & $\sigma^2_w$ & $\beta_1\sigma^2_w$ & $\beta_1\beta_2\sigma^2_w$ \\
$X$ & $\cdot$ & $\beta_1^2\sigma^2_w+\sigma^2_1$ & $\beta_2(\beta_1^2\sigma^2_w+\sigma^2_1)+c$ \\
$Y$ & $\cdot$ & $\cdot$ & $\beta_1^2\beta_2^2\sigma^2_w + \beta_2^2\sigma^2_1 + 2\beta_2c + \sigma^2_2$ \\ \hline
\end{tabular} \pause
\renewcommand{\arraystretch}{1.0}

\vspace{5mm}

\begin{displaymath}
\beta_2 = \frac{\sigma_{13}}{\sigma_{12}}
\end{displaymath}

\end{frame}

% STA2101f16 continues from here, starting with
% "But of course there is measurement error."

\begin{frame}
\frametitle{Parameter estimation}
\framesubtitle{$X_i = \alpha_1 + \beta_1W_i +\epsilon_{i1}$ and $Y_i = \alpha_2 + \beta_2X_i +\epsilon_{i2}$}

$\boldsymbol{\Sigma} =$
{\footnotesize
\renewcommand{\arraystretch}{1.5}
\begin{tabular}{|c|ccc|} \hline
 & $W$ & $X$ & $Y$ \\ \hline
$W$ & $\sigma^2_w$ & $\beta_1\sigma^2_w$ & $\beta_1\beta_2\sigma^2_w$ \\
$X$ & $\cdot$ & $\beta_1^2\sigma^2_w+\sigma^2_1$ & $\beta_2(\beta_1^2\sigma^2_w+\sigma^2_1)+c$ \\
$Y$ & $\cdot$ & $\cdot$ & $\beta_1^2\beta_2^2\sigma^2_w + \beta_2^2\sigma^2_1 + 2\beta_2c + \sigma^2_2$ \\ \hline
\end{tabular} \pause
\renewcommand{\arraystretch}{1.0}
} % End size
$ \beta_2 = \frac{\sigma_{13}}{\sigma_{12}}$. \pause
% \vspace{2mm}

\begin{itemize}
\item All the other parameters are identifiable too. \pause % Homework
\item The instrumental variable saved us. \pause
\item There are 9 model parameters, and 9 moments in $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$. \pause
\item The invariance principle yields explicit formulas for the MLEs. \pause
\item If the data are normal, MLEs equal the Method of Moments estimates because they are both 1-1 with the moments. % Homework
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{More Comments}
%\framesubtitle{}

\begin{itemize}
\item Of course there is measurement error. \pause
\item Instrumental variables help with measurement error as well as with omitted variables. \pause
\item More later. \pause
\item Good instrumental variables are not easy to find. \pause
\item They will not just happen to be in the data set, except by a miracle. \pause
\item They really have to come from another universe, \pause but still have a strong and clear connection to the explanatory variable. \pause
\item Data collection has to be \emph{planned}. \pause
\item Wright's original example was tax policy for cooking oil. \pause
\item Time series applications are common, but not in this course.
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s17}
{\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/431s17}}

\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

sem = 'http://www.utstat.toronto.edu/~brunner/openSEM/sage/sem.sage'
load(sem)
# In EqsVar, eta = beta eta + gamma xi, with cov(xi) = Phi

######## First IV model, with no latent variables except errors ########
# xi' = (W,epsilon1,epsilon2) and eta' = (X,Y)
B = ZeroMatrix(2,2) # beta
B[1,0] = var('beta2') ; B
P = ZeroMatrix(3,3) # Phi
P[0,0] = var('sigma00'); P[1,1] = var('sigma11'); P[2,2] = var('sigma22')
P[1,2] = var('c'); P[2,1] = var('c'); P
G = ZeroMatrix(2,3) # gamma
G[0,0] = var('beta1'); G[0,1] = 1; G[1,2] = 1; G
pickout = 3,1,2 # Indices of observable variables, order eta, xi
V = EqsVar(B,G,P,pickout); V
print(latex(V)) # Agrees with hand calculations

######## Second IV model, with true income ########
sem = 'http://www.utstat.toronto.edu/~brunner/openSEM/sage/sem.sage'
load(sem)
# In EqsVar, eta = beta eta + gamma xi, with cov(xi) = Phi
# eta' = (T,X,Y) and xi' = (W,epsilon1,epsilon2,epsilon3)
B = ZeroMatrix(3,3) # beta
B[1,0] = var('beta2') ; B[2,0] = var('beta3') ; B
P = ZeroMatrix(4,4) # Phi
P[0,0] = var('sigma00'); P[1,1] = var('sigma11')
P[2,2] = var('sigma22'); P[3,3] = var('sigma33'); P[2,3] = var('c'); P[3,2] = var('c'); P
G = ZeroMatrix(3,4) # gamma
G[0,0] = var('beta1'); G[0,1] = 1; G[1,2] = 1; G[2,3] = 1; G
pickout = 4,2,3 # Indices of observable variables, order eta, xi
V = EqsVar(B,G,P,pickout); V
print(latex(V))
# Search and replace notation
# Play with nasty
P = SymmetricMatrix(4,'sigma'); P[0,0]=var('sigma00')
P[0,1]=0; P[0,2]=0; P[0,3]=0
P[1,0]=0; P[2,0]=0; P[3,0]=0; P
V = EqsVar(B,G,P,pickout); V

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\left(\begin{array}{rrr}
\sigma_{00} & \beta_{1} \beta_{2} \sigma_{00} & \beta_{1} \beta_{3} \sigma_{00} \\
\beta_{1} \beta_{2} \sigma_{00} & \beta_{1}^{2} \beta_{2}^{2} \sigma_{00} + \beta_{2}^{2} \sigma_{11} + \sigma_{22} & \beta_{1}^{2} \beta_{2} \beta_{3} \sigma_{00} + \beta_{2} \beta_{3} \sigma_{11} + c \\
\beta_{1} \beta_{3} \sigma_{00} & \beta_{1}^{2} \beta_{2} \beta_{3} \sigma_{00} + \beta_{2} \beta_{3} \sigma_{11} + c & \beta_{1}^{2} \beta_{3}^{2} \sigma_{00} + \beta_{3}^{2} \sigma_{11} + \sigma_{33}
\end{array}\right)
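######## Editor's numerical check of the first IV model (Python/NumPy sketch) ########

The block below is an editor-added simulation sketch, not part of the original Sage workspace. It checks the first IV model numerically: with $W$ independent of the correlated errors, the ratio of sample covariances $\widehat{\sigma}_{13}/\widehat{\sigma}_{12}$ recovers $\beta_2$, while the ordinary least squares slope of $Y$ on $X$ does not. All variable names and parameter values are illustrative assumptions.

import numpy as np

# Illustrative (made-up) parameter values for the model
# X = alpha1 + beta1*W + epsilon1,  Y = alpha2 + beta2*X + epsilon2,
# with Cov(epsilon1, epsilon2) = c and W independent of both errors.
rng = np.random.default_rng(431)
n = 100_000
alpha1, beta1, alpha2, beta2 = 1.0, 0.5, 2.0, 1.5
c = 3.0                                   # omitted-variable covariance between the errors

w = rng.normal(50.0, 10.0, n)             # instrumental variable, e.g. median resale price
errors = rng.multivariate_normal([0.0, 0.0], [[4.0, c], [c, 9.0]], n)
e1, e2 = errors[:, 0], errors[:, 1]

x = alpha1 + beta1 * w + e1               # income
y = alpha2 + beta2 * x + e2               # credit card debt

S = np.cov(np.vstack([w, x, y]))          # sample covariance matrix of (W, X, Y)
print("IV ratio  sigma13-hat / sigma12-hat:", S[0, 2] / S[0, 1])   # close to beta2 = 1.5
print("OLS slope sigma23-hat / sigma22-hat:", S[1, 2] / S[1, 1])   # off by roughly c / Var(X)

With these values the IV ratio settles near 1.5, while the least squares slope is pulled up by about $c/Var(X)$, matching the covariance matrix on the slides.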
######## Third IV model, with true income and debt, correlated errors ########
sem = 'http://www.utstat.toronto.edu/~brunner/openSEM/sage/sem.sage'
load(sem)
# In EqsVar, eta = beta eta + gamma xi, with cov(xi) = Phi
# eta' = (Tx,Ty,X,Y) and xi' = (epsilon1,epsilon2,epsilon3,epsilon4,W)
B = ZeroMatrix(4,4) # beta
B[1,0] = var('beta3') ; B[2,0] = var('beta2') ; B[3,1] = var('beta4'); B
P = SymmetricMatrix(5,'sigma'); P[4,4]=var('sigma00')
# No correlations between W and the errors
for j in interval(0,3):
    P[j,4] = 0
    P[4,j] = 0
P
G = ZeroMatrix(4,5) # gamma
G[0,0] = 1; G[0,4] = var('beta1'); G[1,1] = 1; G[2,3] = 1; G[3,2] = 1; G
pickout = 9,3,4 # Indices of observable variables, order eta, xi
V = EqsVar(B,G,P,pickout); V
print(latex(V))
# Search and replace sigma00

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\left(\begin{array}{ccc}
\sigma^2_w & \beta_{1} \beta_{2} \sigma^2_w & \beta_{1} \beta_{3} \beta_{4} \sigma^2_w \\
\beta_{1} \beta_{2} \sigma^2_w & \beta_{1}^{2} \beta_{2}^{2} \sigma^2_w + \beta_{2}^{2} \sigma_{11} + 2 \, \beta_{2} \sigma_{14} + \sigma_{44} & \beta_{1}^{2} \beta_{2} \beta_{3} \beta_{4} \sigma^2_w + \beta_{2} \beta_{3} \beta_{4} \sigma_{11} + \beta_{2} \beta_{4} \sigma_{12} + \beta_{3} \beta_{4} \sigma_{14} + \beta_{2} \sigma_{13} + \beta_{4} \sigma_{24} + \sigma_{34} \\
\beta_{1} \beta_{3} \beta_{4} \sigma^2_w & \beta_{1}^{2} \beta_{2} \beta_{3} \beta_{4} \sigma^2_w + \beta_{2} \beta_{3} \beta_{4} \sigma_{11} + \beta_{2} \beta_{4} \sigma_{12} + \beta_{3} \beta_{4} \sigma_{14} + \beta_{2} \sigma_{13} + \beta_{4} \sigma_{24} + \sigma_{34} & \beta_{1}^{2} \beta_{3}^{2} \beta_{4}^{2} \sigma^2_w + \beta_{3}^{2} \beta_{4}^{2} \sigma_{11} + 2 \, \beta_{3} \beta_{4}^{2} \sigma_{12} + 2 \, \beta_{3} \beta_{4} \sigma_{13} + \beta_{4}^{2} \sigma_{22} + 2 \, \beta_{4} \sigma_{23} + \sigma_{33}
\end{array}\right)
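######## Editor's simulation sketch of the omitted-variables result ########

A second editor-added sketch, again in Python/NumPy with made-up parameter values, illustrating the main result of the omitted-variables section: when $Cov(X_i,\epsilon_i)=c \neq 0$ in $Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$, the least squares slope converges to $\beta_1 + c/\sigma^2_x$ rather than to $\beta_1$.

import numpy as np

# Illustrative (made-up) parameter values; c is the covariance between X and epsilon
# induced by the omitted explanatory variables.
rng = np.random.default_rng(2017)
n = 100_000
beta0, beta1 = 1.0, 2.0
sigma2_x, sigma2_eps, c = 4.0, 1.0, -1.5

cov = [[sigma2_x, c], [c, sigma2_eps]]      # joint covariance matrix of (X, epsilon)
x, eps = rng.multivariate_normal([0.0, 0.0], cov, n).T
y = beta0 + beta1 * x + eps

b1_hat = np.cov(x, y)[0, 1] / np.var(x, ddof=1)   # sample sigma_xy / sample sigma^2_x
print("least squares slope:   ", b1_hat)
print("target of convergence: ", beta1 + c / sigma2_x)   # 2.0 - 1.5/4.0 = 1.625

Setting c = 0 in this sketch gives back the usual consistency of the least squares slope.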