% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{AnnArbor} % CambridgeUS
% I'm using this one (yellow) just to be different from Dehan.
\usepackage[english]{babel}
\usepackage{graphpap}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Introduction to Regression with Measurement Error\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2023}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Measurement Error}

\begin{frame}
\frametitle{Measurement Error}
\begin{itemize}
\item Snack food consumption
\item Exercise
\item Income
\item Cause of death (classification error)
\pause
\item Even the amount of drug that reaches an animal's bloodstream in an experimental study.
\item Is there anything that is \emph{not} measured with error?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Additive measurement error}
\framesubtitle{A very simple model}
{\LARGE
\begin{displaymath}
W = X + e
\end{displaymath}
} % End size
where $E(X)=\mu_x$, $E(e)=0$, $Var(X)=\sigma^2_x$, $Var(e)=\sigma^2_e$, and $Cov(X,e)=0$.
\pause
\begin{center}
\includegraphics[width=1.5in]{Additive}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Variance and Covariance}
\framesubtitle{$W = X + e$}
\begin{eqnarray*}
Var(W) &=& Var(X) + Var(e) \\
&=& \sigma^2_x + \sigma^2_e \\
&& \\ \pause
Cov(X,W) &=& Cov(X, \, X+e) \\
&=& Cov(X,X) + Cov(X,e) \\
&=& \sigma^2_x
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Explained Variance}
\begin{itemize}
\item Variance is an index of unit-to-unit variation in a measurement.
\item Explaining unit-to-unit variation is an important goal of Science.
\pause
\item How much of the variation in an observed variable comes from variation in the latent quantity of interest, and how much comes from random noise?
\end{itemize}
\end{frame}

\section{Reliability}

\begin{frame}
\frametitle{Definition of Reliability}
Reliability is the squared correlation between the observed variable and the latent variable (true score).
\end{frame}

\begin{frame}
\frametitle{Calculation of Reliability}
\framesubtitle{Squared correlation between observed and true score}
\begin{eqnarray*}
\rho^2 \pause &=& \left(\frac{Cov(X,W)}{SD(X)\,SD(W)}\right)^2 \pause \\
&=& \left(\frac{\sigma^2_x}{\sqrt{\sigma^2_x} \sqrt{\sigma^2_x+\sigma^2_e}}\right)^2 \pause \\
&=& \frac{\sigma^4_x}{\sigma^2_x (\sigma^2_x+\sigma^2_e)} \pause \\
&=& \frac{\sigma^2_x}{\sigma^2_x+\sigma^2_e}. \pause
\end{eqnarray*}
% Have a phi-omega version in OpenSEM work.
Reliability is the proportion of the variance in the observed variable that comes from the latent variable of interest, and not from random error.
\end{frame}

\begin{frame}
\frametitle{How to estimate reliability from data}
\begin{itemize}
\item Correlate the usual measurement with a ``Gold Standard''?
\item Not very realistic, except maybe for some biomarkers.
\pause
\item One answer: Measure twice.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Measure twice}
\framesubtitle{Called ``equivalent measurements'' because {\color{red}error variance is the same}}
\begin{eqnarray}
W_1 & = & X + e_1 \nonumber \\
W_2 & = & X + e_2, \nonumber
\end{eqnarray}
where $E(X)=\mu_x$, $Var(X)=\sigma^2_x$, $E(e_1)=E(e_2)=0$, {\color{red}$Var(e_1)=Var(e_2)=\sigma^2_e$}, and $X$, $e_1$ and $e_2$ are all independent.
\pause
\vspace{3mm}
\begin{center}
% Path diagram: Had to fiddle with this!
\begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin
\put(197,000){$X$}
\put(202,4){\circle{20}}
\put(157,50){\framebox{$W_1$}}
\put(232,50){\framebox{$W_2$}}
\put(197,15){\vector(-1,1){25}} % X -> W1
\put(209,15){\vector(1,1){25}} % X -> W2
\put(161,95){$e_1$} % x = V2+4
\put(165,90){\vector(0,-1){25}} % e1 -> W1
\put(236,95){$e_2$} % x = V3+4
\put(240,90){\vector(0,-1){25}} % e2 -> W2
\end{picture}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Reliability equals the correlation between two equivalent measurements}
\pause
\framesubtitle{This is a population correlation}
{\footnotesize
\begin{eqnarray*}
Corr(W_1,W_2) & = & \frac{Cov(W_1,W_2)}{SD(W_1)SD(W_2)} \\ \pause
& & \\
& = & \frac{Cov(X+e_1, \, X+e_2)}{\sigma^2_x+\sigma^2_e} \\ \pause
& & \\
& = & \frac{Cov(X,X)+0+0+0}{\sigma^2_x+\sigma^2_e} \\ \pause
& & \\
& = & \frac{\sigma^2_x}{\sigma^2_x+\sigma^2_e},
\end{eqnarray*}
which is the reliability.
} % End size
\end{frame}

\begin{frame}
\frametitle{Estimate the reliability: Measure twice for a sample of size $n$}
\framesubtitle{With a well-chosen time gap}
Calculate $r = \frac{\sum_{i=1}^n (W_{i1}-\overline{W}_1)(W_{i2}-\overline{W}_2)}
{\sqrt{\sum_{i=1}^n (W_{i1}-\overline{W}_1)^2} \sqrt{\sum_{i=1}^n (W_{i2}-\overline{W}_2)^2}}$.
\pause
\vspace{5mm}
% HW prove consistent
\begin{itemize}
\item Test-retest reliability
\item Alternate forms reliability
\item Split-half reliability
\end{itemize}
\end{frame}
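\begin{frame}[fragile]
\frametitle{Checking the test-retest idea by simulation}
\framesubtitle{A minimal sketch in Python/numpy, not part of the original derivation; all parameter values are illustrative}
With equivalent measurements, the sample correlation $r$ should land near the true reliability $\sigma^2_x/(\sigma^2_x+\sigma^2_e)$.
{\footnotesize
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(431)              # arbitrary seed
n, sigma2_x, sigma2_e = 10_000, 4.0, 1.0      # true reliability 4/5 = 0.8
X  = rng.normal(10, np.sqrt(sigma2_x), n)     # latent true scores
W1 = X + rng.normal(0, np.sqrt(sigma2_e), n)  # two equivalent
W2 = X + rng.normal(0, np.sqrt(sigma2_e), n)  # measurements
print(np.corrcoef(W1, W2)[0, 1])              # should be close to 0.8
\end{verbatim}
} % End size
\end{frame}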
\begin{frame}
\frametitle{Omitted variables can cause correlated measurement error}
\begin{center}
% Path diagram: Had to fiddle with this!
\begin{picture}(100,100)(150,20) % Size of picture (does not matter), origin
% \graphpaper(150,20)(120,120) % (x,y) of Lower left, (width,height)
\put(197,000){$X$}
\put(202,4){\circle{20}}
\put(157,50){\framebox{$W_1$}}
\put(232,50){\framebox{$W_2$}}
\put(197,15){\vector(-1,1){25}} % X -> W1
\put(209,15){\vector(1,1){25}} % X -> W2
\put(161,95){$e_1$} % x = V2+4
\put(165,90){\vector(0,-1){25}} % e1 -> W1
\put(236,95){$e_2$} % x = V3+4
\put(240,90){\vector(0,-1){25}} % e2 -> W2
% Add correlated measurement error
% Lining up the ends of the oval with the e1 and e2 arrows
\put(202.5,105){\oval(75,60)[t]} % (x,y) location, (width,height) [top]
% Put arrow heads on the oval
\put(165,110){\vector(0,-1){5}}
\put(240,110){\vector(0,-1){5}}
\end{picture}
\pause
\end{center}
\vspace{10mm}
This leads to an over-estimate of reliability, as the simulation sketch on the next slide suggests. % HW: Show it
\end{frame}
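\begin{frame}[fragile]
\frametitle{Simulation sketch: correlated errors inflate $r$}
\framesubtitle{Python/numpy again, not part of the original slides; the omitted variable $Z$ and all parameter values are made up}
Here $Z$ feeds both measurement errors, so $Cov(e_1,e_2) > 0$ and the test-retest correlation over-estimates the true reliability of $0.8$.
{\footnotesize
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(431)
n = 10_000
X = rng.normal(0, 2, n)                       # Var(X) = 4
Z = rng.normal(0, 1, n)                       # omitted common cause
e1 = 0.7*Z + rng.normal(0, np.sqrt(0.51), n)  # Var(e1) = 1
e2 = 0.7*Z + rng.normal(0, np.sqrt(0.51), n)  # Var(e2) = 1
W1, W2 = X + e1, X + e2
# Cov(e1,e2) = 0.49, so r targets (4+0.49)/5 = 0.898, not 0.8.
print(np.corrcoef(W1, W2)[0, 1])
\end{verbatim}
} % End size
\end{frame}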
\section{Consequences of Ignoring Measurement Error}

\begin{frame}
\frametitle{Measurement error in regression analysis}
\begin{itemize}
\item Mostly we are interested in relationships between latent (true) variables.
\item But at best, all we have are the true variables measured with error.
\item Models like $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_k X_{ik} + \epsilon_i$ are mis-specified.
\pause
\item The most common way of dealing with measurement error in regression is to ignore it.
\item What effect does this have on estimation and inference?
\pause
\item First consider ignoring measurement error just in the response variable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Measurement error in the response variable}
\begin{center}
\begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin
\put(197,000){$Y$}
\put(202,4){\circle{20}}
\put(157,50){\framebox{$X$}}
\put(182,30){{\footnotesize $\beta_1$}} % Label the arrow X -> Y
\put(235,50){\framebox{$V$}}
\put(167,42){\vector(1,-1){25}} % X -> Y
\put(212,17){\vector(1,1){25}} % Y -> V
\put(240,95){$e$}
\put(243,90){\vector(0,-1){25}} % e -> V
\put(244,01){$\epsilon$}
\put(242,03){\vector(-1,0){25}} % epsilon -> Y
\end{picture}
\pause
\end{center}
True model:
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\
V_i &=& \nu + Y_i + e_i
\end{eqnarray*}
Naive model: $V_i = \beta_0 + \beta_1 X_i + \epsilon_i$
\end{frame}

\begin{frame}
\frametitle{Is $\widehat{\beta}_1$ consistent?}
\framesubtitle{Ignoring measurement error in $Y$}
First calculate $Cov(X_i,V_i)$. Under the true model
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\
V_i &=& \nu + Y_i + e_i,
\end{eqnarray*}
\pause
\begin{eqnarray*}
Cov(X_i,V_i) &=& Cov(X_i, \, \nu + \beta_0 + \beta_1 X_i + \epsilon_i + e_i) \\ \pause
&=& \beta_1 Cov(X_i,X_i) \\ \pause
&=& \beta_1 \sigma^2_x
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Target of $\widehat{\beta}_1$ as $n \rightarrow \infty$}
\framesubtitle{Have $Cov(X_i,V_i) = \beta_1 \sigma^2_x$ and $Var(X_i) = \sigma^2_x$}
\pause
\begin{eqnarray*}
\widehat{\beta}_1 &=& \frac{\sum_{i=1}^n(X_i-\overline{X})(V_i-\overline{V})}
{\sum_{i=1}^n(X_i-\overline{X})^2} \\ \pause
&=& \frac{\widehat{\sigma}_{x,v}}{\widehat{\sigma}^2_x} \\ \pause
&\stackrel{p}{\rightarrow}& \frac{Cov(X_i,V_i)}{Var(X_i)} \\ \pause
&=& \frac{\beta_1 \sigma^2_x}{\sigma^2_x} \\ \pause
&=& \beta_1
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Why did it work?}
\pause
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\
V_i &=& \nu + Y_i + e_i \\ \pause
&=& \nu + (\beta_0 + \beta_1 X_i + \epsilon_i) + e_i \\ \pause
&=& (\nu + \beta_0) + \beta_1 X_i + (\epsilon_i + e_i) \\ \pause
&=& \beta_0^\prime + \beta_1 X_i + \epsilon_i^\prime \pause
\end{eqnarray*}
\begin{itemize}
\item This is a re-parameterization. \pause
\item Most definitely \emph{not} one-to-one. \pause
\item $(\nu,\beta_0)$ is absorbed into $\beta_0^\prime$.
\item $(\epsilon_i, e_i)$ is absorbed into $\epsilon_i^\prime$. \pause
\item We can't know everything, but all we care about is $\beta_1$ anyway.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Don't Worry}
\pause
{\Large
\begin{itemize}
\item If a response variable appears to have no measurement error, assume it does have measurement error, but that the problem has been re-parameterized.
\pause
\item Measurement error in $Y$ is part of $\epsilon$. (See the simulation sketch on the next slide.)
\end{itemize}
} % End size
\end{frame}
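\begin{frame}[fragile]
\frametitle{Simulation sketch: error in $Y$ only is harmless}
\framesubtitle{A Python/numpy sketch with illustrative parameter values; not part of the original slides}
Regressing the noisy response $V$ on $X$ still targets $\beta_1$.
{\footnotesize
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(431)
n, beta0, beta1, nu = 100_000, 1.0, 1.0, 0.0
X = rng.normal(0, 1, n)
Y = beta0 + beta1*X + rng.normal(0, 1, n)
V = nu + Y + rng.normal(0, 1, n)             # V = nu + Y + e
b1 = np.cov(X, V)[0, 1] / np.var(X, ddof=1)  # naive slope of V on X
print(b1)                                    # close to beta1 = 1
\end{verbatim}
} % End size
\end{frame}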
\begin{frame}
\frametitle{Measurement error in a single explanatory variable}
\begin{center}
% Path diagram: Had to fiddle with this!
\begin{picture}(100,100)(150,0) % Size of picture (does not matter), origin
\put(197,000){$X$}
\put(202,4){\circle{20}}
\put(210,30){{\footnotesize $\beta_1$}} % Label the arrow X -> Y
\put(157,50){\framebox{$W$}}
\put(232,50){\framebox{$Y$}}
\put(197,15){\vector(-1,1){25}} % X -> W
\put(209,15){\vector(1,1){25}} % X -> Y
\put(161,95){$e$}
\put(165,90){\vector(0,-1){25}} % e -> W
\put(236,95){$\epsilon$}
\put(240,90){\vector(0,-1){25}} % epsilon -> Y
\end{picture}
\end{center}
True model:
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_i + \epsilon_i \\
W_i &=& X_i + e_i
\end{eqnarray*}
Naive model: $Y_i = \beta_0 + \beta_1 W_i + \epsilon_i$
\end{frame}

\begin{frame}
\frametitle{Target of $\widehat{\beta}_1$ as $n \rightarrow \infty$}
\framesubtitle{$Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$ and $W_i = X_i + e_i$}
Have $Cov(W_i,Y_i) = \beta_1 \sigma^2_x$ and $Var(W_i) = \sigma^2_x+\sigma^2_e$.
\pause
\begin{eqnarray*}
\widehat{\beta}_1 &=& \frac{\sum_{i=1}^n(W_i-\overline{W})(Y_i-\overline{Y})}
{\sum_{i=1}^n(W_i-\overline{W})^2} \\ \pause
&=& \frac{\widehat{\sigma}_{w,y}}{\widehat{\sigma}^2_w} \\ \pause
&\stackrel{p}{\rightarrow}& \frac{Cov(W_i,Y_i)}{Var(W_i)} \\ \pause
&=& \beta_1 \left(\frac{\sigma^2_x}{\sigma^2_x+\sigma^2_e}\right)
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{$\widehat{\beta}_1 \stackrel{p}{\rightarrow} \beta_1 \left(\frac{\sigma^2_x}{\sigma^2_x+\sigma^2_e}\right)$}
\framesubtitle{$W_i = X_i + e_i$}
\begin{itemize}
\item $\widehat{\beta}_1$ converges to $\beta_1$ times the reliability of $W_i$.
\item It's inconsistent. \pause
\item Because the reliability is less than one, it's asymptotically biased toward zero.
\item The worse the measurement of $X_i$, the more the asymptotic bias. \pause
\item Sometimes called ``attenuation'' (weakening); see the simulation sketch on the next slide.
\item If a good estimate of reliability is available from another source, one can ``correct for attenuation.'' \pause
\item When $H_0:\beta_1=0$ is true, it's not a serious problem. \pause
\item False sense of security?
\end{itemize}
\end{frame}
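\begin{frame}[fragile]
\frametitle{Simulation sketch: attenuation}
\framesubtitle{Python/numpy with illustrative parameter values; not from the original slides}
With reliability $4/(4+1)=0.8$, the naive slope should land near $\beta_1 \times 0.8$, not near $\beta_1$.
{\footnotesize
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(431)
n, beta1 = 100_000, 1.0
X = rng.normal(0, 2, n)                      # Var(X) = 4
Y = beta1*X + rng.normal(0, 1, n)
W = X + rng.normal(0, 1, n)                  # Var(e) = 1: reliability 0.8
b1 = np.cov(W, Y)[0, 1] / np.var(W, ddof=1)  # naive slope of Y on W
print(b1)                                    # close to 0.8, not 1
\end{verbatim}
} % End size
\end{frame}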
\begin{frame}
\frametitle{Measurement error in two explanatory variables}
\begin{center}
\includegraphics[width=3in]{MeReg2Path}
\end{center}
\pause
We want to assess the relationship of $X_2$ to $Y$, \emph{controlling} for $X_1$, by testing $H_0:\beta_2=0$.
\end{frame}

\begin{frame}
\frametitle{Statement of the model}
\framesubtitle{Independently for $i=1, \ldots,n$}
\begin{eqnarray}
Y_i &=& \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \epsilon_i \nonumber \\
W_{i,1} & = & X_{i,1} + e_{i,1} \nonumber \\
W_{i,2} & = & X_{i,2} + e_{i,2}, \pause \nonumber
\end{eqnarray}
{\footnotesize
where
\begin{itemize}
\item[] $E(X_{i,1})=\mu_1$, $E(X_{i,2})=\mu_2$, $E(\epsilon_i) = E(e_{i,1}) = E(e_{i,2}) = 0$,
\item[] $Var(\epsilon_i)=\psi$, $Var(e_{i,1})=\omega_1$, $Var(e_{i,2})=\omega_2$,
\item[] the errors $\epsilon_i$, $e_{i,1}$ and $e_{i,2}$ are all independent,
\item[] $X_{i,1}$ and $X_{i,2}$ are independent of $\epsilon_i$, $e_{i,1}$ and $e_{i,2}$, and
\end{itemize}
\begin{displaymath}
cov\left( \begin{array}{c} X_{i,1} \\ X_{i,2} \end{array} \right) =
\left( \begin{array}{c c} \phi_{11} & \phi_{12} \\
\phi_{12} & \phi_{22} \end{array} \right).
\end{displaymath}
\pause
} % End size
Note
\begin{itemize}
\item Reliability of $W_1$ is $\frac{\phi_{11}}{\phi_{11}+\omega_1}$.
\item Reliability of $W_2$ is $\frac{\phi_{22}}{\phi_{22}+\omega_2}$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{True Model versus Naive Model}
\pause
True model:
\begin{eqnarray}
Y_i &=& \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \epsilon_i \nonumber \\
W_{i,1} & = & X_{i,1} + e_{i,1} \nonumber \\
W_{i,2} & = & X_{i,2} + e_{i,2} \nonumber
\end{eqnarray}
Naive model: $Y_i = \beta_0 + \beta_1 W_{i,1} + \beta_2 W_{i,2} + \epsilon_i$
\pause
\vspace{3mm}
\begin{itemize}
\item Fit the naive model.
\item See what happens to $\widehat{\beta}_2$ as $n \rightarrow \infty$ when the true model holds.
\pause
\item Start by calculating $cov(\mathbf{d}_i) = cov \left( \begin{array}{c} W_{i,1} \\ W_{i,2} \\ Y_i \end{array} \right)$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Covariance matrix of the observable data}
\pause
{\footnotesize
\begin{eqnarray*}
\boldsymbol{\Sigma} &=& cov\left(\begin{array}{c} W_{i,1} \\ W_{i,2} \\ Y_i \end{array}\right) \\ \pause
&& \\
&=& \left(\begin{array}{rrr}
\omega_{1} + \phi_{11} & \phi_{12} & \beta_{1} \phi_{11} + \beta_{2} \phi_{12} \\
\phi_{12} & \omega_{2} + \phi_{22} & \beta_{1} \phi_{12} + \beta_{2} \phi_{22} \\
\beta_{1} \phi_{11} + \beta_{2} \phi_{12} & \beta_{1} \phi_{12} + \beta_{2} \phi_{22} &
\beta_{1}^{2} \phi_{11} + 2 \, \beta_{1} \beta_{2} \phi_{12} + \beta_{2}^{2} \phi_{22} + \psi
\end{array}\right) % Pasted in from Sage!
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{What happens to $\widehat{\beta}_2$ as $n \rightarrow \infty$?}
\framesubtitle{Interested in $H_0:\beta_2=0$}
{\footnotesize
\begin{eqnarray*}
\widehat{\beta}_2 &=& \frac{\widehat{\sigma}_{11}\widehat{\sigma}_{23} - \widehat{\sigma}_{12}\widehat{\sigma}_{13}}
{\widehat{\sigma}_{11}\widehat{\sigma}_{22} - \widehat{\sigma}_{12}^2} \\ \pause
&\stackrel{p}{\rightarrow}& \frac{\sigma_{11}\sigma_{23} - \sigma_{12}\sigma_{13}}
{\sigma_{11}\sigma_{22} - \sigma_{12}^2} \\ \pause
& = & \frac{\beta_{1} \omega_{1} \phi_{12} + \beta_{2} \left(\omega_{1}\phi_{22} + \phi_{11} \phi_{22} - \phi_{12}^{2}\right)}
{(\phi_{11} + \omega_1)(\phi_{22} + \omega_2) - \phi_{12}^{2}} \\ \pause
& \neq & \beta_2
\end{eqnarray*}
} % End size
Inconsistent.
\end{frame}

\begin{frame}
\frametitle{When $H_0:\beta_2=0$ is true}
\pause
{\LARGE
\begin{displaymath}
\widehat{\beta}_2 \stackrel{p}{\rightarrow}
\frac{\beta_{1} \omega_{1} \phi_{12}}
{(\phi_{11} + \omega_1)(\phi_{22} + \omega_2) - \phi_{12}^{2}}
\end{displaymath}
\pause
} % End size
So $\widehat{\beta}_2$ goes to the wrong target unless
\begin{itemize}
\item there is no relationship between $X_1$ and $Y$, or
\item there is no measurement error in $W_1$, or
\item there is no correlation between $X_1$ and $X_2$.
\pause
\end{itemize}
\vspace{2mm}
Also, the $t$ statistic for $H_0: \beta_2=0$ goes to plus or minus $\infty$, and the $p$-value $\stackrel{p}{\rightarrow} 0$. Remember, $H_0$ is true.
\end{frame}
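\begin{frame}[fragile]
\frametitle{Simulation sketch: the false positives are real}
\framesubtitle{Python/numpy/scipy, one made-up setting; a sketch, not the Brunner--Austin study on the next slides}
Here $\beta_2=0$ is true, $Corr(X_1,X_2)=0.8$, and both reliabilities are $0.9$; the naive test should reject much more than 5\% of the time.
{\footnotesize
\begin{verbatim}
import numpy as np
from scipy import stats

rng = np.random.default_rng(431)
n, reps, reject = 250, 2000, 0
for _ in range(reps):
    X1 = rng.normal(0, 1, n)
    X2 = 0.8*X1 + rng.normal(0, 0.6, n)      # Corr(X1,X2) = 0.8
    Y = 1.0 + 1.0*X1 + rng.normal(0, 1, n)   # true beta2 = 0
    W1 = X1 + rng.normal(0, 1/3, n)          # reliability 0.9
    W2 = X2 + rng.normal(0, 1/3, n)          # reliability 0.9
    D = np.column_stack([np.ones(n), W1, W2])
    b = np.linalg.lstsq(D, Y, rcond=None)[0]      # fit naive model
    resid = Y - D @ b
    s2 = resid @ resid / (n - 3)
    se = np.sqrt(s2 * np.linalg.inv(D.T @ D)[2, 2])
    pval = 2 * stats.t.sf(abs(b[2] / se), n - 3)  # t-test of beta2 = 0
    reject += (pval < 0.05)
print(reject / reps)    # Type I error rate well above 0.05
\end{verbatim}
} % End size
\end{frame}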
\begin{frame}
\frametitle{How bad is it for finite sample sizes?}
\framesubtitle{$\widehat{\beta}_2 \stackrel{p}{\rightarrow} \frac{\beta_{1} \omega_{1} \phi_{12}}{(\phi_{11} + \omega_1)(\phi_{22} + \omega_2) - \phi_{12}^{2}}$}
\pause
A big simulation study (Brunner and Austin, 2009) with six factors: \pause
\begin{itemize}
\item Sample size: $n$ = 50, 100, 250, 500, 1000
\item $Corr(X_1,X_2)$: $\phi_{12}$ = 0.00, 0.25, 0.75, 0.80, 0.90
\item Proportion of variance in $Y$ explained by $X_1$: 0.25, 0.50, 0.75
\item Reliability of $W_1$: 0.50, 0.75, 0.80, 0.90, 0.95
\item Reliability of $W_2$: 0.50, 0.75, 0.80, 0.90, 0.95
\item Distribution of latent variables and error terms: Normal, Uniform, $t$, Pareto
\pause
\end{itemize}
There were $5\times 5\times 3\times 5\times 5\times 4$ = 7,500 treatment combinations.
\end{frame}

\begin{frame}
\frametitle{Simulation study procedure}
Within each of the $5\times 5\times 3\times 5\times 5\times 4$ = 7,500 treatment combinations,
\begin{itemize}
\item 10,000 random data sets were generated, \pause
\item for a total of 75 million data sets, \pause
\item all generated according to the true model, with $\beta_2=0$.
\item Fit the naive model and test $H_0:\beta_2=0$ at $\alpha= 0.05$. \pause
\item The proportion of times $H_0$ is rejected is a Monte Carlo estimate of the Type I error probability. \pause
\item It should be around 0.05.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Look at a small part of the results}
\pause
\begin{itemize}
\item Both reliabilities = 0.90
\item Everything is normally distributed
\item $\beta_0=1$, $\beta_1=1$, and of course $\beta_2=0$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Table 1 of Brunner and Austin (2009, p.~39)}
\framesubtitle{\emph{Canadian Journal of Statistics}, Vol.~37, Pages 33--46. Used without permission.}
\begin{center}
\includegraphics[width=3in]{BrunnerAustinTable1}
\end{center}
\end{frame}

\begin{frame}
\frametitle{}
\begin{center}
\includegraphics[width=5in]{Weak}
\end{center}
\end{frame}

\begin{frame}
\frametitle{}
\begin{center}
\includegraphics[width=5in]{Medium}
\end{center}
\end{frame}

\begin{frame}
\frametitle{}
\begin{center}
\includegraphics[width=5in]{Strong}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Marginal Mean Type I Error Probabilities}
\begin{center}
\includegraphics[width=3.5in]{MarginalMeans}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Summary}
\begin{itemize}
\item Ignoring measurement error in the explanatory variables can seriously inflate Type I error probabilities.
\pause
\item The poison combination is measurement error in the variable for which you are ``controlling,'' and correlation between the latent explanatory variables.
\item If either is zero, there is no problem.
{\Large
\begin{displaymath}
\widehat{\beta}_2 \stackrel{p}{\rightarrow}
\frac{\beta_{1} \omega_{1} \phi_{12}}
{(\phi_{11} + \omega_1)(\phi_{22} + \omega_2) - \phi_{12}^{2}}
\end{displaymath}
\pause
} % End size
\item Factors affecting the severity of the problem are on the next slide.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Factors affecting severity of the problem}
\framesubtitle{Problem of inflated Type I error probability}
\begin{itemize}
\item As the correlation between $X_1$ and $X_2$ increases, the problem gets worse.
\item As the correlation between $X_1$ and $Y$ increases, the problem gets worse.
\item As the amount of measurement error in $X_1$ increases, the problem gets worse.
\item As the amount of measurement error in $X_2$ increases, the problem gets \emph{less} severe.
\item As the sample size increases, the problem gets worse.
\item The distribution of the variables does not matter much.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{As the sample size increases, the problem gets worse}
For a large enough sample size, no amount of measurement error in the explanatory variables is safe, assuming that the latent explanatory variables are correlated.
\end{frame}

\begin{frame}
\frametitle{Other kinds of regression, other kinds of measurement error}
\pause
\begin{itemize}
\item Logistic regression
\item Proportional hazards regression in survival analysis
\item Log-linear models: tests of conditional independence in the presence of classification error
\item Median splits
\item Even converting $X_1$ to ranks inflates the Type I error probability.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Moral of the story}
Use models that allow for measurement error in the explanatory variables.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/brunner/oldclass/431s23}{\footnotesize \texttt{http://www.utstat.toronto.edu/brunner/oldclass/431s23}}
\end{frame}

\end{document}