% Applied Stat I: Introduction with Regression as an example and takeoff point
% Notes and comments at the end
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation> % A bare \mode (no mode specification) does not compile
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}}
\title{ Methods of Applied Statistics\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2017}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\section{Introduction}

\begin{frame}
\frametitle{Goal of statistical analysis}

The goal of statistical analysis is to draw reasonable conclusions from noisy numerical data.

\end{frame}

\begin{frame}{Steps in the process of statistical analysis} {One approach}
\pause
\begin{itemize}
\item Consider a fairly realistic example or problem. \pause
\item Decide on a statistical model. \pause
\item Perhaps decide sample size. \pause
\item Acquire data. \pause
\item Examine and clean the data; generate displays and descriptive statistics. \pause
\item Estimate model parameters, for example by maximum likelihood. \pause
\item Carry out tests, compute confidence intervals, or both. \pause
\item Perhaps re-consider the model and go back to estimation. \pause
\item Based on the results of estimation and inference, draw conclusions about the example or problem.
\end{itemize}
\end{frame}

% Note back and forth

\begin{frame}
\frametitle{Two domains}
%\framesubtitle{}
\begin{itemize}
\item The messy outside world of reality and data.
\item The pure world of the statistical model.
\pause \item Applied Statistics navigates the interface. \end{itemize} \end{frame} \begin{frame} \frametitle{What is a statistical model?} \framesubtitle{You should always be able to state the model} \pause {\small A \emph{statistical model} is a set of assertions that partly specify the probability distribution of the observable data. The specification may be direct or indirect. \pause \begin{itemize} \item Let $X_1, \ldots, X_n$ be a random sample from a Poisson distribution with expected value $\lambda$. \pause \item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i$, where \pause \begin{itemize} \item[] $\beta_0, \ldots, \beta_{p-1}$ are unknown constants (parameters). \item[] $x_{i,j}$ are known constants. \item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables. \item[] $\sigma^2$ is an unknown constant (parameter). \item[] $Y_1, \ldots, Y_n$ are observable random variables. \end{itemize} \end{itemize} \pause \vspace{2mm} Is the model the same thing as the \emph{truth}? } % End size \end{frame} \begin{frame} \frametitle{Parameter Space} The \emph{parameter space} is the set of values that can be taken on by the parameter. \pause \begin{itemize} \item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. \pause The parameter space is $\{(\mu,\sigma^2): -\infty < \mu < \infty, \sigma^2 > 0\}$. \pause \item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i$, where \begin{itemize} \item[] $\beta_0, \ldots, \beta_{p-1}$ are unknown constants. \item[] $x_{i,j}$ are known constants. \item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables. \item[] $\sigma^2$ is an unknown constant. \item[] $Y_1, \ldots, Y_n$ are observable random variables. 
\end{itemize}
\pause
The parameter space is $\{(\beta_0, \ldots, \beta_{p-1}, \sigma^2): -\infty < \beta_j < \infty, \sigma^2 > 0\}$.
\end{itemize}
\end{frame}

% In 2016 and earlier, went to the taste test or some other simple example.

\begin{frame}
\frametitle{The SENIC data}
\pause
%\framesubtitle{}
{\LARGE
\begin{itemize}
\item[] \textbf{S}tudy of the
\item[] \textbf{E}fficacy of
\item[] \textbf{N}osocomial
\item[] \textbf{I}nfection
\item[] \textbf{C}ontrol
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{Background}
%\framesubtitle{}
The Study on the Efficacy of Nosocomial Infection Control (SENIC) data are from a study of infections acquired in hospital.
\pause
That is, patients are admitted to hospital for something, and while in hospital they get infections (such as pneumonia and urinary tract infections) that are unrelated to why they were admitted, and require treatment.
\pause
This is a partially reconstructed data set based on one in Kutner et al.'s \emph{Applied Linear Statistical Models}. In this aggregated data set, the cases are 100 U.S.~hospitals.
\end{frame}

\begin{frame}
\frametitle{Variables in the openSENIC data set}
\pause
% \framesubtitle{}
{\footnotesize
\begin{tabular}{ll}
id & Hospital identification number \\
region & Geographic region of U.S.
\\ mdschl & Medical school affiliation (Yes or No) \\ census & Number of patients \\ nbeds & Number of beds in the hospital \\ nurses & Number of nurses \\ lngstay & Mean length of stay in days \\ age & Mean age of patient in years \\ xratio & Ratio of number of X-rays performed to number of patients \\ & without signs or symptoms of pneumonia, $\times$ 100 \\ culratio & Ratio of number of cultures performed to number of patients \\ & without signs or symptoms of hospital-acquired infection, $\times$ 100 \\ infpercent & Percentage of patients who acquired an infection while in hospital \end{tabular} } % End size \end{frame} \begin{frame} \frametitle{Xratio and Culratio} \framesubtitle{Two strange variables} \pause \begin{itemize} \item \textbf{xratio}: Ratio of number of X-rays performed to number of patients without signs or symptoms of pneumonia, $\times$ 100 \item \textbf{culratio}: Ratio of number of cultures performed to number of patients without signs or symptoms of hospital-acquired infection, $\times$ 100 \end{itemize} \pause Sometimes mini-epidemics spread through a hospital. The variables xratio and culratio represent special efforts to monitor the health of patients who show no signs of having gotten sick in hospital, yet. \pause They are a kind of early warning system, intended to detect outbreaks of disease in the hospital so they can be dealt with before they get established. \pause My guess is that xratio is primarily for pneumonia, and culratio is primarily for urinary tract infections. \end{frame} \begin{frame} \frametitle{Let's do ``Regression"} \pause % You know, ... %\framesubtitle{} \begin{center} \includegraphics[width=3in]{GPA_Scatterplot} \end{center} \end{frame} % Explanatory. response variables. \begin{frame} \frametitle{Least squares Plane} %\framesubtitle{} \begin{center} \includegraphics[width=3in]{LSplane} % Stolen from somewhere -- no Creative Commons license. 
\end{center} \end{frame} \begin{frame} \frametitle{Regression?} %\framesubtitle{} \begin{center} \includegraphics[width=3in]{GPA_Scatterplot} \end{center} \end{frame} \begin{frame} \frametitle{Francis Galton (1822-1911) studied ``Hereditary Genius" (1869) and other traits} \pause %\framesubtitle{} \begin{itemize} \item Heights of fathers and sons \pause \begin{itemize} \item Sons of the tallest fathers tended to be taller than average, but shorter than their fathers. \pause \item Sons of the shortest fathers tended to be shorter than average, but taller than their fathers. \pause \end{itemize} \item This kind of thing was observed for many other traits. \pause \item Galton was deeply concerned about ``regression to mediocrity." \end{itemize} \end{frame} \begin{frame} \frametitle{Measure the same thing twice, with error} \framesubtitle{Suppose that's all it is} \pause \begin{tabular}{l} $Y_1 = X + e_1$ \\ $Y_2 = X + e_2$ \\ \pause $X \sim N(\mu,\sigma^2_x)$ \\ $e_1$ and $e_2$ independent $N(0,\sigma^2_e)$ \end{tabular} \pause \vspace{5mm} Then, \pause \begin{displaymath} \left( \begin{array}{c} Y_1 \\ Y_2 \end{array} \right) \sim N\left( \left[\begin{array}{c} \mu \\ \mu \end{array} \right], \left[\begin{array}{cc} \sigma^2_x+\sigma^2_e & \sigma^2_x \\ \sigma^2_x & \sigma^2_x+\sigma^2_e \end{array} \right] \right) \end{displaymath} \end{frame} \begin{frame} \frametitle{Conditional distribution of $Y_2$ given $Y_1=y_1$} \framesubtitle{For the bivariate normal} \pause \begin{eqnarray*} & & N\left(\mu_2 + \frac{\sigma_2}{\sigma_1} \, \rho(y_1-\mu_1), (1-\rho^2)\sigma^2_2 \right) \\ \pause &&\\ &=& N\left(\mu + \rho(y_1-\mu), (1-\rho^2)(\sigma^2_x+\sigma^2_e) \right) \pause \mbox{ where} \\ \pause &&\\ \rho &=& \frac{\sigma^2_x}{\sigma^2_x+\sigma^2_e}. 
\end{eqnarray*}
\pause
% \vspace{5mm}
Have
\begin{eqnarray*}
E(Y_2|Y_1=y_1) & = & \mu + \rho(y_1-\mu) \\ \pause
& = & \mu(1-\rho) + \rho \, y_1 \\ \pause
& = & ~~~~~\beta_0 ~~~+ \beta_1 \, y_1 \pause
\end{eqnarray*}
Regression.
\end{frame}

\begin{frame}
\frametitle{$E(Y_2|Y_1=y_1) = \mu + \rho(y_1-\mu)$}
\framesubtitle{Where $\rho = \frac{\sigma^2_x}{\sigma^2_x+\sigma^2_e}$}
\pause
\begin{itemize}
\item If $y_1$ is above the mean, average $y_2$ will also be above the mean. \pause
\item But only a fraction $\rho$ as far above as $y_1$. \pause
\item If $y_1$ is below the mean, average $y_2$ will also be below the mean. \pause
\item But only a fraction $\rho$ as far below as $y_1$. \pause
\item This is exactly the ``regression toward the mean'' that Galton observed.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Regression toward the mean}
%\framesubtitle{}
\begin{itemize}
\item Does not imply systematic change over time. \pause
\item Is a characteristic of the bivariate normal and other joint distributions. \pause
\item Can produce very misleading results, especially in the evaluation of social and educational programs.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Regression Artifact}
\pause
%\framesubtitle{}
\begin{itemize}
\item Measure something important, like performance in school or blood pressure. \pause
\item Select an extreme group, usually those who do worst on the baseline measure. \pause
\item Do something useless to help them, and measure again. \pause
\end{itemize}
\begin{displaymath}
E(Y_2|Y_1=y_1) = \mu + \rho(y_1-\mu)
\end{displaymath}
\pause
Even if the treatment does nothing, they are expected to do worse than average, \pause but still better than they did the first time --- completely artificial! \pause And the apparent effect for those who do better than average on baseline will be negative.
\end{frame} \section{Normal Linear Model} \begin{frame} \frametitle{General Mixed Linear Model} \framesubtitle{With both fixed and random effects} {\LARGE \begin{displaymath} \mathbf{y}=\mathbf{X} \boldsymbol{\beta} + \mathbf{Zb} + \boldsymbol{\epsilon} \end{displaymath} \pause } % End size \begin{itemize} \item $\mathbf{X}$ is an $n \times p$ matrix of known constants. \pause \item $\boldsymbol{\beta}$ is a $p \times 1$ vector of unknown constants. \pause \item $\mathbf{Z}$ is an $n \times q$ matrix of known constants. \pause \item $\mathbf{b} \sim N_q(\mathbf{0},\boldsymbol{\Sigma}_b)$ with $\boldsymbol{\Sigma}_b$ unknown. \pause \item $\boldsymbol{\epsilon} \sim N(\mathbf{0},\sigma^2 \mathbf{I}_n)$ , where $\sigma^2 > 0$ is an unknown constant. \end{itemize} \end{frame} \begin{frame} \frametitle{Fixed Effects Linear Regression} %\framesubtitle{} {\LARGE \begin{displaymath} \mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon} \end{displaymath} \pause } % End size \begin{itemize} \item $\mathbf{X}$ is an $n \times p$ matrix of known constants. \item $\boldsymbol{\beta}$ is a $p \times 1$ vector of unknown constants. \item $\boldsymbol{\epsilon} \sim N(\mathbf{0},\sigma^2 \mathbf{I}_n)$ , where $\sigma^2 > 0$ is an unknown constant. 
\item[] \pause \item $\widehat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y} $ \item $\widehat{\mathbf{y}}=\mathbf{X}\widehat{\boldsymbol{\beta}} $ \item $\mathbf{e}= (\mathbf{y}-\widehat{\mathbf{y}})$ \end{itemize} \end{frame} \begin{frame} \frametitle{Comparing scalar and matrix form} %\framesubtitle{} Scalar form is $y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1}x_{i,p-1} + \epsilon_i $ \pause \begin{equation*} \begin{array}{cccccccc} % 6 columns \mathbf{y} & = & \mathbf{X} & \boldsymbol{\beta} & + & \boldsymbol{\epsilon} \\ \pause &&&&& \\ % Another space \left( \begin{array}{c} y_1 \\ y_2 \\ y_3 \\ \vdots \\ y_n \end{array} \right) &=& \left(\begin{array}{cccc} 1 & 14.2 & \cdots & 1 \\ 1 & 11.9 & \cdots & 0 \\ 1 & ~3.7 & \cdots & 0 \\ \vdots & \vdots & \vdots & \vdots \\ 1 & ~6.2 & \cdots & 1 \end{array}\right) & \left( \begin{array}{c} \beta_0 \\ \beta_1 \\ \vdots \\ \beta_{p-1} \end{array} \right) &+& \left( \begin{array}{c} \epsilon_1 \\ \epsilon_2 \\ \epsilon_3 \\ \vdots \\ \epsilon_n \end{array} \right) \end{array} \end{equation*} \end{frame} \begin{frame} \frametitle{Vocabulary} %\framesubtitle{} \begin{itemize} \item Explanatory variables are $x$ \item Response variable is $y$ \pause \item For the SENIC data, what's the response variable? \end{itemize} \end{frame} \begin{frame} \frametitle{Meaning of the statistical model} \framesubtitle{$y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1}x_{i,p-1} + \epsilon_i $} \pause \begin{itemize} \item There are $p-1$ explanatory variables. \pause \item For each \emph{combination} of explanatory variables, the conditional distribution of the response variable $y$ is normal, with constant variance $\sigma^2$. 
\pause \item The conditional population mean of $y$ depends on the $x$ values, as follows: \pause \end{itemize} {\LARGE \begin{displaymath} E[Y|\boldsymbol{X}=\boldsymbol{x}] = \beta_0 + \beta_1x_1 + \ldots + \beta_{p-1}x_{p-1} \end{displaymath} } % End size \end{frame} \begin{frame} \frametitle{``Control" means hold constant} %\framesubtitle{} \begin{itemize} \item Regression model with four explanatory variables. \pause \item Hold $x_1$, $x_2$ and $x_4$ constant at some fixed values. \pause \begin{eqnarray*} E(Y|\boldsymbol{X}=\boldsymbol{x}) & = & \beta_0 + \beta_1x_1 + \beta_2x_2 +\beta_3x_3 + \beta_4x_4 \\ \pause & = & (\beta_0 + \beta_1x_1 + \beta_2x_2 + \beta_4x_4) + \beta_3x_3 \\ \pause \end{eqnarray*} \item The equation of a straight line with slope $\beta_3$. \pause \item Values of $x_1$, $x_2$ and $x_4$ affect only the intercept. \pause \item So $\beta_3$ is the rate at which $E(Y|\mathbf{x})$ changes as a function of $x_3$ with all other variables held constant at fixed levels. \pause \item \emph{According to the model}. \end{itemize} \end{frame} \begin{frame} \frametitle{It's model-based control} \pause %\framesubtitle{} \begin{itemize} \item To ``hold $x_1$ constant" at some particular value, like $x_1=14$, you don't even need data at that value. \pause \item Ordinarily, to estimate $E(Y|X_1=14,X_2=x)$, you would need a lot of data at $X_1=14$. \pause \item But look: $\widehat{Y} = \widehat{\beta}_0 + \widehat{\beta}_1 \cdot 14 + \widehat{\beta}_2 x$ \pause \item Of course extrapolation is still dangerous. 
\end{itemize} \end{frame} \begin{frame} \frametitle{Three Meanings of ``Control"} \pause %\framesubtitle{} \begin{itemize} \item Procedural \pause \item Sub-division \pause \item Model-based \end{itemize} \end{frame} \begin{frame} \frametitle{More vocabulary} \framesubtitle{$E(Y|\boldsymbol{X}=\boldsymbol{x}) = (\beta_0 + \beta_1x_1 + \beta_2x_2 + \beta_4x_4) + \beta_3x_3$} \pause \begin{itemize} \item If $\beta_3>0$, describe the relationship between $x_3$ and (expected) $y$ as ``positive," \pause controlling for the other variables. \pause If $\beta_3<0$, negative. \pause \item Useful ways of saying ``controlling for" or ``holding constant" include \pause \begin{itemize} \item Allowing for \item Correcting for \item Taking into account \end{itemize} \end{itemize} \end{frame} \begin{frame} \frametitle{Categorical Explanatory Variables} \pause \framesubtitle{Unordered categories} \begin{itemize} \item $X=1$ means Drug, $X=0$ means Placebo. \pause \item Population mean is $E(Y|X=x) = \beta_0 + \beta_1 x$. \pause \item For patients getting the drug, mean response is \pause $E(Y|X=1) = \beta_0 + \beta_1$ \pause \item For patients getting the placebo, mean response is \pause $E(Y|X=0) = \beta_0$ \pause \item And $\beta_1$ is the difference between means, the average treatment effect. \end{itemize} \end{frame} \begin{frame} \frametitle{Similar interpretation for sample regression coefficients} \framesubtitle{Think of a scatterplot} \pause % This means make a picture. \begin{itemize} \item $X=1$ means Drug, $X=0$ means Placebo. \item Predicted response is $\widehat{Y} = \widehat{\beta}_0 + \widehat{\beta}_1x $. \pause \item For patients getting the drug, predicted response is $\widehat{Y} = \widehat{\beta}_0 + \widehat{\beta}_1 = \overline{Y}_1 $. \pause \item For patients getting the placebo, predicted response is $\widehat{Y} = \widehat{\beta}_0 = \overline{Y}_0 $. 
\pause \item And $\widehat{\beta}_1$ is the difference between sample means, the estimated average treatment effect. \end{itemize} \end{frame} \begin{frame} \frametitle{Regression test of $H_0: \beta_1=0$} \framesubtitle{With a binary explanatory variable} \pause \begin{itemize} \item Same as an independent $t$-test. \pause \item Same as a one-way ANOVA with 2 categories. \pause \item Same $t$, same $F$, same $p$-value. \end{itemize} \end{frame} \begin{frame} \frametitle{Testing Statistical Hypotheses} \pause %\framesubtitle{} \begin{itemize} \item Parameters, parameter space \pause \item Null hypothesis \pause \item Significance level $\alpha$. \pause \item $p$-value. \pause \item Function in the discourse of Science: Skeptic in a box. \end{itemize} \end{frame} \begin{frame} \frametitle{Plain language is important} \pause %\framesubtitle{} { \footnotesize \begin{itemize} \item If you can only be understood by mathematicians and statisticians, your knowledge is much less valuable. \pause \item Often a question will say ``Give the answer in plain, non-statistical language." \pause \item This means if $x$ is income and $y$ is credit card debt, you make a statement about income and average or predicted credit card debt, like ``customers with higher incomes tend to have less credit card debt." \pause \item If you use mathematical notation or words like null hypothesis, unbiased estimator, p-value or statistically significant, you will lose a lot of marks even if the statement is correct. Even avoid ``positive relationship," and so on. \pause \item If the study is about fish, talk about fish. \pause \item If the study is about blood pressure, talk about blood pressure. \pause \item If the study is about breaking strength of yarn, talk about breaking strength of yarn. \pause \item Assume you are talking to your boss, a former Commerce major who got a D+ in ECO220 and does not like to feel stupid. 
\end{itemize} } % End size \end{frame} \begin{frame} \frametitle{We will be guided by hypothesis tests with $\alpha = 0.05$} \framesubtitle{For plain-language conclusions} \pause {\footnotesize \begin{itemize} \item If we do not reject a null hypothesis like $H_0:\beta_1=0$, we will not draw a definite conclusion. \pause \item Instead, say things like: \pause \begin{itemize} \item There is no evidence of a connection between blood sugar level and mood. \pause \item These results are not strong enough for us to conclude that attractiveness is related to mark in first-year Computer Science. \pause \item These results are consistent with no effect of dosage level on bone density. \pause \end{itemize} \item If the null hypothesis is not rejected, please do \emph{not} claim that the drug has no effect, etc.. \pause \item In this we are taking Fisher's side in a historical fight between Fisher on one side and Neyman \& Pearson on the other. \pause \item Though we are guided by $\alpha = 0.05$, we \emph{never} mention it when plain language is required. \end{itemize} } % End size \end{frame} \begin{frame} \frametitle{A technical issue} %\framesubtitle{} { \small \begin{itemize} \item In this class we will mostly avoid one-tailed tests. \pause \item Why? Ask what would happen if the results were strong and in the opposite direction to what was predicted (dental example). \pause \item But when $H_0$ is rejected, we still draw directional conclusions. \pause \item For example, if $x$ is income and $y$ is credit card debt, we test $H_0: \beta_1=0$ with a two-sided $t$-test. \pause \item Say $p = 0.0021$ and $\widehat{\beta}_1 = 1.27$. \pause We say ``Consumers with higher incomes tend to have more credit card debt." \pause \item Is this justified? We'd better hope so, or all we can say is ``There is a connection between income and average credit card debt." \pause \item Then they ask: ``What's the connection? Do people with lower income have more debt?" 
\pause
\item And you have to say ``Sorry, I don't know.'' \pause
\item It's a good way to get fired, or at least look silly.
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{The technical resolution}
%\framesubtitle{}
\begin{itemize}
\item Decompose the two-sided test into a set of two one-sided tests with significance level $\alpha/2$, equivalent to the two-sided test (explain and draw a picture). \pause
\item In practice, just look at the sign of the regression coefficient. \pause
\item Under the surface you are decomposing the two-sided test, but you never mention it. \pause
\item \emph{Marking rule}: If the question asks for plain language and you draw a non-directional conclusion when a directional conclusion is possible, you get half marks at most.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Correlation-causation}
\framesubtitle{More on how to talk (and think) about the results}
\pause
{\footnotesize
\begin{itemize}
\item Suppose the two conditions were standard treatment versus new treatment. \pause
\item $x=0$ means standard treatment, $x=1$ means new treatment. \pause
\item We could collect data on people who were treated for the disease, observe whether they got the standard treatment or the new treatment, and also observe $y$ to see how they did. \pause
\item Suppose $H_0: \mu_1=\mu_2$ is rejected, and patients receiving the new treatment did better on average. \pause
\item Is the new treatment better? \pause
\item Maybe, \pause but it's also possible that those receiving the new treatment were more motivated, or more educated, or healthier in the first place (so they have energy to pursue non-standard options). \pause
\item Controlling for those possibilities is a good idea, but will you think of everything? \pause
\item The standard saying is ``Correlation does not imply causation.'' \pause
\item Correlation means association between variables. \pause
\item Causation means influence, not absolute determination.
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{More examples}
%\framesubtitle{}
\begin{itemize}
\item Wearing a hat and baldness. \pause
\item Exercise and arthritis pain. \pause
\item The Mozart effect. \pause
\item Alcohol consumption and health.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{In general}
%\framesubtitle{}
If $A$ and $B$ are related \pause
\begin{itemize}
\item $A$ could be influencing $B$.
\item $B$ could be influencing $A$.
\item $C$ could be influencing both $A$ and $B$. \pause
\item $C$ is called a \emph{confounding variable}.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Confounding variable}
\pause
%\framesubtitle{}
\begin{itemize}
\item Is related to both the explanatory variable and response variable, \pause
\item Causing an apparent relationship. \pause
\item $A$ and $B$ are related only because they are both related to $C$. \pause
\item Exercise and health. \pause You'd better control for age. \pause
\item Controlling for age may not be enough.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The discourse of science}
%\framesubtitle{}
\begin{itemize}
\item These conversations follow a pattern. \pause
\item ``Canadians who eat more fresh (less processed) food tend to have better health. Eat fresh vegetables!'' \pause
\item Reply: It's possible that people who eat fresh food tend to have higher income, and they are healthier for reasons connected to income rather than diet. Consider Aboriginals on reserve? Did you control for income? \pause
\item ``We couldn't. Information about income was not available. But we controlled for education. \pause And we had a large sample.'' \pause
\item Reply: Sorry, you lose.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{To shoot a study down on the basis of confounding variables}
\pause
%\framesubtitle{}
All you need to do is \pause
\begin{itemize}
\item Name a specific \emph{potential} confounding variable that is not in the data.
\pause
\item Make a plausible case for why it \emph{might} be related to the explanatory variable. \pause
\item Make a plausible case for why it might be related to the response variable. \pause
\item Of course if you can cite specific published results your argument is even stronger. \pause
\item Sometimes this is all it takes to show that person-years of effort and hundreds of thousands of dollars in grant money were wasted.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The solution: Random assignment}
\framesubtitle{Again, $x=0$ means standard treatment and $x=1$ means new treatment}
\pause
\begin{itemize}
\item What if patients were randomly assigned to treatment? \pause
\item In an \emph{experimental study}, subjects are randomly assigned to treatment conditions --- values of a categorical explanatory variable --- and values of the response variable are observed. \pause
\item In an \emph{observational study}, values of the explanatory and response variables are just observed. \pause
\item In a well-designed experimental study, confounding variables are ruled out. \pause
\item $B \rightarrow A$ is ruled out too. \pause
\item Thank you, Mr.~Fisher.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Talking about the results of a purely observational study}
\pause
%\framesubtitle{}
Avoid language that implies causality or influence.
\pause
{\footnotesize
\begin{itemize}
\item Don't say ``Music lessons led to better academic performance.'' \pause
\item Say ``Students who had private music lessons tended to have better academic performance.'' \pause
\item A good follow-up might be ``Music lessons may stimulate cognitive development, but it's also possible that students who had private music lessons were different in other ways, such as average income or parents' education.'' \pause
\item Don't say ``Solving puzzles on a regular basis tended to provide protection against the development of dementia.''
\pause
\item Say ``Participants who solved puzzles on a regular basis tended to develop dementia later in life than those who did not solve puzzles on a regular basis.'' \pause
\item It is okay to follow up with ``Solving puzzles may provide mental stimulation that slows the onset of dementia.'' \pause
\item But then say \pause ``Or, it is possible that early stages of dementia that are difficult to detect may lead to decreased interest in solving puzzles.''
\end{itemize}
} % End size
\end{frame}

\begin{frame}
\frametitle{More than Two Categories}
\framesubtitle{We need this for the SENIC study}
\pause
Suppose a study has 3 treatment conditions. For example Group 1 gets Drug 1, Group 2 gets Drug 2, and Group 3 gets a placebo, so that the Explanatory Variable is Group (taking values 1,2,3) and there is some Response Variable $Y$ (maybe response to drug again).
\pause
\vspace{10mm}
Why is $E[Y|X=x] = \beta_0 + \beta_1x$ (with $x$ = Group) a silly model?
\end{frame}

\begin{frame}
\frametitle{Indicator Dummy Variables}
\framesubtitle{With intercept}
\pause
\begin{itemize}
\item $x_1 = 1$ if Drug A, zero otherwise
\item $x_2 = 1$ if Drug B, zero otherwise \pause
\item $E[Y|\boldsymbol{X}=\boldsymbol{x}] = \beta_0 + \beta_1x_1 + \beta_2 x_2$. \pause
\item Fill in the table. \pause
\end{itemize}
{\begin{center}
\begin{tabular}{|c|c|c|l|} \hline
Drug & $x_1$ & $x_2$ & $E(Y|\mathbf{x}) = \beta_0 + \beta_1x_1 + \beta_2 x_2$ \\ \hline
$A$ & & & $\mu_1$ = \\ \hline
$B$ & & & $\mu_2$ = \\ \hline
Placebo & & & $\mu_3$ = \\ \hline
\end{tabular}
\end{center}}
\end{frame}

\begin{frame}
\frametitle{Answer}
\begin{itemize}
\item $x_1 = 1$ if Drug A, zero otherwise
\item $x_2 = 1$ if Drug B, zero otherwise
\item $E[Y|\boldsymbol{X}=\boldsymbol{x}] = \beta_0 + \beta_1x_1 + \beta_2 x_2$.
\pause \end{itemize} {\begin{center} \begin{tabular}{|c|c|c|l|} \hline Drug & $x_1$ & $x_2$ & $E(Y|\mathbf{x}) = \beta_0 + \beta_1x_1 + \beta_2 x_2$ \\ \hline $A$ & 1 & 0 & $\mu_1$ = $\beta_0 + \beta_1$ \\ \hline $B$ & 0 & 1 & $\mu_2$ = $\beta_0 + \beta_2$ \\ \hline Placebo & 0 & 0 & $\mu_3$ = $\beta_0$ \\ \hline \end{tabular} \end{center}} \pause Regression coefficients are contrasts with the category that has no indicator -- the \emph{reference category}. \end{frame} \begin{frame} \frametitle{Indicator dummy variable coding with intercept} \pause %\framesubtitle{} \begin{itemize} \item With an intercept in the model, need $p-1$ indicators to represent a categorical explanatory variable with $p$ categories. \pause \item If you use $p$ dummy variables and an intercept, trouble. \pause \item Regression coefficients are contrasts with the category that has no indicator. \pause \item Call this the \emph{reference category}. \end{itemize} \end{frame} \begin{frame} \frametitle{$x_1 = 1$ if Drug A, zero o.w., $x_2 = 1$ if Drug B, zero o.w.} \pause %\framesubtitle{3-d Scatterplot} Recall $\sum_{i=1}^n (y_i-m)^2$ is minimized at $m = \overline{y}$ \pause \begin{center} \includegraphics[width=3in]{ABCscatter} \end{center} \end{frame} \begin{frame} \frametitle{What null hypotheses would you test?} \pause {\begin{center} \begin{tabular}{|c|c|c|l|} \hline Drug & $x_1$ & $x_2$ & $E(Y|\mathbf{x}) = \beta_0 + \beta_1x_1 + \beta_2 x_2$ \\ \hline $A$ & 1 & 0 & $\mu_1$ = $\beta_0 + \beta_1$ \\ \hline $B$ & 0 & 1 & $\mu_2$ = $\beta_0 + \beta_2$ \\ \hline Placebo & 0 & 0 & $\mu_3$ = $\beta_0$ \\ \hline \end{tabular} \end{center}} \pause \begin{itemize} \item Is the effect of Drug $A$ different from the placebo? \pause $H_0: \beta_1=0$ \pause \item Is Drug $A$ better than the placebo? \pause $H_0: \beta_1=0$ \pause \item Did Drug $B$ work? \pause $H_0: \beta_2=0$ \pause \item Did experimental treatment have an effect? 
\pause $H_0: \beta_1=\beta_2=0$ \pause \item Is there a difference between the effects of Drug $A$ and Drug $B$? \pause $H_0: \beta_1=\beta_2$ \end{itemize} \end{frame} % PP Slide 18 \begin{frame} \frametitle{Now add a quantitative explanatory variable (covariate)} \framesubtitle{Covariates often come first in the regression equation} \pause \begin{itemize} \item $x_1 = 1$ if Drug A, zero otherwise \item $x_2 = 1$ if Drug B, zero otherwise \item $x_3$ = Age \pause \item $E[Y|\boldsymbol{X}=\boldsymbol{x}] = \beta_0 + \beta_1x_1 + \beta_2 x_2 + \beta_3 x_3$. \pause \end{itemize} {\begin{center} \begin{tabular}{|c|c|c|l|} \hline Drug & $x_1$ & $x_2$ & $E(Y|\mathbf{x}) = \beta_0+\beta_1x_1+\beta_2x_2+\beta_3x_3$\\ \hline A & 1 & 0 & $\mu_1$ = $(\beta_0+\beta_1)+\beta_3x_3$ \\ \hline B & 0 & 1 & $\mu_2$ = $(\beta_0+\beta_2)+\beta_3x_3$ \\ \hline Placebo & 0 & 0 & $\mu_3$ = ~~~~~$\beta_0$~~~~~$+\beta_3x_3$ \\ \hline \end{tabular} \end{center}} \pause Parallel regression lines. \end{frame} \begin{frame} \frametitle{More comments} %\framesubtitle{} {\begin{center} \begin{tabular}{|c|c|c|l|} \hline Drug & $x_1$ & $x_2$ & $E(Y|\mathbf{x}) = \beta_0+\beta_1x_1+\beta_2x_2+\beta_3x_3$\\ \hline A & 1 & 0 & $\mu_1$ = $(\beta_0+\beta_1)+\beta_3x_3$ \\ \hline B & 0 & 1 & $\mu_2$ = $(\beta_0+\beta_2)+\beta_3x_3$ \\ \hline Placebo & 0 & 0 & $\mu_3$ = ~~~~~$\beta_0$~~~~~$+\beta_3x_3$ \\ \hline \end{tabular} \end{center}} \pause \begin{itemize} \item If more than one covariate, parallel regression planes. \pause \item Non-parallel (interaction) is testable. \pause \item ``Controlling" interpretation holds. \pause \item In an experimental study, quantitative covariates are usually just observed. \pause \item Could age be related to drug? \pause \item Good covariates reduce MSE, make testing of categorical variables more sensitive. 
\end{itemize} \end{frame} \begin{frame} \frametitle{Cell means coding: $p$ indicators and no intercept} %\framesubtitle{} {\LARGE \begin{displaymath} E[Y|\boldsymbol{X}=\boldsymbol{x}] = \beta_1x_1 + \beta_2 x_2 + \beta_3 x_3 \end{displaymath} \pause } % End size \vspace{3mm} \begin{center} \begin{tabular}{|c|c|c|c|c|} \hline Drug &$x_1$&$x_2$&$x_3$&$E(Y|\mathbf{x}) = \beta_1x_1+\beta_2x_2+\beta_3x_3$ \\ \hline A & 1 & 0 & 0 &$\mu_1=\beta_1$ \\ \hline B & 0 & 1 & 0 &$\mu_2=\beta_2$ \\ \hline Placebo & 0 & 0 & 1 &$\mu_3=\beta_3$ \\ \hline \end{tabular} \end{center} \pause \vspace{3mm} \begin{itemize} \item This model is equivalent to the one with $p-1$ dummy variables and the intercept. \pause \item If you have $p$ dummy variables and the intercept, the model is over-parameterized. \end{itemize} \end{frame} \begin{frame} \frametitle{Add a covariate: $x_4$} %\framesubtitle{} {\LARGE \begin{displaymath} E[Y|\boldsymbol{X}=\boldsymbol{x}] = \beta_1x_1 + \beta_2 x_2 + \beta_3 x_3 + \beta_4 x_4 \end{displaymath} \pause } % End size \begin{center} \begin{tabular}{|c|c|c|c|c|} \hline Drug &$x_1$&$x_2$&$x_3$&$E(Y|\mathbf{x}) = \beta_1x_1+\beta_2x_2+\beta_3x_3+\beta_4x_4$ \\ \hline A & 1 & 0 & 0 &$\beta_1+\beta_4x_4$ \\ \hline B & 0 & 1 & 0 &$\beta_2+\beta_4x_4$ \\ \hline Placebo & 0 & 0 & 1 &$\beta_3+\beta_4x_4$ \\ \hline \end{tabular} \end{center} \pause This model is equivalent to the one with the intercept. 
\end{frame}

\begin{frame}
\frametitle{Key to the equivalence of dummy variable coding schemes} \pause
% \framesubtitle{}
{\scriptsize
\begin{center}
\begin{tabular}{|c|c|c|c|c|} \hline
Drug & $x_0$ & $x_1$ & $x_2$ & $\beta_0x_0+\beta_1x_1+\beta_2x_2+\beta_3x_3$\\ \hline
A & 1 & 1 & 0 & $(\beta_0+\beta_1)+\beta_3x_3$ \\ \hline
B & 1 & 0 & 1 & $(\beta_0+\beta_2)+\beta_3x_3$ \\ \hline
Placebo & 1 & 0 & 0 & $ ~~~~~\beta_0$~~~~~$+\beta_3x_3$ \\ \hline
\end{tabular}
\vspace{1mm}
\begin{tabular}{|c|c|c|c|c|} \hline
Drug &$x_1$&$x_2$&$x_3$&$\beta_1x_1+\beta_2x_2+\beta_3x_3+\beta_4x_4$ \\ \hline
A & 1 & 0 & 0 &$\beta_1+\beta_4x_4$ \\ \hline
B & 0 & 1 & 0 &$\beta_2+\beta_4x_4$ \\ \hline
Placebo & 0 & 0 & 1 &$\beta_3+\beta_4x_4$ \\ \hline
\end{tabular}
\end{center} \pause
} % End size
{\LARGE
\begin{eqnarray*}
& & \mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon} \\ \pause
& \Leftrightarrow & \mathbf{y} = (\mathbf{X}\mathbf{A})(\mathbf{A}^{-1} \boldsymbol{\beta}) + \boldsymbol{\epsilon}
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Partitioning sums of squares} \pause
%\framesubtitle{}
\begin{itemize}
\item Variation to explain: \textbf{Total Sum of Squares}
\begin{displaymath}
\mbox{SSTO} = \sum_{i=1}^n (y_i - \overline{y})^2
\end{displaymath} \pause
\item Variation that is still unexplained: \textbf{Error (or Residual) Sum of Squares}
\begin{displaymath}
\mbox{SSE} = \sum_{i=1}^n (y_i - \widehat{y}_i)^2
\end{displaymath} \pause
\item Variation that is explained: \textbf{Regression (or Model) Sum of Squares}
\begin{displaymath}
\mbox{SSR} = \mbox{SSTO} - \mbox{SSE} = \sum_{i=1}^n (\widehat{y}_i - \overline{y})^2
\end{displaymath}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{$R^2$: Proportion of variation in $y$ that is explained} \pause
%\framesubtitle{}
\begin{itemize}
\item $SSTO = SSR+SSE$ \pause
\item Proportion of variation in the response variable that is explained by the explanatory variable \pause
\begin{displaymath}
R^2 =
\frac{SSR}{SSTO} \end{displaymath} \pause \item For a simple regression, same as the square of the correlation coefficient: $r^2 = R^2$ \end{itemize} \end{frame} \begin{frame} \frametitle{Hypothesis Testing} \framesubtitle{Standard tests when errors are normal} \pause \begin{itemize} \item Overall $F$-test for all the explanatory variables at once \pause $H_0: \beta_1 = \beta_2 = \cdots = \beta_{p-1} = 0$ \pause \item $t$-tests for each regression coefficient: Controlling for all the others, does that explanatory variable matter? \pause $H_0: \beta_j=0$ \pause \item Test a collection of explanatory variables controlling for another collection \pause $H_0: \beta_2 = \beta_3 = \beta_5 = 0$ \pause \item Example: Controlling for mother's education and father's education, are (any of) total family income, assessed value of home and total market value of all vehicles owned by the family related to High School GPA? \pause \item Most general: Testing whether sets of linear combinations of regression coefficients differ from specified constants. \pause $H_0: \mathbf{L}\boldsymbol{\beta} = \mathbf{h}$. \end{itemize} \end{frame} \begin{frame} \frametitle{Full versus Restricted Model} % Changing vocabulary: Reduced -> restricted \framesubtitle{Restricted by $H_0$} \pause \begin{itemize} \item You have 2 sets of variables, $A$ and $B$. Want to test $B$ controlling for $A$. \pause \item Fit a model with both $A$ and $B$: Call it the \emph{Full Model}. \pause \item Fit a model with just $A$: Call it the \emph{Restricted Model}. \\ \pause $R^2_F \geq R^2_R$. \pause \item The $F$-test is a likelihood ratio test (exact). \end{itemize} \end{frame} \begin{frame} \frametitle{When you add the $r$ more explanatory variables in set $B$, $R^2$ can only go up} \pause %\framesubtitle{} By how much? Basis of the $F$ test. 
\pause {\LARGE \begin{eqnarray*} F & = & \frac{(R^2_F-R^2_R)/r}{(1-R^2_F)/(n-p)} \\ \pause &&\\ & = & \frac{(SSR_F-SSR_R)/r}{MSE_F} \pause ~ \stackrel{H_0}{\sim} ~ F(r,n-p) \end{eqnarray*} } % End size \end{frame} \begin{frame} \frametitle{General Linear Test of $H_0: \mathbf{L}\boldsymbol{\beta} = \mathbf{h}$} \framesubtitle{$\mathbf{L}$ is $r \times p$, rows linearly independent} \pause {\LARGE \begin{eqnarray*} F &=& \frac{(\mathbf{L}\widehat{\boldsymbol{\beta}}-\mathbf{h})^\top (\mathbf{L}(\mathbf{X}^\top \mathbf{X})^{-1}\mathbf{L}^\top)^{-1} (\mathbf{L}\widehat{\boldsymbol{\beta}}-\mathbf{h})} {r \, MSE_F} \\ &&\\ & \stackrel{H_0}{\sim} & F(r,n-p) \end{eqnarray*} \pause } % End size Equal to full-restricted formula. \end{frame} \begin{frame} \frametitle{Are the $x$ values really constants?} \framesubtitle{$Y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i$} \begin{itemize} \item In the general linear regression model, the $\mathbf{X}$ matrix is supposed to be full of fixed constants. \pause \item This is convenient mathematically. \pause Think of $E(\widehat{\boldsymbol{\beta}})$. \pause \item But in any non-experimental study, if you selected another sample, you'd get different $\mathbf{X}$ values, because of random sampling. \pause \item So $\mathbf{X}$ should be at least partly random variables, not fixed. \pause \item View the usual model as \emph{conditional} on $\mathbf{X}=\mathbf{x}$. \pause \item All the usual probabilities and expected values are \emph{conditional} probabilities and \emph{conditional} expected values. \pause \item But this would seem to mean that the \emph{conclusions} are also conditional on $\mathbf{X}=\mathbf{x}$. \end{itemize} \end{frame} \begin{frame} \frametitle{$\widehat{\boldsymbol{\beta}}$ is (conditionally) unbiased} %\framesubtitle{} {\Large \begin{displaymath} E(\widehat{\boldsymbol{\beta}}|\mathbf{X}=\mathbf{x}) = \boldsymbol{\beta} \pause \mbox{ for \emph{any} fixed }\mathbf{x}. 
\end{displaymath} } % End size \vspace{5mm} \pause It's \emph{unconditionally} unbiased too. \vspace{5mm} {\Large \begin{displaymath} E\{\widehat{\boldsymbol{\beta}}\} = E\{E\{\widehat{\boldsymbol{\beta}}|\mathbf{X}\}\} = E\{\boldsymbol{\beta}\} \pause = \boldsymbol{\beta} \end{displaymath} } % End size \end{frame} \begin{frame} \frametitle{Perhaps Clearer} %\framesubtitle{} \begin{eqnarray*} E\{\widehat{\boldsymbol{\beta}}\} &=& E\{E\{\widehat{\boldsymbol{\beta}}|\mathbf{X}\}\} \\ \pause &=& \int \cdots \int E\{\widehat{\boldsymbol{\beta}}|\mathbf{X}=\mathbf{x}\} \, f(\mathbf{x}) \, d\mathbf{x} \\ \pause &=& \int \cdots \int \boldsymbol{\beta} \, f(\mathbf{x}) \, d\mathbf{x} \\ \pause &=& \boldsymbol{\beta} \int \cdots \int f(\mathbf{x})\, d\mathbf{x} \\ \pause &=& \boldsymbol{\beta} \cdot 1 = \boldsymbol{\beta}. \end{eqnarray*} \end{frame} \begin{frame} \frametitle{Conditional size $\alpha$ test, Critical region $A$} \pause %\framesubtitle{} {\LARGE \begin{displaymath} Pr\{F \in A | \mathbf{X}=\mathbf{x} \} = \alpha \end{displaymath} \pause } % End size % \vspace{3mm} \begin{eqnarray*} Pr\{F \in A \} &=& \int \cdots \int Pr\{F \in A | \mathbf{X}=\mathbf{x} \} f(\mathbf{x})\, d\mathbf{x} \\ \pause &=& \int \cdots \int \alpha f(\mathbf{x})\, d\mathbf{x} \\ \pause &=& \alpha \int \cdots \int f(\mathbf{x})\, d\mathbf{x} \\ \pause &=& \alpha \end{eqnarray*} \end{frame} \begin{frame} \frametitle{The moral of the story} \pause %\framesubtitle{} \begin{itemize} \item Don't worry. \pause \item Even though $X$ variables are often random, we can apply the usual fixed-$x$ model without fear. \pause \item Estimators are still unbiased. \pause \item Tests have the right Type I error probability. \pause \item Similar arguments apply to confidence intervals and prediction intervals. \pause \item And it's all distribution-free with respect to $X$. 
\end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Copyright Information} This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf17} {\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf17}} \end{frame} \end{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{itemize} \item \item \item \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # For scatterplot slides HS_GPA <- round(rnorm(100,80,7)); sort(HS_GPA) HS_GPA[HS_GPA>100] <- 100 Univ_GPA <- round(5 + .9 * HS_GPA + rnorm(100,0,5)); sort(Univ_GPA) cbind(HS_GPA,Univ_GPA) b <- coefficients(lm(Univ_GPA~HS_GPA)); b; b[1] x1 <- 60; x2 <- 97 y1 <- b[1] + b[2] * x1 y2 <- b[1] + b[2] * x2 plot(HS_GPA,Univ_GPA) lines(c(x1,x2),c(y1,y2)) % 3-d x1 = c(0,0,1,1); x2 = c(0,1,0,1) plot(x1,x2,pch=' ',xlab=expression(x[1]),ylab=expression(x[2])) text(1,0,'A'); text(0,1,'B'); text(0,0,'C')