\documentclass[11pt]{article}
%\usepackage{amsbsy} % for \boldsymbol and \pmb
\usepackage{graphicx} % To include pdf files!
\usepackage{amsmath}
\usepackage{amsbsy}
\usepackage{amsfonts}
\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=blue]{hyperref} % For links
\usepackage{fullpage}
%\pagestyle{empty} % No page numbers

\begin{document}
%\enlargethispage*{1000 pt}

\begin{center}
{\Large \textbf{STA 2101/442 Assignment Four}}\footnote{Copyright information is at the end of the last page.}
\vspace{1 mm}
\end{center}

\noindent
One version of the general linear model with fixed effects is $\mathbf{y}= X\boldsymbol{\beta}+\boldsymbol{\epsilon}$, where
\begin{itemize}
\item $X$ is an $n \times p$ matrix of known constants with $n>p$ and the columns of $X$ linearly independent.
\item $\boldsymbol{\beta}$ is a $p \times 1$ vector of unknown constants.
\item $\boldsymbol{\epsilon}$ is an $n \times 1$ random vector with $E(\boldsymbol{\epsilon}) = \mathbf{0}$ and $cov(\boldsymbol{\epsilon}) = \sigma^2 I_n$.
\item $\sigma^2>0$ is an unknown constant.
\end{itemize}
The least squares estimate of $\boldsymbol{\beta}$ is $\widehat{\boldsymbol{\beta}} = (X^\top X)^{-1} X^\top \mathbf{y}$, the vector of predicted $y$ values is $\widehat{\mathbf{y}} = X\widehat{\boldsymbol{\beta}}$, and the vector of residuals is $\mathbf{e} = \mathbf{y} - \widehat{\mathbf{y}}$.

\vspace{1mm}
\noindent
The questions are just practice for the quiz, and are not to be handed in.
% Use R as necessary for Question~\ref{integration}, and \textbf{bring your printout to the quiz.}
% Commented out: there is no Question labelled "integration" in this assignment.
\vspace{1mm}

\begin{enumerate}
\item Give the dimensions (number of rows and number of columns) of the following matrices.
\begin{enumerate}
\item $\mathbf{y}$
\item $\boldsymbol{\beta}$
\item $X\boldsymbol{\beta}$
\item $(X^\top X)^{-1}$
\item $\widehat{\boldsymbol{\beta}}$
\item $\widehat{\mathbf{y}}$
\item $\mathbf{e}$
\item $\mathbf{e}^\top\mathbf{e}$
\item $\boldsymbol{\epsilon}\boldsymbol{\epsilon}^\top$
\item $X^\top \mathbf{e}$
\end{enumerate}

\item \label{perpendicular} Show $X^\top \mathbf{e} = \mathbf{0}$.

\item Why does $X^\top\mathbf{e}=\mathbf{0}$ tell you that if a regression model has an intercept, the residuals must add up to zero?

\item \label{ls} Let $\mathcal{S}(\boldsymbol{\beta}) = (\mathbf{y}-X\boldsymbol{\beta})^\top (\mathbf{y}-X\boldsymbol{\beta})$. Note that this is the sum of squared differences between the $y_i$ observations and their expected values, and the $\beta_j$ values that minimize it are the least squares estimates.
\begin{enumerate}
\item Show that $\mathcal{S}(\boldsymbol{\beta}) = (\mathbf{y}-X\widehat{\boldsymbol{\beta}})^\top (\mathbf{y}-X\widehat{\boldsymbol{\beta}}) + (\widehat{\boldsymbol{\beta}}-\boldsymbol{\beta})^\top (X^\top X) (\widehat{\boldsymbol{\beta}}-\boldsymbol{\beta})$. Hint: Add and subtract $\widehat{\mathbf{y}}$.
\item Why does this imply that the minimum of $\mathcal{S}(\boldsymbol{\beta})$ occurs at $\boldsymbol{\beta} = \widehat{\boldsymbol{\beta}}$?
\item The columns of $X$ are linearly independent. Why does linear independence guarantee that the minimum is unique?
\item Isn't it nice to be able to do this without calculus?
\end{enumerate}
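If you would like to check your work numerically, here is a small R sketch. It is purely optional and not part of the assignment; the data are simulated and all of the variable names are mine. It verifies $X^\top \mathbf{e} = \mathbf{0}$ and the decomposition in part (a) on a toy example.
\begin{verbatim}
# Optional numerical check with simulated data (all names are mine)
set.seed(9999)
n <- 10
X <- cbind(1, rnorm(n), rnorm(n))            # n x p matrix, with p = 3
y <- X %*% c(1, 2, -1) + rnorm(n)            # response generated from the model
betahat <- solve(t(X) %*% X) %*% t(X) %*% y  # least squares estimate
e <- y - X %*% betahat                       # residuals
t(X) %*% e                                   # zero vector, up to rounding error
beta <- c(0, 1, 1)                           # an arbitrary beta, for part (a)
S <- t(y - X %*% beta) %*% (y - X %*% beta)
S - (t(y - X %*% betahat) %*% (y - X %*% betahat) +
     t(betahat - beta) %*% (t(X) %*% X) %*% (betahat - beta))  # zero
\end{verbatim}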
\newpage
\item The ``hat'' matrix is $H = X(X^\top X)^{-1} X^\top$.
\begin{enumerate}
\item What are the dimensions (number of rows and columns) of $H$?
\item Show that $H$ is symmetric.
\item Show that $H$ is idempotent, meaning $H = H^2$.
\item Show that $\widehat{\mathbf{y}} = H\mathbf{y}$.
\item Using $tr(AB)=tr(BA)$, find $tr(H)$. (That's the trace, the sum of the diagonal elements.)
\item Show that $I-H$ is symmetric.
\item Show that $I-H$ is idempotent.
\item Show that $X^\top(I-H) = \mathbf{0}$.
\item Show that $\mathbf{e} = (I-H)\mathbf{y}$.
\item \label{Me} Show that $\mathbf{e}=(I-H)\boldsymbol{\epsilon}$.
\item Find $tr(I-H)$.
\end{enumerate}

\item In simple regression through the origin, there is one explanatory variable and no intercept. The model is $y_i = \beta_1 x_i + \epsilon_i$.
\begin{enumerate}
\item \label{calc} Find the least squares estimator of $\beta_1$ with calculus.
\item What is the $X$ matrix?
\item What is $X^\top X$?
\item What is $X^\top \mathbf{y}$?
\item What is $(X^\top X)^{-1}$?
\item What is $\widehat{\beta}_1 = (X^\top X)^{-1}X^\top\mathbf{y}$? Compare this with your answer to~\ref{calc}.
\end{enumerate}

\item There can even be a regression model with an intercept and no explanatory variables. In this case the model would be $y_i = \beta_0 + \epsilon_i$.
\begin{enumerate}
\item \label{ybar} Find the least squares estimator of $\beta_0$ with calculus. What's a least squares estimator again? Find the parameter value(s) that make the $y_i$ observations as close as possible to their expected values.
\item What is the $X$ matrix?
\item What is $X^\top X$?
\item What is $X^\top \mathbf{y}$?
\item What is $(X^\top X)^{-1}$?
\item What is $\widehat{\beta}_0 = (X^\top X)^{-1}X^\top\mathbf{y}$? Compare this with your answer to~\ref{ybar}.
\end{enumerate}

\newpage
\item The set of vectors $\mathcal{V} = \{\mathbf{v} = X\mathbf{a}: \mathbf{a} \in \mathbb{R}^{p}\}$ is the subset of $\mathbb{R}^{n}$ consisting of all linear combinations of the columns of $X$. That is, $\mathcal{V}$ is the space \emph{spanned} by the columns of $X$. The least squares estimator $\widehat{\boldsymbol{\beta}} = (X^\top X)^{-1}X^\top\mathbf{y}$ was obtained by minimizing $(\mathbf{y}-X\mathbf{a})^\top(\mathbf{y}-X\mathbf{a})$ over all $\mathbf{a} \in \mathbb{R}^{p}$. Thus, $\widehat{\mathbf{y}} = X\widehat{\boldsymbol{\beta}}$ is the point in $\mathcal{V}$ that is \emph{closest} to the data vector $\mathbf{y}$. Geometrically, $\widehat{\mathbf{y}}$ is the \emph{projection} (shadow) of $\mathbf{y}$ onto $\mathcal{V}$. The hat matrix $H$ is a \emph{projection matrix}: it projects any point in $\mathbb{R}^{n}$ onto $\mathcal{V}$. Now we will verify several consequences of this idea.
\begin{enumerate}
\item The shadow of a point already in $\mathcal{V}$ should be the point itself. Show that if $\mathbf{v} \in \mathcal{V}$, then $H\mathbf{v}= \mathbf{v}$.
\item The vector of differences $\mathbf{e} = \mathbf{y} - \widehat{\mathbf{y}}$ should be perpendicular (at right angles) to each and every basis vector of $\mathcal{V}$. How is this related to Question~\ref{perpendicular}?
\item Show that the vector of residuals $\mathbf{e}$ is perpendicular to any $\mathbf{v} \in \mathcal{V}$.
\item If $\mathbf{u}$ is a general point in $\mathbb{R}^n$, show that $(I-H)\mathbf{u}$ is perpendicular to any $\mathbf{v} \in \mathcal{V}$ (this includes $\mathbf{e}$).
\end{enumerate}

\item The linear regression model with intercept can be written in scalar form as $y_i = \beta_0 + \beta_1 x_{i,1} + \cdots + \beta_{p-1} x_{i,p-1} + \epsilon_i$. Defining $SSTO=\sum_{i=1}^n(y_i-\overline{y})^2$, $SSR = \sum_{i=1}^n(\widehat{y}_i-\overline{y})^2$ and $SSE=\sum_{i=1}^n(y_i-\widehat{y}_i)^2$, show $SSTO=SSE+SSR$.
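Again, a numerical check is available if you want one. This optional R sketch (simulated data, my own variable names) confirms the hat matrix properties above and the decomposition $SSTO=SSE+SSR$; the proofs, of course, are up to you.
\begin{verbatim}
# Optional numerical check of the hat matrix (simulated data, names are mine)
set.seed(9999)
n <- 10; p <- 3
X <- cbind(1, rnorm(n), rnorm(n))
y <- X %*% c(1, 2, -1) + rnorm(n)
H <- X %*% solve(t(X) %*% X) %*% t(X)  # the hat matrix
max(abs(H - t(H)))                     # symmetric: zero up to rounding
max(abs(H %*% H - H))                  # idempotent: zero up to rounding
sum(diag(H))                           # the trace -- compare your answer
yhat <- H %*% y                        # predicted values
SSTO <- sum((y - mean(y))^2)
SSE  <- sum((y - yhat)^2)
SSR  <- sum((yhat - mean(y))^2)
SSTO - (SSE + SSR)                     # zero, up to rounding error
\end{verbatim}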
\item In this question you will show that $MSE = SSE/(n-p)$ is an unbiased estimator of the error variance $\sigma^2$, using $tr(AB)=tr(BA)$. Start with the fact that the trace of a $1 \times 1$ matrix is just the matrix itself, like this: $E(\mathbf{e}^\top\mathbf{e}) = E(tr(\mathbf{e}^\top\mathbf{e})) \ldots$~~ Please continue; Question~\ref{Me} is helpful.

\item Recall that if $\mathbf{w} \sim N_p(\boldsymbol{\mu}, \Sigma)$ and $A$ is a $q \times p$ matrix of constants, then $A\mathbf{w}+\mathbf{c} \sim N_q(A\boldsymbol{\mu}+\mathbf{c}, A\Sigma A^\top)$. If $\boldsymbol{\epsilon} \sim N_n(\mathbf{0},\sigma^2 I_n)$ in the general linear model, give the distributions of the following random vectors. Simplify as much as possible.
\begin{enumerate}
\item $\mathbf{y}$
\item $\boldsymbol{\beta}$
\item $\widehat{\boldsymbol{\beta}}$
\item $\widehat{\mathbf{y}}$
\item $\mathbf{e}$
\item $X^\top \mathbf{e}$
\end{enumerate}

\item Let $\mathbf{X}= (X_1,X_2,X_3)^\top$ be multivariate normal with
\begin{displaymath}
\boldsymbol{\mu} = \left[ \begin{array}{c} 1 \\ 0 \\ 6 \end{array} \right]
\mbox{ and }
\boldsymbol{\Sigma} = \left[ \begin{array}{c c c} 1 & 0 & 0 \\ 0 & 2 & 0 \\ 0 & 0 & 1 \end{array} \right].
\end{displaymath}
Let $Y_1=X_1+X_2$ and $Y_2=X_2+X_3$. Find the joint distribution of $Y_1$ and $Y_2$.

\item Let $X_1$ be Normal$(\mu_1, \sigma^2_1)$. Let $X_2$ be Normal$(\mu_2, \sigma^2_2)$, independent of $X_1$. What is the joint distribution of $Y_1=X_1+X_2$ and $Y_2=X_1-X_2$? What is required for $Y_1$ and $Y_2$ to be independent? Do it the easy way.
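For the two multivariate normal questions above, the answers should be worked out by hand, but if you want to check the matrix arithmetic, here is a short optional R sketch for the first of them (the layout of \texttt{A}, \texttt{mu} and \texttt{Sigma} is mine):
\begin{verbatim}
# Matrix arithmetic for the joint distribution of (Y1, Y2):
# Y = A X is multivariate normal with mean A mu and covariance A Sigma A'
A     <- rbind(c(1, 1, 0),   # Y1 = X1 + X2
               c(0, 1, 1))   # Y2 = X2 + X3
mu    <- c(1, 0, 6)
Sigma <- diag(c(1, 2, 1))
A %*% mu                     # mean vector of (Y1, Y2)
A %*% Sigma %*% t(A)         # covariance matrix of (Y1, Y2)
\end{verbatim}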
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\item High School History classes from across Ontario are randomly assigned to either a discovery-oriented or a memory-oriented curriculum in Canadian history. At the end of the year, the students are given a standardized test and the median score of each class is recorded. Please consider a regression model with these variables:
\begin{itemize}
\item[$X_1$] Equals 1 if the class uses the discovery-oriented curriculum, and equals 0 if the class uses the memory-oriented curriculum.
\item[$X_2$] Average parents' education for the classroom.
\item[$X_3$] Average family income for the classroom.
\item[$X_4$] Number of university History courses taken by the teacher.
\item[$X_5$] Teacher's final cumulative university grade point average.
\item[$Y$] Class median score on the standardized history test.
\end{itemize}
The full regression model (as opposed to the reduced models for various null hypotheses) implies
\begin{displaymath}
E[Y|X] = \beta_0 + \beta_1X_1 + \beta_2X_2 + \beta_3X_3 + \beta_4X_4 + \beta_5X_5.
\end{displaymath}
For each question below, please give
\begin{itemize}
\item The null hypothesis in terms of $\beta$ values.
\item $E[Y|X]$ for the reduced model you would use to answer the question. Don't re-number the variables.
\end{itemize}
(An optional R illustration of testing a full against a reduced model appears after the last question.)
\vspace{2mm}
\begin{enumerate}
\item If you control for parents' education and income and for teacher's university background, does curriculum type affect test scores? (And why is it okay to use the word ``affect''?)
\item Controlling for parents' education and income and for curriculum type, is teacher's university background (two variables) related to their students' test performance?
\item Controlling for teacher's university background and for curriculum type, are parents' education and family income (considered simultaneously) related to students' test performance?
\item Controlling for curriculum type, teacher's university background and parents' education, is parents' income related to students' test performance?
\item Here is one final question. Assuming that $X_1, \ldots, X_5$ are random variables (and I hope you agree that they are), would you expect $X_1$ to be related to the other explanatory variables? Would you expect the other explanatory variables to be related to each other?
\end{enumerate}

\newpage
\item The U.S. Census Bureau divides the United States into small pieces called census tracts; lots of information is collected about each census tract. The census tracts are grouped into four geographic regions: Northeast, North Central, South and West. In one study, the cases were census tracts, the explanatory variables were Region and average income, and the response variable was crime rate, defined as the number of reported serious crimes in a census tract divided by the number of people in the census tract.
\begin{enumerate}
\item Write $E(Y|x)$ for a regression model with parallel regression lines. You do not have to say how your dummy variables are defined. You will do that in the next part.
\item Make a table showing how your dummy variables are set up. There should be one row for each region, and a column for each dummy variable. Add a wider column on the right, in which you show $E(Y|x)$ for each region. Note that the \emph{symbols} for your dummy variables will not appear in this column. There are examples of this format in the lecture slides and the text.
\item For each of the following questions, give the null hypothesis in terms of the $\beta$ parameters of your regression model. We are not doing one-tailed tests, regardless of how the question is phrased.
\begin{enumerate}
\item Controlling for income, does average crime rate differ by geographic region?
\item Controlling for income, is average crime rate different in the Northeast and North Central regions?
\item Controlling for income, is average crime rate different in the Northeast and Western regions?
\item Controlling for income, is the crime rate in the South more than the average of the other three regions?
\item Controlling for income, is the average crime rate in the Northeast and North Central regions different from the average of the South and West?
\item Controlling for geographic region, is crime rate connected to income?
\end{enumerate}
\end{enumerate}
\end{enumerate}
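\vspace{2mm}
\noindent
The hypothesis tests in the last two questions are all comparisons of a full model with a reduced model. As a purely illustrative sketch, not part of the assignment, here is how such a comparison can be carried out in R. The data are simulated, and the variable names and numbers are all mine; only the \texttt{lm} and \texttt{anova} pattern is the point.
\begin{verbatim}
# Illustrative full-versus-reduced comparison with simulated data
set.seed(9999)
n  <- 200
x1 <- rbinom(n, 1, 0.5)            # curriculum type (dummy variable)
x2 <- rnorm(n, 14, 2)              # average parents' education
x3 <- rnorm(n, 80, 15)             # average family income
x4 <- rpois(n, 3)                  # teacher's university History courses
x5 <- rnorm(n, 3, 0.3)             # teacher's grade point average
y  <- 60 + 5*x1 + 2*x2 + 0.1*x3 + rnorm(n, 0, 10)
full    <- lm(y ~ x1 + x2 + x3 + x4 + x5)
reduced <- lm(y ~ x2 + x3 + x4 + x5)   # reduced model under H0: beta1 = 0
anova(reduced, full)                   # F test of the null hypothesis
\end{verbatim}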
\vspace{50mm}
\noindent
\begin{center}\begin{tabular}{l} \hspace{6in} \\ \hline \end{tabular}\end{center}
This assignment was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf16}{\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf16}}

\end{document}