\documentclass[12pt]{article}
%\usepackage{amsbsy} % for \boldsymbol and \pmb
\usepackage{graphicx} % To include pdf files!
\usepackage{amsmath}
\usepackage{amsbsy}
\usepackage{amsfonts} % for \mathbb{R} The set of reals
\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=blue]{hyperref} % For links
\usepackage{fullpage}
%\pagestyle{empty} % No page numbers

\begin{document}
%\enlargethispage*{1000 pt}

\begin{center}
{\Large \textbf{STA 302f17 Assignment Six}}\footnote{Copyright information is at the end of the last page.}
\vspace{1 mm}
\end{center}

\noindent These problems are preparation for the quiz in tutorial on Thursday November 2nd, and are not to be handed in.

\begin{enumerate}

\item Show that if $\mathbf{w} \sim N_p(\boldsymbol{\mu},\Sigma)$, with $\Sigma$ positive definite, then $y = (\mathbf{w}-\boldsymbol{\mu})^\prime \Sigma^{-1}(\mathbf{w}-\boldsymbol{\mu})$ has a chi-squared distribution with $p$ degrees of freedom.

\item \label{normalsample} Let $y_1, \ldots, y_n$ be a random sample from a $N(\mu,\sigma^2)$ distribution. The sample variance is $s^2 = \frac{\sum_{i=1}^n\left(y_i-\overline{y} \right)^2}{n-1}$.
\begin{enumerate}
\item Show $Cov(\overline{y},y_j-\overline{y})=0$ for every $j=1, \ldots, n$.
\item How do you know that $\overline{y}$ and $s^2$ are independent?
\item Show that
\begin{displaymath}
\frac{(n-1)s^2}{\sigma^2} \sim \chi^2(n-1).
\end{displaymath}
Hint: $\sum_{i=1}^n\left(y_i-\mu \right)^2 = \sum_{i=1}^n\left(y_i-\overline{y} + \overline{y} - \mu \right)^2 = \ldots$
\end{enumerate}

\item Recall the definition of the $t$ distribution. If $z \sim N(0,1)$, $w \sim \chi^2(\nu)$ and $z$ and $w$ are independent, then $t = \frac{z}{\sqrt{w/\nu}}$ is said to have a $t$ distribution with $\nu$ degrees of freedom, and we write $t \sim t(\nu)$. As in the last question, let $y_1, \ldots, y_n$ be a random sample from a $N(\mu,\sigma^2)$ distribution. Show that $t = \frac{\sqrt{n}(\overline{y}-\mu)}{s} \sim t(n-1)$.

\item For the general linear regression model with normal error terms, prove that the $(k+1)\times n$ matrix of covariances $Cov(\mathbf{b},\mathbf{e}) = \mathbf{0}$. Why does this show that $SSE = \mathbf{e}^\prime\mathbf{e}$ and $\mathbf{b}$ are independent?

\item Calculate $Cov(\mathbf{e},\widehat{\mathbf{y}})$; show your work. Why should you have known this answer without doing the calculation, assuming normal error terms? Why does the assumption of normality matter?

\item In an earlier assignment, you proved that
\begin{displaymath}
(\mathbf{y}-X\boldsymbol{\beta})^\prime (\mathbf{y}-X\boldsymbol{\beta}) = \mathbf{e}^\prime \, \mathbf{e} + (\mathbf{b}-\boldsymbol{\beta})^\prime X^\prime X(\mathbf{b}-\boldsymbol{\beta}).
\end{displaymath}
Starting with this expression and assuming normality, show that $\mathbf{e}^\prime \, \mathbf{e}/\sigma^2 \sim \chi^2(n-k-1)$. Use the formula sheet.
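Hint: here is a sketch of one possible route, assuming the formula sheet gives the $\chi^2(\nu)$ moment-generating function $M(t) = (1-2t)^{-\nu/2}$. Dividing both sides by $\sigma^2$ and applying the first problem of this assignment to each quadratic form,
\begin{displaymath}
\underbrace{\frac{(\mathbf{y}-X\boldsymbol{\beta})^\prime(\mathbf{y}-X\boldsymbol{\beta})}{\sigma^2}}_{\sim \, \chi^2(n)}
= \frac{\mathbf{e}^\prime\mathbf{e}}{\sigma^2}
+ \underbrace{\frac{(\mathbf{b}-\boldsymbol{\beta})^\prime X^\prime X (\mathbf{b}-\boldsymbol{\beta})}{\sigma^2}}_{\sim \, \chi^2(k+1)},
\end{displaymath}
where the two terms on the right side are independent because $Cov(\mathbf{b},\mathbf{e}) = \mathbf{0}$ and the errors are normal. Multiplying moment-generating functions then gives $(1-2t)^{-n/2} = M_{\mathbf{e}^\prime\mathbf{e}/\sigma^2}(t) \, (1-2t)^{-(k+1)/2}$; solve for $M_{\mathbf{e}^\prime\mathbf{e}/\sigma^2}(t)$ and identify the distribution.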
\newpage

\item The $t$ distribution is defined as follows. Let $Z\sim N(0,1)$ and $W \sim \chi^2(\nu)$, with $Z$ and $W$ independent. Then $T = \frac{Z}{\sqrt{W/\nu}}$ is said to have a $t$ distribution with $\nu$ degrees of freedom, and we write $T \sim t(\nu)$. For the general fixed effects linear regression model, tests and confidence intervals for linear combinations of regression coefficients are very useful. Derive the appropriate $t$ distribution and some applications by following these steps. Let $\boldsymbol{\ell}$ be a $(k+1) \times 1$ vector of constants. A sketch of the target form appears after the list below.
\begin{enumerate}
\item What is the distribution of $\boldsymbol{\ell}^\prime \mathbf{b}$? Your answer includes both the expected value and the variance.
\item Now standardize $\boldsymbol{\ell}^\prime \mathbf{b}$ (subtract off the mean and divide by the standard deviation) to obtain a standard normal.
\item Divide by the square root of a well-chosen chi-squared random variable, divided by its degrees of freedom, and simplify. Call the result $t$.
\item How do you know that the numerator and denominator are independent?
\item Suppose you wanted to test $H_0: \boldsymbol{\ell}^\prime\boldsymbol{\beta} = \gamma$. Write down a formula for the test statistic. A statistic is a function of the sample data that is \emph{not} a function of any unknown parameters.
\item For a regression model with four independent variables, suppose you wanted to test $H_0: \beta_2=0$. Give the vector $\boldsymbol{\ell}$.
\item For a regression model with four independent variables, suppose you wanted to test $H_0: \beta_1=\beta_2$. Give the vector $\boldsymbol{\ell}$.
\item Consider a data set in which there are $n$ first-year students in ECO100. $x_1$ is High School Calculus mark, $x_2$ is High School grade point average, $x_3$ is score on a test of general mathematical knowledge, and $y$ is mark in ECO100. You seek to estimate the expected mark for a student with a 91\% in High School Calculus, a High School GPA of 83\%, and 24 out of 25 on the test. You are estimating $\boldsymbol{\ell}^\prime\boldsymbol{\beta}$. Give the vector $\boldsymbol{\ell}$.
\item Letting $t_{\alpha/2}$ denote the point cutting off the top $\alpha/2$ of the $t$ distribution with $n-k-1$ degrees of freedom, derive the $(1-\alpha) \times 100\%$ confidence interval for $\boldsymbol{\ell}^\prime\boldsymbol{\beta}$. ``Derive'' means show the High School algebra.
\end{enumerate}
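Hint: if your work in the first few parts is correct, the statistic should simplify to the form below. This is a sketch assuming the formula sheet's $MSE = \mathbf{e}^\prime\mathbf{e}/(n-k-1)$:
\begin{displaymath}
t = \frac{\boldsymbol{\ell}^\prime\mathbf{b} - \boldsymbol{\ell}^\prime\boldsymbol{\beta}}
{\sqrt{MSE \, \boldsymbol{\ell}^\prime (X^\prime X)^{-1} \boldsymbol{\ell}}} \sim t(n-k-1).
\end{displaymath}
For the confidence interval, start from $1-\alpha = Pr\{-t_{\alpha/2} < t < t_{\alpha/2}\}$ and isolate $\boldsymbol{\ell}^\prime\boldsymbol{\beta}$ in the middle of the inequality.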
\newpage

% General linear test
\item For the general linear model with normal errors,
\begin{enumerate}
\item Let $C$ be an $m \times (k+1)$ matrix of constants with linearly independent rows. What is the distribution of $C\mathbf{b}$?
\item If $H_0: C\boldsymbol{\beta} = \boldsymbol{\gamma}$ is true, what is the distribution of $\frac{1}{\sigma^2}(C\mathbf{b}-\boldsymbol{\gamma})^\prime \left(C(X^\prime X)^{-1}C^\prime\right)^{-1} (C\mathbf{b}-\boldsymbol{\gamma})$? Please locate support for your answer on the formula sheet. For full marks, don't forget the degrees of freedom.
\item What other facts on the formula sheet allow you to establish the $F$ distribution for the general linear test? The distribution is \emph{given} on the formula sheet, so of course you can't use that. In particular, how do you know that the numerator and denominator are independent?
\end{enumerate}

\item \label{tsq} Suppose you wish to test the null hypothesis that a \emph{single} linear combination of regression coefficients is equal to zero. That is, you want to test $H_0: \boldsymbol{\ell}^\prime\boldsymbol{\beta} = 0$. Referring to the formula sheet, verify that $F=t^2$. Show your work.

\item The exact way that you express a linear null hypothesis does not matter. Let $A$ be an $m \times m$ nonsingular matrix (meaning $A^{-1}$ exists), so that $C\boldsymbol{\beta} = \boldsymbol{\gamma}$ if and only if $AC\boldsymbol{\beta} = A\boldsymbol{\gamma}$. This is a useful way to express a logically equivalent linear null hypothesis. Show that the general linear test statistic $F$ for testing $H_0: (AC)\boldsymbol{\beta} = A\boldsymbol{\gamma}$ is the same as the one for testing $H_0: C\boldsymbol{\beta} = \boldsymbol{\gamma}$.

% Initial test
\item For the general linear regression model with normal error terms, show that if the model has an intercept, then $\mathbf{e}$ and $\overline{y}$ are independent. If you can show that $\overline{y}$ is a function of $\mathbf{b}$, you are done (why?). Here are some ingredients to start you out. For the model with intercept,
\begin{enumerate}
\item What does $X^\prime\mathbf{e} = \mathbf{0}$ tell you about $\sum_{i=1}^n e_i$?
\item Therefore, what do you know about $\sum_{i=1}^n y_i$ and $\sum_{i=1}^n \widehat{y}_i$?
\item Now show that $\mathbf{e}$ and $\overline{y}$ are independent.
\end{enumerate}

\item Carefully examine the formulas for $SST=SSE+SSR$ on the formula sheet. How do you know that $SSR$ and $SSE$ are independent if the model has an intercept?

\item Continue assuming that the regression model has an intercept. Many statistical programs automatically provide an \emph{overall} test that says none of the independent variables makes any difference. If you can't reject that, you're in trouble. Supposing $H_0: \beta_1 = \cdots = \beta_k = 0$ is true,
\begin{enumerate}
\item What is the distribution of $y_i$ under $H_0$?
\item What is the distribution of $\frac{SST}{\sigma^2}$? Just write down the answer. Check Problem~\ref{normalsample}.
\end{enumerate}

\item Still assuming $H_0: \beta_1 = \cdots = \beta_k = 0$ is true and the model has an intercept, what is the distribution of $SSR/\sigma^2$? Use the formula sheet and show your work. Don't forget the degrees of freedom.

\item \label{Fstat} Recall the definition of the $F$ distribution. If $W_1 \sim \chi^2(\nu_1)$ and $W_2 \sim \chi^2(\nu_2)$ are independent, then $F = \frac{W_1/\nu_1}{W_2/\nu_2} \sim F(\nu_1,\nu_2)$. Show that $F = \frac{SSR/k}{SSE/(n-k-1)}$ has an $F$ distribution under $H_0: \beta_1 = \cdots = \beta_k = 0$. Refer to the results of the questions above as you use them.

\item The null hypothesis $H_0: \beta_1 = \cdots = \beta_k = 0$ is less and less believable as $R^2$ becomes larger. Show that the $F$ statistic of Question~\ref{Fstat} is an increasing function of $R^2$ for fixed $n$ and $k$. This means it makes sense to reject $H_0$ for large values of $F$.
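Hint: a sketch of the algebra, assuming the formula sheet's $R^2 = SSR/SST$. Dividing the numerator and denominator of the $F$ statistic by $SST$,
\begin{displaymath}
F = \frac{SSR/k}{SSE/(n-k-1)} = \left(\frac{n-k-1}{k}\right) \frac{R^2}{1-R^2},
\end{displaymath}
which is increasing in $R^2$ for $0 \leq R^2 < 1$.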
\end{enumerate}

% \vspace{90mm}
\noindent
\begin{center}
\begin{tabular}{l} \hspace{6in} \\ \hline \end{tabular}
\end{center}
This assignment was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/~brunner/oldclass/302f17}{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/302f17}}

\end{document}