\documentclass[11pt]{article}
%\usepackage{amsbsy} % for \boldsymbol and \pmb
\usepackage{graphicx} % To include pdf files!
\usepackage{amsmath}
\usepackage{amsbsy}
\usepackage{amsfonts}
\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=blue]{hyperref} % For links
% \usepackage{fullpage}
%\pagestyle{empty} % No page numbers
% To use more of the top and bottom margins than fullpage
\oddsidemargin=-.2in % Good for US Letter paper
\evensidemargin=-.2in
\textwidth=6.6in
\topmargin=-1.1in
\headheight=0.2in
\headsep=0.5in
\textheight=9.4in

\begin{document}
%\enlargethispage*{1000 pt}
\begin{center}
{\Large \textbf{STA 2053 Assignment 2 (Large-sample and Regression)}}\footnote{This assignment was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/brunner/oldclass/2053f22}
{\texttt{http://www.utstat.toronto.edu/brunner/oldclass/2053f22}}}
\vspace{1 mm}
\end{center}

\noindent The paper and pencil questions are not to be handed in. They are practice for the quiz on October 17th. Bring a hard copy of your input and output for Question~\ref{num} to the quiz. It may be handed in.

\begin{enumerate} % Large-sample first

\item Let $X_1 , \ldots, X_n$ be a random sample from a Binomial distribution with parameters $3$ and $\theta$. That is,
\begin{displaymath}
P(X_i = x_i) = \binom{3}{x_i} \theta^{x_i} (1-\theta)^{3-x_i},
\end{displaymath}
for $x_i=0,1,2,3$. Choose a reasonable estimator of $\theta$, and prove that it is strongly consistent. Where you get your estimator does not really matter, but please state how you thought of it. This question can be quick.

\item Let $X_1 , \ldots, X_n$ be a random sample from a continuous distribution with density
\begin{displaymath}
f(x;\tau) = \frac{\tau^{1/2}}{\sqrt{2\pi}} \, e^{-\frac{\tau x^2}{2}},
\end{displaymath}
where the parameter $\tau>0$. Let
\begin{displaymath}
\widehat{\tau} = \frac{n}{\sum_{i=1}^n X_i^2}.
\end{displaymath}
Is $\widehat{\tau}$ consistent for $\tau$? Answer Yes or No and prove your answer. Hint: You can just write down $E(X^2)$ by inspection. This is a very familiar distribution; have confidence!

\item Let $X_1 , \ldots, X_n$ be a random sample from a Gamma distribution with $\alpha=\beta=\theta>0$. That is, the density is
\begin{displaymath}
f(x;\theta) = \frac{1}{\theta^\theta \Gamma(\theta)} e^{-x/\theta} x^{\theta-1},
\end{displaymath}
for $x>0$. Let $\widehat{\theta} = \overline{X}_n$. Is $\widehat{\theta}$ consistent for $\theta$? Answer Yes or No and prove your answer.

\item \label{thruorigin} Independently for $i = 1 , \ldots, n$, let
\begin{displaymath}
Y_i = \beta X_i + \epsilon_i,
\end{displaymath}
where $E(X_i)=\mu$, $E(\epsilon_i)=0$, $Var(X_i)=\sigma^2_x$, $Var(\epsilon_i)=\sigma^2_\epsilon$, and $\epsilon_i$ is independent of $X_i$. Let
\begin{displaymath}
\widehat{\beta} = \frac{\sum_{i=1}^n X_i Y_i}{\sum_{i=1}^n X_i^2}.
\end{displaymath}
Is $\widehat{\beta}$ consistent for $\beta$? Answer Yes or No and prove your answer.

\item Another Method of Moments estimator for Problem~\ref{thruorigin} is $\widehat{\beta}_2 = \frac{\overline{Y}_n}{\overline{X}_n}$.
\begin{enumerate}
\item Show that $\widehat{\beta}_2 \stackrel{p}{\rightarrow} \beta$ in most of the parameter space.
\item However, consistency means that the estimator converges to the parameter in probability \emph{everywhere} in the parameter space. Where does $\widehat{\beta}_2$ fail, and why?
\end{enumerate}

\newpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\item \label{covconsistent} Let $(X_1, Y_1), \ldots, (X_n,Y_n)$ be a random sample from a bivariate distribution with $E(X_i)=\mu_x$, $E(Y_i)=\mu_y$, $Var(X_i)=\sigma^2_x$, $Var(Y_i)=\sigma^2_y$, and $Cov(X_i,Y_i)=\sigma_{xy}$.
\begin{enumerate}
\item Show that the sample covariance $S_{xy} = \frac{\sum_{i=1}^n(X_i-\overline{X})(Y_i-\overline{Y})}{n-1}$ is a consistent estimator of $\sigma_{xy}$.
\item Show that the sample covariance (with $n$ in the denominator, just for convenience) has a large-sample normal distribution. Give the asymptotic mean and covariance. Cite the Slutsky lemmas as you use them. They will be supplied with the quiz if necessary.
\end{enumerate}
%\vspace{3mm}
%\hrule
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \vspace{2mm}

\item The usual univariate multiple regression model with independent normal errors is
\begin{displaymath}
\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon},
\end{displaymath}
where $\mathbf{X}$ is an $n \times p$ matrix of known constants, $\boldsymbol{\beta}$ is a $p \times 1$ vector of unknown constants, and $\boldsymbol{\epsilon}$ is multivariate normal with mean zero and covariance matrix $\sigma^2 \mathbf{I}_n$, with $\sigma^2 > 0$ an unknown constant. But of course in practice, the explanatory variables are random, not fixed. Clearly, if the model holds \emph{conditionally} upon the values of the explanatory variables, then all the usual results hold, again conditionally upon the particular values of the explanatory variables. The probabilities (for example, $p$-values) are conditional probabilities, and the $F$ statistic does not necessarily have an $F$ distribution unconditionally; rather, it has an $F$ distribution conditionally, given $\mathbf{X}=\mathbf{x}$.
\begin{enumerate}
\item Show that the least-squares estimator $\widehat{\boldsymbol{\beta}}= (\mathbf{X}^{\top}\mathbf{X})^{-1}\mathbf{X}^{\top}\mathbf{y}$ is conditionally unbiased.
\item Show that $\widehat{\boldsymbol{\beta}}$ is also unbiased unconditionally.
\item A similar calculation applies to the significance level of a hypothesis test. Let $F$ be the test statistic (say for an extra-sum-of-squares $F$-test), and let $f_c$ be the critical value. If the null hypothesis is true, then the test has size $\alpha$ conditionally upon the explanatory variable values. That is, $P(F>f_c \,|\, \mathbf{X}=\mathbf{x})=\alpha$. Find the \emph{unconditional} probability of a Type I error. Assume that the explanatory variables are discrete, so you can write a multiple sum.
\end{enumerate}

\item \label{omittedvars} In the following regression model, the explanatory variables $X_1$ and $X_2$ are random variables. The true model is
\begin{displaymath}
Y_i = \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \epsilon_i,
\end{displaymath}
independently for $i= 1, \ldots, n$, where $\epsilon_i \sim N(0,\sigma^2)$.
The mean and covariance matrix of the explanatory variables are given by
\begin{displaymath}
E\left( \begin{array}{c} X_{i,1} \\ X_{i,2} \end{array} \right) =
\left( \begin{array}{c} \mu_1 \\ \mu_2 \end{array} \right)
\mbox{~~ and ~~}
Var\left( \begin{array}{c} X_{i,1} \\ X_{i,2} \end{array} \right) =
\left( \begin{array}{rr} \phi_{11} & \phi_{12} \\ \phi_{12} & \phi_{22} \end{array} \right).
\end{displaymath}
Unfortunately, $X_{i,2}$, which has an impact on $Y_i$ and is correlated with $X_{i,1}$, is not part of the data set. Since $X_{i,2}$ is not observed, it is absorbed by the intercept and error term, as follows.
\begin{eqnarray*}
Y_i &=& \beta_0 + \beta_1 X_{i,1} + \beta_2 X_{i,2} + \epsilon_i \\
    &=& (\beta_0 + \beta_2\mu_2) + \beta_1 X_{i,1} + (\beta_2 X_{i,2} - \beta_2 \mu_2 + \epsilon_i) \\
    &=& \beta^\prime_0 + \beta_1 X_{i,1} + \epsilon^\prime_i.
\end{eqnarray*}
The primes just denote a new $\beta_0$ and a new $\epsilon_i$. It was necessary to add and subtract $\beta_2 \mu_2$ in order to obtain $E(\epsilon^\prime_i)=0$. And of course there could be more than one omitted variable. They would all get swallowed by the intercept and error term, the garbage bins of regression analysis.
\begin{enumerate}
\item Make a path diagram of this model.
\item What is $Cov(X_{i,1},\epsilon^\prime_i)$?
\item Calculate the variance-covariance matrix of $(X_{i,1},Y_i)$ under the true model.
\item Suppose we want to estimate $\beta_1$. The usual least squares estimator is
\begin{displaymath}
\widehat{\beta}_1 = \frac{\sum_{i=1}^n(X_{i,1}-\overline{X}_1)(Y_i-\overline{Y})}
                         {\sum_{i=1}^n(X_{i,1}-\overline{X}_1)^2}.
\end{displaymath}
You may just use this formula; you don't have to derive it. Is $\widehat{\beta}_1$ a consistent estimator of $\beta_1$ (meaning for all points in the parameter space) if the true model holds? Answer Yes or No and show your work. Remember, $X_2$ is not available, so you are doing a regression with one explanatory variable. You may use the consistency of the sample variance and covariance without proof.
\item Are there \emph{any} points in the parameter space for which $\widehat{\beta}_1 \stackrel{p}{\rightarrow} \beta_1$ when the true model holds?
\end{enumerate}

\item Ordinary least squares is often applied to data sets where the explanatory variables are best modeled as random variables.
\begin{enumerate}
\item In the usual regression model with normal errors, what is the conditional distribution of $\epsilon_i$ given $\mathbf{X}_i=\mathbf{x}_i$?
\item In what way does the usual conditional linear regression model imply that (random) explanatory variables have zero covariance with the error term? Hint: Assume that both $\mathbf{X}_i$ and $\epsilon_i$ are continuous, to make the notation easier.
\item Show that for simple regression (one explanatory variable), $E(\epsilon_i|X_i=x_i)=0$ for all $x_i$ implies $Cov(X_i,\epsilon_i)=0$, so that a standard regression model without the normality assumption still implies zero covariance (though not necessarily independence) between the error term and explanatory variables. Hint: I did a double expectation, conditioning on $X_i$.
\item Given the results of Problem \ref{omittedvars}, is it ever safe to assume that random explanatory variables have zero covariance with the error term?
\end{enumerate}

\item Women and men are coming into a store according to independent Poisson processes with rates $\lambda_1$ for women and $\lambda_2$ for men. You don't have to know anything about Poisson processes to do this question.
We have that the number of women and the number of men entering the store in a given time period are independent Poisson random variables, with expected values $\lambda_1$ for women and $\lambda_2$ for men. Because the Poisson process has independent increments, we can treat the counts from $n$ non-overlapping time periods of equal length as a random sample. Management wants to know the expected number of male customers and the expected number of female customers. Unfortunately, the total numbers of customers were recorded, but not their sex. Let $y_1, \ldots, y_n$ denote the total numbers of customers who enter the store in $n$ time periods. That's all the data we have.
\begin{enumerate}
\item What is the distribution of $y_i$? If you know the answer, just write it down without proof.
\item What is the parameter space?
\item \label{analyt} Find the MLE of the parameter vector $(\lambda_1,\lambda_2)$ analytically. Show your work.
\item \label{num} For the data in \href{https://www.utstat.toronto.edu/brunner/openSEM/data/poisson.data.txt}
{\texttt{https://www.utstat.toronto.edu/brunner/openSEM/data/poisson.data.txt}}, find the MLE numerically. \emph{Try two different starting values.} Does your answer agree with your answer to~\ref{analyt}? A rough R sketch for getting started appears after the questions.
\end{enumerate}

\end{enumerate} % End of questions

\vspace{2mm}

\noindent Please bring your \emph{complete} R printout from Question~\ref{num} to the quiz, showing all input and output. It may be handed in.
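\vspace{2mm}

\noindent For the numerical part of Question~\ref{num}, something along the following lines may be a useful starting point in R. It is only a sketch, not a prescribed solution: it assumes the distribution you identify in part~(a) of that question, and it assumes the data file is a single column of counts, so adjust the reading step if the file is laid out differently. The function and object names are just illustrative.
\begin{verbatim}
# Sketch only: numerical MLE for the store-customer counts.
# Assumes the distribution from part (a) and a plain one-column data file;
# adjust the reading step (e.g., if there is a header line) as needed.
y <- scan("https://www.utstat.toronto.edu/brunner/openSEM/data/poisson.data.txt")

# Minus log likelihood as a function of theta = (lambda1, lambda2)
mloglik <- function(theta, y)
    {
    mu <- theta[1] + theta[2]
    -sum(dpois(y, lambda = mu, log = TRUE))
    }

# Try two different starting values, as the question asks.
fit1 <- optim(par = c(1, 1),  fn = mloglik, y = y,
              method = "L-BFGS-B", lower = c(0.001, 0.001))
fit2 <- optim(par = c(5, 10), fn = mloglik, y = y,
              method = "L-BFGS-B", lower = c(0.001, 0.001))
fit1$par; fit2$par   # Compare the two numerical answers to each other ...
mean(y)              # ... and to your analytical answer.
\end{verbatim}
\noindent Using \texttt{nlm()} instead of \texttt{optim()}, or different box constraints, should work just as well; the sketch is only meant to show the general shape of the calculation.

\end{document}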