% 431s23Assignment4.tex Testing, Random explanatory variables
\documentclass[11pt]{article}
%\usepackage{amsbsy} % for \boldsymbol and \pmb
\usepackage{graphicx} % To include pdf files!
\usepackage{amsmath}
\usepackage{amsbsy}
\usepackage{amsfonts}
\usepackage{comment}
\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=blue]{hyperref} % For links
\usepackage{fullpage}
%\pagestyle{empty} % No page numbers

\begin{document}
%\enlargethispage*{1000 pt}
\begin{center}
{\Large \textbf{STA 431s23 Assignment Four}}\footnote{This assignment was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/brunner/oldclass/431s23}
{\small\texttt{http://www.utstat.toronto.edu/brunner/oldclass/431s23}}}
\vspace{1 mm}
\end{center}

\noindent \emph{For the Quiz on Friday Feb.~10th, please bring printouts of your full R input for Questions~\ref{Rbeta} and~\ref{simple-again}. The other problems are not to be handed in. They are practice for the Quiz.}

\vspace{2mm}
\hrule

\begin{enumerate}

\item Independently for $i = 1, \ldots, n$, let $y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + \epsilon_i$, where $E(x_{i1}) = \mu_{x1}$, $E(x_{i2}) = \mu_{x2}$, $Var(x_{i1}) = \phi_{11}$, $Var(x_{i2}) = \phi_{22}$, $Cov(x_{i1},x_{i2}) = \phi_{12}$, $Var(\epsilon_i) = \psi$, and $\epsilon_i$ is independent of $x_{i1}$ and $x_{i2}$.
\begin{enumerate}
\item What is the parameter $\theta$ for this model?
\item What is the parameter space $\Theta$?
\item What is the restricted parameter space $\Theta_0$ under $H_0: \beta_1 = \beta_2$ and $\phi_{11} = \phi_{22} = \psi = 1$?
\item The null hypothesis can be written $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$. Give the $\mathbf{L}$ and $\mathbf{h}$ matrices.
\end{enumerate}
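To make the model concrete, here is a minimal R sketch (not part of the assignment) that simulates data from it; all parameter values and variable names below are arbitrary and chosen only for illustration.
\begin{verbatim}
# Not part of the assignment: simulate from the model in this question,
# using arbitrary illustrative parameter values.
set.seed(431)
n <- 500
beta0 <- 1; beta1 <- 2; beta2 <- -1    # regression coefficients
mux1 <- 0; mux2 <- 0                   # E(x_i1) and E(x_i2)
phi11 <- 1; phi22 <- 2; phi12 <- 0.5   # Var(x_i1), Var(x_i2), Cov(x_i1,x_i2)
psi <- 4                               # Var(epsilon_i)
x1 <- rnorm(n, mux1, sqrt(phi11))
# Give x2 the required covariance with x1 (both normal in this sketch).
x2 <- mux2 + (phi12/phi11)*(x1 - mux1) +
      rnorm(n, 0, sqrt(phi22 - phi12^2/phi11))
epsilon <- rnorm(n, 0, sqrt(psi))
y <- beta0 + beta1*x1 + beta2*x2 + epsilon
var(cbind(x1, x2))   # Sample values should be near phi11, phi12, phi22
\end{verbatim}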
\item On the formula sheet, the statement $\widehat{\boldsymbol{\theta}}_n \stackrel{\cdot}{\sim} N(\boldsymbol{\theta},\mathbf{V}_n)$ says that the vector of maximum likelihood estimates is asymptotically normal. That is, for large sample sizes, $\widehat{\boldsymbol{\theta}}_n$ has a probability distribution that is approximately multivariate normal, centered on the vector of true parameter values and with variance-covariance matrix $\mathbf{V}_n$. Generally speaking, the rules (theorems) for exact multivariate normality also apply to asymptotic multivariate normality. It's not rigorous, but you usually arrive at the correct conclusion. Accordingly,
\begin{enumerate}
\item Let $\boldsymbol{\theta}$ be $m \times 1$, and let $\mathbf{L}$ be an $r \times m$ matrix of constants with linearly independent rows. This part of the question develops the Wald statistic for testing $H_0: \mathbf{L}\boldsymbol{\theta} = \mathbf{h}$.
\begin{enumerate}
\item What is the asymptotic distribution of $\mathbf{L}\widehat{\boldsymbol{\theta}}_n$? Just write it down.
\item \label{quadform} What is the asymptotic distribution of $(\mathbf{L}\widehat{\boldsymbol{\theta}}_n - \mathbf{L}\boldsymbol{\theta})^\top \left(\mathbf{LV}_n \mathbf{L}^\top \right)^{-1} (\mathbf{L}\widehat{\boldsymbol{\theta}}_n - \mathbf{L}\boldsymbol{\theta})$?
\item What are the dimensions (number of rows and columns) of the matrix $\left(\mathbf{LV}_n \mathbf{L}^\top \right)^{-1}$?
\item Why is it critical that the rows of $\mathbf{L}$ be linearly independent, so that the rank of $\mathbf{L}$ equals $r$?
\item Compare the expression in Question~\ref{quadform} to the Wald statistic $W_n$ on the formula sheet. There are two differences. Briefly explain them.
\end{enumerate}
\item Let $\mathbf{a}$ be an $m \times 1$ non-zero vector of constants. What is the asymptotic distribution of $\mathbf{a}^\top \widehat{\boldsymbol{\theta}}_n$?
\item \label{ci} Based on the last result, give a $(1-\alpha)100\%$ confidence interval for $\mathbf{a}^\top \boldsymbol{\theta}$. Use $z_{\alpha/2}$ to denote the value that cuts off the top $\alpha/2$ of the standard normal distribution (for example, for $\alpha=0.05$, $z_{\alpha/2} = 1.96$). Show some work. Why are you using $\widehat{\mathbf{V}}_n$ instead of $\mathbf{V}_n$?
\item Using the same standard error, write down a $z$ statistic for testing $H_0: \mathbf{a}^\top \boldsymbol{\theta} = h$.
\item Show that for $H_0: \mathbf{a}^\top \boldsymbol{\theta} = h$, $W_n = z^2$.
\end{enumerate}

\item \label{Rbeta} Let $x_1, \ldots, x_n$ be a random sample from a beta distribution. The density is
\begin{displaymath}
f(x) = \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha) \Gamma(\beta)} \, x^{\alpha-1} \, (1-x)^{\beta-1}
\end{displaymath}
for $0 < x < 1$, where $\alpha > 0$ and $\beta > 0$ are unknown parameters. (A related R sketch appears after the questions.)

\item The usual fixed-$x$ multiple regression model in matrix form is $\mathbf{y} = \mathbf{X}\boldsymbol{\beta} + \boldsymbol{\epsilon}$, where $\mathbf{X}$ is an $n \times p$ matrix of known constants, $\boldsymbol{\beta}$ is a $p \times 1$ vector of unknown constants, and $\boldsymbol{\epsilon} \sim N(\mathbf{0}, \sigma^2\mathbf{I}_n)$, with $\sigma^2 > 0$ an unknown constant. But of course in practice, the explanatory variables are random, not fixed. Clearly, if the model holds \emph{conditionally} upon the values of the explanatory variables, then all the usual results hold, again conditionally upon the particular values of the explanatory variables. The probabilities (for example, $p$-values) are conditional probabilities, and the $F$ statistic does not have an $F$ distribution, but a conditional $F$ distribution, given $\boldsymbol{\mathcal{X}} = \mathbf{X}$.
\begin{enumerate}
\item Show that the least-squares estimator $\widehat{\boldsymbol{\beta}}= (\mathbf{X}^{\top}\mathbf{X})^{-1}\mathbf{X}^{\top}\mathbf{y}$ is conditionally unbiased.
\item Show that $\widehat{\boldsymbol{\beta}}$ is also unbiased unconditionally.
\item A similar calculation applies to the significance level of a hypothesis test. Let $F$ be the test statistic (say for an extra-sum-of-squares $F$-test), and $f_c$ be the critical value. If the null hypothesis is true, then the test has size $\alpha$, conditionally upon the explanatory variable values. That is, $P(F>f_c|\boldsymbol{\mathcal{X}} = \mathbf{X})=\alpha$. Find the \emph{unconditional} probability of a Type I error. Assume that the explanatory variables are discrete, so you can write a multiple sum.
\end{enumerate}
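In connection with the last part, the following small simulation (not part of the assignment; every number in it is arbitrary) illustrates the point empirically for a single random explanatory variable. It uses the $t$-test of the slope, which is equivalent to the corresponding $F$-test. The conditional size of the test is $\alpha = 0.05$, and the observed rejection rate under a true null hypothesis, averaging over random explanatory variable values, should also be near 0.05.
\begin{verbatim}
# Not part of the assignment: empirical illustration with arbitrary values.
set.seed(123)
m <- 2000; n <- 50; alpha <- 0.05
reject <- logical(m)
for (j in 1:m) {
  x <- rnorm(n)               # a random explanatory variable
  y <- 1 + 0*x + rnorm(n)     # H0: slope = 0 is true
  fit <- summary(lm(y ~ x))
  reject[j] <- fit$coefficients["x", "Pr(>|t|)"] < alpha
}
mean(reject)                  # Proportion of Type I errors
\end{verbatim}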
\item The point of this question is that under conditions that are fairly common and natural, MLEs and likelihood ratio tests based on a fixed-$x$ regression model are also valid for a random-$x$ model. For notational convenience, suppose that the model parameter is $\theta = (\theta_1,\theta_2)$, and that the joint density/probability mass function of the data can be written
\begin{displaymath}
f_\theta(x,y) = g_{\theta_1}(y|x) \, h_{\theta_2}(x),
\end{displaymath}
where
\begin{itemize}
\item $f_\theta(x,y)$ is the joint density of $x$ and $y$. It depends on the entire parameter vector $\theta$.
\item $g_{\theta_1}(y|x)$ is the conditional density of $y$ given $x$. It depends on $\theta_1$.
\item $h_{\theta_2}(x)$ is the marginal density of $x$. It depends on $\theta_2$.
\end{itemize}
The quantities $x$, $y$, $\theta_1$ and $\theta_2$ could all be vectors. There must be no functional connection between $\theta_1$ and $\theta_2$. For example, in a regression we might have $\theta_1 = (\boldsymbol{\beta},\sigma^2)$, and if $h_{\theta_2}(x)$ is a multivariate normal density, $\theta_2$ would be the unique elements of $\boldsymbol{\mu}_x$ and $\boldsymbol{\Sigma}_x$. The lack of functional connection between $\theta_1$ and $\theta_2$ just means there are no $\beta_j$ parameters and no $\sigma^2$ in $\boldsymbol{\mu}_x$ or $\boldsymbol{\Sigma}_x$. Usually, we only care about the parameters in $\theta_1$.
\begin{enumerate}
\item Writing the full likelihood as $L(\theta) = \prod_{i=1}^n f_\theta(x_i,y_i)$, show that $\widehat{\theta}_1$ for the random-$x$ model is the same as for the model of $y$ conditional on $x$. It's easiest to see if you take the log of the likelihood and start differentiating.
\item Now consider a likelihood ratio test that \emph{only} restricts $\theta_1$. In regression, it would be about the $\beta_j$ parameters. Show that the likelihood ratio test statistic
\begin{displaymath}
G^2 = -2 \ln \left( \frac{L(\widehat{\theta}_0)}{ L(\widehat{\theta}) }\right)
\end{displaymath}
for the random-$x$ model is the same as for the model of $y$ conditional on $x$.
\end{enumerate}

\end{enumerate} % End of all the questions
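For the numerical work in R (Question~\ref{Rbeta}), one generic approach to maximum likelihood is to minimize the minus log likelihood with \texttt{optim}. The sketch below is not part of the assignment: it uses simulated stand-in data and arbitrary starting values, and should be adapted to the actual data and instructions for the question.
\begin{verbatim}
# Not part of the assignment: a minimal sketch of numerical maximum
# likelihood for the beta-distribution question, with stand-in data.
set.seed(9999)
x <- rbeta(200, shape1 = 3, shape2 = 5)   # stand-in data
mloglik <- function(theta, x) {           # minus log likelihood
  a <- theta[1]; b <- theta[2]
  if (a <= 0 || b <= 0) return(Inf)       # stay inside the parameter space
  -sum(dbeta(x, shape1 = a, shape2 = b, log = TRUE))
}
fit <- optim(par = c(1, 1), fn = mloglik, x = x, hessian = TRUE)
fit$par              # MLEs of alpha and beta
solve(fit$hessian)   # Estimated asymptotic covariance matrix
\end{verbatim}
The inverse of the Hessian of the minus log likelihood, evaluated at the MLE, is one standard estimate of the asymptotic covariance matrix $\widehat{\mathbf{V}}_n$ that appears in the Wald statistic and confidence intervals above.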
\vspace{3mm}
\noindent \textbf{Please bring printouts of your full R input and output for Questions \ref{Rbeta} and \ref{simple-again} to the quiz.}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% I guess these last 2 are about omitted variables. Maybe wait until next time.

\item %Ordinary least squares is often applied to data sets where
A useful way to write a fixed-$x$ regression model is $y_i = \boldsymbol{\beta}^\top\mathbf{x}_i + \epsilon_i$, where $\mathbf{x}_i$ is a $p \times 1$ vector of constants. Of course, usually the explanatory variables are best modeled as random variables. So the model really should be $y_i = \boldsymbol{\beta}^\top \boldsymbol{\mathcal{X}}_i + \epsilon_i$, and the usual model is conditional on $\boldsymbol{\mathcal{X}}_i = \mathbf{x}_i$. In what way does the usual conditional linear regression model imply that (random) explanatory variables have zero covariance with the error term? For notational convenience, assume that $\boldsymbol{\mathcal{X}}_i$, as well as $\epsilon_i$, is continuous. What is the conditional distribution of $\epsilon_i$ given $\boldsymbol{\mathcal{X}}_i = \mathbf{x}_i$?

\item In a regression with one explanatory variable, show that $E(\epsilon_i|X_i=x_i)=0$ for all $x_i$ implies $Cov(X_i,\epsilon_i)=0$, so that \emph{a standard regression model without the normality assumption still implies zero covariance} (though not necessarily independence) \emph{between the error term and explanatory variables.} Hint: If you get stuck, the matrix version of this calculation is in the text.

% Call this random explanatory -- or maybe wait until identifiability.
\item Independently for $i=1, \ldots, n$, let $\mathbf{y}_i = \boldsymbol{\beta}_0 + \boldsymbol{\beta}_1 \mathbf{x}_i + \boldsymbol{\epsilon}_i$, where
\begin{itemize}
\item $\mathbf{y}_i$ is a $q \times 1$ random vector of observable response variables; there are $q$ response variables.
\item $\mathbf{x}_i$ is a $p \times 1$ observable random vector; there are $p$ explanatory variables. $E(\mathbf{x}_i) = \boldsymbol{\mu}_x$ and $cov(\mathbf{x}_i) = \boldsymbol{\Phi}_{p \times p}$. The positive definite matrix $\boldsymbol{\Phi}$ is unknown.
\item $\boldsymbol{\beta}_0$ is a $q \times 1$ vector of unknown constants.
\item $\boldsymbol{\beta}_1$ is a $q \times p$ matrix of unknown constants.
\item $\boldsymbol{\epsilon}_i$ is a $q \times 1$ random vector with expected value zero and unknown positive definite variance-covariance matrix $cov(\boldsymbol{\epsilon}_i) = \boldsymbol{\Psi}_{q \times q}$.
\item $\boldsymbol{\epsilon}_i$ is independent of $\mathbf{x}_i$.
\end{itemize}
Letting $\mathbf{d}_i = \left(\begin{array}{c} \mathbf{x}_i \\ \hline \mathbf{y}_i \end{array} \right)$, we have
$cov(\mathbf{d}_i) = \boldsymbol{\Sigma} =
\left( \begin{array}{c|c} \boldsymbol{\Sigma}_x & \boldsymbol{\Sigma}_{xy} \\ \hline \boldsymbol{\Sigma}_{yx} & \boldsymbol{\Sigma}_y \end{array} \right)$, and
$\widehat{\boldsymbol{\Sigma}} =
\left( \begin{array}{c|c} \widehat{\boldsymbol{\Sigma}}_x & \widehat{\boldsymbol{\Sigma}}_{xy} \\ \hline \widehat{\boldsymbol{\Sigma}}_{yx} & \widehat{\boldsymbol{\Sigma}}_y \end{array} \right)$.
\begin{enumerate}
\item Give the dimensions (number of rows and columns) of the following matrices: \\
$\mathbf{d}_i$, $\boldsymbol{\Sigma}$, $\boldsymbol{\Sigma}_{x}$, $\boldsymbol{\Sigma}_{y}$, $\boldsymbol{\Sigma}_{xy}$, $\boldsymbol{\Sigma}_{yx}$.
\item Write the parts of $\boldsymbol{\Sigma}$ in terms of the unknown parameter matrices.
\item Give a Method of Moments Estimator for $\boldsymbol{\Phi}$. Just write it down.
\item Obtain formulas for the Method of Moments Estimators of $\boldsymbol{\beta}_1$, $\boldsymbol{\beta}_0$ and $\boldsymbol{\Psi}$. Show your work. You may give $\widehat{\boldsymbol{\beta}}_0$ in terms of $\widehat{\boldsymbol{\beta}}_1$, but simplify $\widehat{\boldsymbol{\Psi}}$.
\item If the distributions of $\mathbf{x}_i$ and $\boldsymbol{\epsilon}_i$ are multivariate normal, how do you know that your Method of Moments estimates are also the MLEs?
\end{enumerate}
% End of multivariate regression question
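% Not part of the assignment: a simulation sketch for the draft multivariate
% regression question above, using arbitrary illustrative values (p = q = 2).
% It shows one way to generate data from the model and partition the sample
% covariance matrix of d_i as in the question.
\begin{verbatim}
# Not part of the assignment: arbitrary illustrative values, p = q = 2.
set.seed(2023)
n <- 1000; p <- 2; q <- 2
beta0 <- c(1, -1)                              # q x 1
beta1 <- rbind(c(2, 0),
               c(1, 3))                        # q x p
Phi <- rbind(c(2, 1), c(1, 2))                 # cov(x_i)
Psi <- diag(c(1, 4))                           # cov(epsilon_i)
x   <- matrix(rnorm(n*p), n, p) %*% chol(Phi)  # rows have covariance Phi
eps <- matrix(rnorm(n*q), n, q) %*% chol(Psi)  # rows have covariance Psi
y   <- matrix(beta0, n, q, byrow = TRUE) + x %*% t(beta1) + eps
d   <- cbind(x, y)
Sigmahat    <- cov(d)                          # partition as in the question
Sigmahat_x  <- Sigmahat[1:p, 1:p]
Sigmahat_xy <- Sigmahat[1:p, (p+1):(p+q)]
\end{verbatim}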