\documentclass[serif]{beamer} % Get Computer Modern math font.
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
% To create handout using article mode: Comment above and uncomment below (2 places)
%\documentclass[12pt]{article}
%\usepackage{beamerarticle}
%\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=red]{hyperref} % For live Web links with href in article mode
%\usepackage{amsmath} % For \binom{n}{y}
%\usepackage{graphicx} % To include pdf files!
%\usepackage{fullpage}

\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}} % Comment this out for handout

\title{Large Sample Chi-squared Tests: Part One\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2013}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Background Reading: Davison's \emph{Statistical models}}
\framesubtitle{It may be a little bit helpful.}
\begin{itemize}
\item Section 2.2 on Convergence
\item Pages 33-35 on the Delta method include the multivariate case
\item The multinomial distribution is introduced as a homework problem, Problem 10 on p. 37.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

\section{Large-Sample Chi-square}

\begin{frame}
\frametitle{Large-Sample Chi-square}
Let $\mathbf{X} \sim N_p(\boldsymbol{\mu},\boldsymbol{\Sigma})$. Then recall
\vspace{10mm}
{\LARGE
\begin{displaymath}
(\mathbf{X}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)
\end{displaymath}}
\vspace{10mm}

It's true asymptotically too.
\end{frame}

\begin{frame}
\frametitle{Using $(\mathbf{X}-\boldsymbol{\mu})^\prime \boldsymbol{\Sigma}^{-1}(\mathbf{X}-\boldsymbol{\mu}) \sim \chi^2 (p)$}
Suppose
\begin{itemize}
\item $\sqrt{n}\left(\mathbf{T}_n - \boldsymbol{\theta} \right) \stackrel{d}{\rightarrow} \mathbf{T} \sim N\left(\mathbf{0},\boldsymbol{\Sigma} \right)$ and
\item $\widehat{\boldsymbol{\Sigma}}_n \stackrel{p}{\rightarrow} \boldsymbol{\Sigma}$.
\end{itemize} \vspace{5mm} Then approximately as $n \rightarrow \infty$, $\mathbf{T}_n \sim N\left(\boldsymbol{\theta},\frac{1}{n}\boldsymbol{\Sigma} \right)$, and \begin{eqnarray*} & & \left( \mathbf{T}_n - \boldsymbol{\theta} \right)^\prime \left( \frac{1}{n}\boldsymbol{\Sigma} \right)^{-1} \left(\mathbf{T}_n - \boldsymbol{\theta} \right) \sim \chi^2 (p) \\ \\\ & & ~~~~~~~~~~~~~~~~~~ \rotatebox{90}{=} \\ & & n \left( \mathbf{T}_n - \boldsymbol{\theta} \right)^\prime \boldsymbol{\Sigma}^{-1} \left(\mathbf{T}_n - \boldsymbol{\theta} \right) \\ & \approx & n \left( \mathbf{T}_n - \boldsymbol{\theta} \right)^\prime \widehat{\boldsymbol{\Sigma}}_n^{-1} \left(\mathbf{T}_n - \boldsymbol{\theta} \right) \\ & \sim & \chi^2 (p) \end{eqnarray*} \end{frame} \begin{frame} \frametitle{Or we could be more precise} Suppose \begin{itemize} \item $\sqrt{n}\left(\mathbf{T}_n - \boldsymbol{\theta} \right) \stackrel{d}{\rightarrow} \mathbf{T} \sim N\left(\mathbf{0},\boldsymbol{\Sigma} \right)$ and \item $\widehat{\boldsymbol{\Sigma}}_n \stackrel{p}{\rightarrow} \boldsymbol{\Sigma}$. \end{itemize} \vspace{5mm} Then $\widehat{\boldsymbol{\Sigma}}_n^{-1} \stackrel{p}{\rightarrow} \boldsymbol{\Sigma}^{-1}$, and by a Slutsky lemma, \begin{displaymath} \left( \begin{array}{c} \sqrt{n}\left(\mathbf{T}_n - \boldsymbol{\theta} \right) \\ \widehat{\boldsymbol{\Sigma}}_n^{-1} \end{array} \right) \stackrel{d}{\rightarrow} \left( \begin{array}{c} \mathbf{T} \\ \boldsymbol{\Sigma}^{-1} \end{array} \right). \end{displaymath} By continuity, \begin{eqnarray*} & & \left( \sqrt{n}(\mathbf{T}_n - \boldsymbol{\theta}) \right)^\prime \widehat{\boldsymbol{\Sigma}}_n^{-1} \sqrt{n}(\mathbf{T}_n - \boldsymbol{\theta}) \\ & = & n \left( \mathbf{T}_n - \boldsymbol{\theta} \right)^\prime \widehat{\boldsymbol{\Sigma}}_n^{-1} \left(\mathbf{T}_n - \boldsymbol{\theta} \right) \\ & \stackrel{d}{\rightarrow} & \mathbf{T}^\prime \boldsymbol{\Sigma}^{-1} \mathbf{T} \\ & \sim & \chi^2 (p) \end{eqnarray*} \end{frame} \begin{frame} \frametitle{If $H_0:\mathbf{L}\boldsymbol{\theta}=\mathbf{h}$ is true} \framesubtitle{Where $\mathbf{L}$ is $r \times p$ and of full row rank} Asymptotically, $\mathbf{LT}_n \sim N\left(\mathbf{L}\boldsymbol{\theta}, \frac{1}{n}\mathbf{L}\boldsymbol{\Sigma}\mathbf{L}^\prime \right)$. So \begin{eqnarray*} & & \left( \mathbf{LT}_n - \mathbf{L}\boldsymbol{\theta} \right)^\prime \left( \frac{1}{n}\mathbf{L}\boldsymbol{\Sigma}\mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{L}\boldsymbol{\theta} \right) \sim \chi^2 (r) \\ \\\ & & ~~~~~~~~~~~~~~~~~~~~~~ \rotatebox{90}{=} \\ & & n \left( \mathbf{LT}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\boldsymbol{\Sigma}\mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{h} \right) \\ & \approx & n \left( \mathbf{LT}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{h} \right) \\ = W_n & \sim & \chi^2 (r) \end{eqnarray*} \vspace{10mm} Or we could be more precise and use Slutsky lemmas. \end{frame} \begin{frame} \frametitle{Test of $H_0:\mathbf{L}\boldsymbol{\theta}=\mathbf{h}$} \framesubtitle{Where $\mathbf{L}$ is $r \times p$ and of full row rank} {\LARGE \begin{displaymath} W_n = n \left( \mathbf{LT}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{h} \right) \end{displaymath} } Distributed approximately as chi-squared with $r$ degrees of freedom under $H_0$. 
\vspace{10mm} If $\mathbf{T}_n$ is the maximum likelihood estimator of $\boldsymbol{\theta}$, it's called a \emph{Wald test} (and $\widehat{\boldsymbol{\Sigma}}_n$ has a special form). \end{frame} % statclass first: see work file! \begin{frame} \frametitle{Example: The \texttt{statclass} data} %\framesubtitle{} Fifty-eight students in a Statistics class took 8 quizzes, a midterm test and a final exam. They also had 9 computer assignments. The instructor wants to compare average performance on the four components of the grade. \begin{itemize} \item How about a model? \item Should we assume normality? \item Does it make sense to assume quiz marks independent of final exam marks? \end{itemize} \end{frame} \begin{frame} \frametitle{Assume multivariate normality?} %\framesubtitle{} \begin{center} \includegraphics[width=3in]{computerhistogram} \end{center} \end{frame} \begin{frame} \frametitle{A model for the \texttt{statclass} data} %\framesubtitle{} Fifty-eight students in a Statistics class took 8 quizzes, a midterm test and a final exam. They also had 9 computer assignments. \vspace{10mm} Let $\mathbf{Y}_1, \ldots, \mathbf{Y}_n$ be a random sample from an unknown distribution with mean $\mathbf{\mu} = (\mu_1, \mu_2, \mu_3, \mu_4)^\prime$ and covariance matrix $\mathbf{\Sigma}$. \vspace{10mm} $H_0: \mu_1=\mu_2=\mu_3=\mu_4$ \end{frame} \begin{frame} \frametitle{Applying $W_n = n \left( \mathbf{LT}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{h} \right)$} \framesubtitle{To test $H_0:\mathbf{L}\boldsymbol{\theta}=\mathbf{h}$} \begin{itemize} \item Test is based on $\sqrt{n}\left(\mathbf{T}_n - \boldsymbol{\theta} \right) \stackrel{d}{\rightarrow} \mathbf{T} \sim N\left(\mathbf{0},\boldsymbol{\Sigma} \right)$ \item CLT says $\sqrt{n}\left(\overline{\mathbf{Y}}_n - \boldsymbol{\mu} \right) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N\left(\mathbf{0},\boldsymbol{\Sigma} \right)$ \item So $\mathbf{T}_n=\overline{\mathbf{Y}}_n$ and $\boldsymbol{\theta}=\boldsymbol{\mu}$. 
\item Sample variance-covariance matrix is good enough for $\widehat{\boldsymbol{\Sigma}}_n \stackrel{p}{\rightarrow} \boldsymbol{\Sigma}$ \item Write $H_0: \mu_1=\mu_2=\mu_3=\mu_4$ as $\mathbf{L}\boldsymbol{\mu}=\mathbf{h}$ \end{itemize} \end{frame} \begin{frame} \frametitle{$H_0: \mathbf{L}\boldsymbol{\theta}=\mathbf{h}$} \framesubtitle{To test equality of four means} \begin{displaymath} \begin{array}{cccc} \mathbf{L} & \boldsymbol{\mu} & = & \mathbf{0} \\ \left( \begin{array}{rrrr} 1 & -1 & 0 & 0 \\ 0 & 1 & -1 & 0 \\ 0 & 0 & 1 & -1 \\ \end{array} \right) & \left( \begin{array}{c} \mu_1 \\ \mu_2 \\ \mu_3 \\ \mu_4 \end{array} \right) & = & \left( \begin{array}{c} 0 \\ 0 \\ 0 \end{array} \right) \end{array} \end{displaymath} \end{frame} \begin{frame}[fragile] \frametitle{Read the data} %\framesubtitle{} \begin{center} \includegraphics[width=3in]{statclass} \end{center} {\scriptsize \begin{verbatim} > statclass = read.table("http://www.utstat.toronto.edu/~brunner/appliedf13/code_n_data/lecture/statclass.data",header=T) > head(statclass) S E Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 C1 C2 C3 C4 C5 C6 C7 C8 C9 MT F 1 1 2 9 1 7 8 4 3 5 2 6 10 10 10 5 0 0 0 0 55 43 2 0 2 10 10 5 9 10 8 6 8 10 10 8 9 9 9 9 10 10 66 79 3 1 2 10 10 5 10 10 10 9 8 10 10 10 10 10 10 9 10 10 94 67 4 1 2 10 10 8 9 10 7 10 9 10 10 10 9 10 10 9 10 10 81 65 5 1 1 10 6 7 9 8 8 5 7 10 9 10 9 5 6 4 8 10 57 52 6 0 1 10 9 5 8 9 8 5 6 8 7 5 6 10 6 5 9 9 77 64 > attach(statclass) The following object(s) are masked from 'package:base': F \end{verbatim} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Process the data a bit} %\framesubtitle{} {\scriptsize \begin{verbatim} > quiz = 10 * (Q1+Q2+Q3+Q4+Q5+Q6+Q7+Q8)/8 > computer = 10 * (C1+C2+C3+C4+C5+C6+C7+C8+C9)/9 > midterm = MT > final = F > datta = cbind(quiz,computer,midterm,final); head(round(datta)) quiz computer midterm final [1,] 49 46 55 43 [2,] 82 93 66 79 [3,] 90 99 94 67 [4,] 91 98 81 65 [5,] 75 79 57 52 [6,] 75 72 77 64 > ybar = apply(datta,2,mean); ybar quiz computer midterm final 72.58621 83.98467 68.87931 49.44828 > sigmahat = var(datta); sigmahat quiz computer midterm final quiz 120.66130 62.369765 60.31760 71.736993 computer 62.36977 134.894281 27.68233 6.272098 midterm 60.31760 27.682328 223.37114 99.633999 final 71.73699 6.272098 99.63400 272.777979 \end{verbatim} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Calculate $W_n = n \left( \mathbf{LT}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{h} \right)$} %\framesubtitle{} {\footnotesize % or scriptsize \begin{verbatim} > L = rbind(c(1,-1,0,0), + c(0,1,-1,0), + c(0,0,1,-1) ) > n = length(quiz); n [1] 58 > Wn = n * t(L %*% ybar) %*% solve(L%*%sigmahat%*%t(L)) %*% L%*%ybar > Wn [,1] [1,] 176.8238 > Wn = as.numeric(Wn) > pvalue = 1-pchisq(Wn,df=3); pvalue [1] 0 \end{verbatim} } % End size Conclude that the four means are not all equal. Which ones are different from one another? Need follow-up tests. 
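\end{frame}

\begin{frame}[fragile]
\frametitle{Follow-up tests: all pairwise comparisons}
\framesubtitle{One possible way to do it}
Each pairwise difference can be tested with the same statistic, using a $1 \times 4$ contrast matrix and one degree of freedom. A minimal sketch (code only), re-using \texttt{ybar}, \texttt{sigmahat} and \texttt{n} from the slides above:
{\footnotesize % or scriptsize
\begin{verbatim}
# Sketch: all six pairwise Wald-like tests, each with df = 1
pairs = combn(4,2)                # Columns are the pairs (i,j)
for(k in 1:ncol(pairs)) {
  i = pairs[1,k]; j = pairs[2,k]
  LL = rbind(numeric(4)); LL[i] = 1; LL[j] = -1    # H0: mu_i = mu_j
  W = n * t(LL %*% ybar) %*% solve(LL %*% sigmahat %*% t(LL)) %*% (LL %*% ybar)
  cat(colnames(datta)[i], "vs", colnames(datta)[j],
      "  W =", round(as.numeric(W),3),
      "  p =", round(1-pchisq(as.numeric(W),1),4), "\n")
}
\end{verbatim}
} % End size
The same comparisons could also be done one at a time with the \texttt{Wtest} function on the next slide.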
\end{frame} \begin{frame}[fragile] \frametitle{The R function \texttt{Wtest}} \framesubtitle{``Estimated" asymptotic covariance matrix $\widehat{\mathbf{V}}_n = \frac{1}{n}\widehat{\boldsymbol{\Sigma}}_n$} \begin{eqnarray*} W_n &=& n \left( \mathbf{LT}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{h} \right) \\ &=& \left( \mathbf{LT}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\mathbf{V}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{h} \right) \end{eqnarray*} {\scriptsize % This is the 2013 version: Used to be called WaldTest. \begin{verbatim} Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h # Note Vn is the estimated asymptotic covariance matrix of Tn, # so it's Sigma-hat divided by n. For Wald tests based on numerical # MLEs, Tn = theta-hat, and Vn is the inverse of the Hessian. { Wtest = numeric(3) names(Wtest) = c("W","df","p-value") r = dim(L)[1] W = t(L%*%Tn-h) %*% solve(L%*%Vn%*%t(L)) %*% (L%*%Tn-h) W = as.numeric(W) pval = 1-pchisq(W,r) Wtest[1] = W; Wtest[2] = r; Wtest[3] = pval Wtest } # End function Wtest \end{verbatim} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Illustrate the \texttt{Wtest} function} For $H_0: \mu_1=\mu_2=\mu_3=\mu_4$, got $W_n=176.8238,df=3,p\approx 0$. {\scriptsize \begin{verbatim} > V = sigmahat / length(final) > LL = rbind( c(1,-1,0,0), + c(0,1,-1,0), + c(0,0,1,-1) ) > Wtest(LL,ybar,V) W df p-value 176.8238 3.0000 0.0000 > > ybar quiz computer midterm final 72.58621 83.98467 68.87931 49.44828 \end{verbatim} } % End size Is average quiz score different from midterm? {\scriptsize \begin{verbatim} > L1 = rbind(c(1,0,-1,0)); n = length(final) > Wtest(L=L1,Tn=ybar,Vn=sigmahat/n) W df p-value 3.56755878 1.00000000 0.05891887 \end{verbatim} } % End size Proceed to test all other pairwise differences between means. \end{frame} \section{Multinomial model} \begin{frame} \frametitle{The multinomial model} \begin{itemize} \item A good source of examples \item Also directly useful in applications \end{itemize} \end{frame} \begin{frame} \frametitle{Multinomial coefficient} \framesubtitle{For $c$ categories} From $n$ objects, number of ways to choose \begin{itemize} \item $n_1$ of type 1 \item $n_2$ of type 2 \\ ~~~~~~~$\vdots$ \item $n_c$ of type $c$ \end{itemize} \begin{displaymath} \binom{n}{n_1~\cdots~n_c} = \frac{n!}{n_1!~\cdots~n_c!} \end{displaymath} \end{frame} \begin{frame} \frametitle{Example of a multinomial coefficient} \framesubtitle{A counting problem} % Optional sub-title Of 30 graduating students who are seeking employment, how many ways are there for 15 to be employed in a job related to their field of study, 10 to be employed in a job unrelated to their field of study, and 5 unemployed? 
\begin{displaymath}
\binom{30}{15~10~5} = 465,817,912,560
\end{displaymath}
\end{frame}
% Sage: factorial(30)/(factorial(15)*factorial(10)*factorial(5))

%\subsection{Multinomial Distribution} %%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Multinomial Distribution}
\framesubtitle{Denote by $M(n,\boldsymbol{\theta})$, where $\boldsymbol{\theta} = (\theta_1, \ldots, \theta_c)$}
\begin{itemize}
\item Statistical experiment with $c$ outcomes
\item Repeated independently $n$ times
\item $Pr(\mbox{Outcome }j) = \theta_j$, $j = 1, \ldots, c$
\item Number of times outcome $j$ occurs is $n_j$, $j = 1, \ldots, c$
\item An integer-valued \emph{multivariate} distribution
\end{itemize}
\begin{displaymath}
P(n_1, \ldots, n_c) = \binom{n}{n_1~\cdots~n_c} \theta_1^{n_1} \cdots \theta_c^{n_c},
\end{displaymath}
\vspace{5mm}
where $0 \leq n_j \leq n$, $\displaystyle{\sum_{j=1}^c n_j=n}$, $0 < \theta_j < 1$, and $\displaystyle{\sum_{j=1}^c \theta_j=1}$.
\end{frame}

\begin{frame}
\frametitle{Example}
\framesubtitle{Recent college graduates looking for a job}
\begin{itemize}
\item Probability of job related to field of study = 0.60
\item Probability of job unrelated to field of study = 0.30
\item Probability of no job = 0.10
\end{itemize}
Of 30 randomly chosen students, what is the probability that 15 are employed in a job related to their field of study, 10 are employed in a job unrelated to their field of study, and 5 are unemployed?
\begin{displaymath}
\binom{30}{15~10~5} \, 0.60^{15} 0.30^{10} 0.10^5 = \frac{4933527642332542053801}{381469726562500000000000} \approx 0.0129
\end{displaymath}
% Sage: factorial(30)/(factorial(15)*factorial(10)*factorial(5)) * ((6/10)^(15)*(3/10)^(10)*(1/10)^5)
\end{frame}

\begin{frame}[fragile]
\frametitle{Calculating multinomial probabilities with R}
\begin{displaymath}
\binom{30}{15~10~5} \, 0.60^{15} 0.30^{10} 0.10^5 = \frac{4933527642332542053801}{381469726562500000000000} \approx 0.0129
\end{displaymath}
\vspace{10mm}
\begin{verbatim}
> dmultinom(c(15,10,5), prob=c(.6, .3, .1))
[1] 0.01293295
\end{verbatim}
\end{frame}

\begin{frame}
\frametitle{There are actually $c-1$ variables and $c-1$ parameters}
\framesubtitle{In the multinomial with $c$ categories}
\begin{eqnarray*}
P(n_1, \ldots, n_{c-1}) &=& \frac{n!}{n_1!~\cdots~n_{c-1}!(n-\sum_{j=1}^{c-1}n_j)!} \\
& & ~~~~\times~ \theta_1^{n_1} \cdots \theta_{c-1}^{n_{c-1}}(1-\sum_{j=1}^{c-1}\theta_j)^{n-\sum_{j=1}^{c-1}n_j}
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Useful fact}
\framesubtitle{Proof omitted}
If you combine the categories of a multinomial (adding the frequencies), the result is still multinomial (adding the probabilities).
\begin{itemize}
\item Roll a die 100 times; observe the events 1, 3, 5, even.
\item Collapse into just 2 categories, and the result is binomial.
\end{itemize}
\end{frame}

%\subsection{Estimation} %%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Hypothetical data file}
\framesubtitle{Let $Y_{i,j}$ be indicators for category membership, $i=1, \ldots, n$ and $j=1, \ldots, c$}
\begin{center}
\begin{tabular}{|c|c||c|c|c|} \hline
Case & Job & $Y_1$ & $Y_2$ & $Y_3$ \\ \hline \hline
1 & 1 & 1 & 0 & 0 \\ \hline
2 & 3 & 0 & 0 & 1 \\ \hline
3 & 2 & 0 & 1 & 0 \\ \hline
4 & 1 & 1 & 0 & 0 \\ \hline
$\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ \\ \hline
$n$ & 2 & 0 & 1 & 0 \\ \hline \hline
Total & & $\sum_{i=1}^ny_{i,1}$ & $\sum_{i=1}^ny_{i,2}$ & $\sum_{i=1}^ny_{i,3}$ \\ \hline
\end{tabular}
\end{center}
Note that
\begin{itemize}
\item A real data file will almost never have the redundant variables $Y_1$, $Y_2$ and $Y_3$.
%\item $\sum_{j=1}^cy_{i,j}=1$
\item $\sum_{i=1}^ny_{i,j}=n_j$
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Lessons from the data file}
%\framesubtitle{}
\begin{itemize}
\item Cases ($n$ of them) are independent $M(1,\boldsymbol{\theta})$, so $E(Y_{i,j}) = \theta_j$.
\item Column totals $n_j$ count the number of times each category occurs: the joint distribution is $M(n,\boldsymbol{\theta})$.
\item If you make a frequency table (frequency distribution)
\begin{itemize}
\item The $n_j$ counts are the cell frequencies!
\item They are random variables, and now we know their joint distribution.
\item Each individual (marginal) table frequency is $B(n,\theta_j)$.
\item Expected value of cell frequency $j$ is $E(n_j) = n\theta_j$.
\end{itemize}
\item Tables of two or more dimensions present no problems; form combination variables.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example of a frequency table}
\framesubtitle{For the Jobs data}
\begin{center}
\begin{tabular}{lrr} \hline
Job Category & Frequency & Percent \\ \hline
Employed in field & 106 & 53 \\
Employed outside field & 74 & 37 \\
Unemployed & 20 & 10 \\ \hline
Total & 200 & 100 \\ \hline
\end{tabular}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Estimation based on $E(Y_{i,j}) = \theta_j$}
\begin{displaymath}
E(\overline{\mathbf{Y}}) = E \left( \begin{array}{c}
\frac{1}{n}\sum_{i=1}^nY_{i,1} \\ \\ \frac{1}{n}\sum_{i=1}^nY_{i,2} \\ \vdots \\ \frac{1}{n}\sum_{i=1}^nY_{i,c}
\end{array} \right)
= E \left( \begin{array}{c} \overline{Y}_1 \\ \\ \overline{Y}_2 \\ \vdots \\ \overline{Y}_c \end{array} \right)
= \left( \begin{array}{c} \theta_1 \\ \\ \theta_2 \\ \vdots \\ \theta_c \end{array} \right)
\end{displaymath}
\vspace{5mm}
The sample mean vector is
\begin{itemize}
\item A vector of sample proportions.
\item Unbiased for $\boldsymbol{\theta}$.
\item Consistent for $\boldsymbol{\theta}$, because by the Law of Large Numbers,
\end{itemize}
{\LARGE
\begin{displaymath}
\overline{\mathbf{Y}}_n \stackrel{a.s.}{\rightarrow} \boldsymbol{\theta}
\end{displaymath}
}
\end{frame}

\section{Central Limit Theorem}

\begin{frame}
\frametitle{Multivariate Central Limit Theorem}
To avoid a singular covariance matrix, let
\begin{displaymath}
\boldsymbol{\theta} = \left( \begin{array}{c} \theta_1 \\ \vdots \\ \theta_{c-1} \end{array} \right)
\mbox{~~~~and~~~~}
\overline{\mathbf{Y}}_n = \left( \begin{array}{c} \overline{Y}_{1~~} \\ \vdots \\ \overline{Y}_{c-1} \end{array} \right)
\end{displaymath}
\vspace{5mm}
The Multivariate Central Limit Theorem says: let $\mathbf{Y}_1, \ldots, \mathbf{Y}_n$ be a random sample from a distribution with expected value vector $\boldsymbol{\mu}$ and covariance matrix $\boldsymbol{\Sigma}$.
Then {\LARGE \begin{displaymath} \sqrt{n}(\overline{\mathbf{Y}}_n-\boldsymbol{\mu}) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N\left(\mathbf{0}, \boldsymbol{\Sigma} \right). \end{displaymath} } \end{frame} \begin{frame} \frametitle{Asymptotically speaking} \framesubtitle{That is, approximately for large $n$} $\overline{\mathbf{Y}}_n$ is normal with \begin{itemize} \item Mean $\boldsymbol{\mu}$ and \item Covariance matrix $\frac{1}{n}\boldsymbol{\Sigma}$. \end{itemize} \end{frame} \begin{frame} \frametitle{$\sqrt{n}(\overline{\mathbf{Y}}_n-\boldsymbol{\mu}) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N\left(\mathbf{0}, \boldsymbol{\Sigma} \right)$} %\framesubtitle{} For multinomial data, \begin{itemize} \item Have $\boldsymbol{\mu} = \boldsymbol{\theta}$ \item Because $\mathbf{Y}_i \sim M(1,\boldsymbol{\theta})$, the diagonal elements of $\boldsymbol{\Sigma}$ are $Var(Y_{i,j}) = \theta_j(1-\theta_j)$. \item What about the off-diagonal elements? \end{itemize} \begin{eqnarray*} Cov(Y_{i,j},Y_{i,k}) & = & E(Y_{i,j}Y_{i,k}) - E(Y_{i,j}) E(Y_{i,k}) \\ & = & ~~~~~ 0 ~~~~~~~ - ~~~~\theta_j~~~~~~\theta_k \end{eqnarray*} \end{frame} \begin{frame} \frametitle{Covariance matrix for $c=4$ categories} %\framesubtitle{} {\LARGE \begin{displaymath} \boldsymbol{\Sigma}(\boldsymbol{\theta}) = \left( \begin{array}{ccc} \theta_1(1-\theta_1) & -\theta_1\theta_2 & -\theta_1\theta_3 \\ -\theta_1\theta_2 & \theta_2(1-\theta_2) & -\theta_2\theta_3 \\ -\theta_1\theta_3 & -\theta_2\theta_3 & \theta_3(1-\theta_3) \end{array} \right) \end{displaymath} } \begin{itemize} \item Last category is dropped because it's redundant. \item Consistent estimator of $\boldsymbol{\Sigma}$ is easy. \item All you need is the frequency table. \end{itemize} \end{frame} \section{Applications} \begin{frame} \frametitle{A testing problem} %\framesubtitle{} Administrators at a vocational college recognize that the percentage of students who are unemployed after graduation will vary depending upon economic conditions, but they claim that still, about twice as many students will be employed in a job related to their field of study, compared to those who get an unrelated job. To test this hypothesis, they select a random sample of 200 students from the most recent class, and observe 106 employed in a job related to their field of study, 74 employed in a job unrelated to their field of study, and 20 unemployed. Test the hypothesis using a large-sample chi-squared test, being guided by the usual 0.05 significance level. State your conclusions in symbols and words. \end{frame} \begin{frame} \frametitle{Some questions to help guide us through the problem} \begin{itemize} \item What is the model? \begin{displaymath} \mathbf{Y}_1, \ldots, \mathbf{Y}_n \stackrel{i.i.d.}{\sim} M(1,(\theta_1,\theta_2,\theta_3)) \end{displaymath} \item What is the null hypothesis, in symbols? \begin{displaymath} H_0: \theta_1=2\theta_2 \end{displaymath} \item Give the null hypothesis in matrix form as $H_0:\mathbf{L}\boldsymbol{\theta}=\mathbf{h}$ \begin{displaymath} (1, -2) \left( \begin{array}{c} \theta_1 \\ \theta_2 \end{array} \right) = (0) \end{displaymath} Last category is dropped because it's redundant. 
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Test statistic}
\framesubtitle{For the Wald-like test}
Suppose
\begin{itemize}
\item $\sqrt{n}\left(\mathbf{T}_n - \boldsymbol{\theta} \right) \stackrel{d}{\rightarrow} \mathbf{T} \sim N\left(\mathbf{0},\boldsymbol{\Sigma} \right)$ and
\item $\widehat{\boldsymbol{\Sigma}}_n \stackrel{p}{\rightarrow} \boldsymbol{\Sigma}$.
\item $H_0:\mathbf{L}\boldsymbol{\theta}=\mathbf{h}$, where $\mathbf{L}$ is $r \times p$ of full row rank.
\end{itemize}
Then
\begin{displaymath}
W_n = n \left( \mathbf{LT}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{LT}_n - \mathbf{h} \right) \stackrel{d}{\rightarrow} W \sim \chi^2(r).
\end{displaymath}
\end{frame}

\begin{frame}[fragile]
\frametitle{Calculating the test statistic with R}
\framesubtitle{Frequencies were in=106, out=74, un=20}
{\scriptsize
\begin{verbatim}
> # Easy to modify for any multinomial problem. Omit one category.
> freq = c(106,74,20); n = sum(freq)
> ybar = (freq/n)[1:2] # Just the first 2
> p = length(ybar)
> Sighat = matrix(nrow=p,ncol=p) # Empty matrix
> for(i in 1:p) { for (j in 1:p) Sighat[i,j] = -ybar[i]*ybar[j] }
> for(i in 1:p) Sighat[i,i] = ybar[i]*(1-ybar[i])
> Sighat
        [,1]    [,2]
[1,]  0.2491 -0.1961
[2,] -0.1961  0.2331
> 
> LL = rbind(c(1,-2))
> source("http://www.utstat.utoronto.ca/~brunner/appliedf13/code_n_data/lecture/Wtest.txt")
> Wtest(LL,ybar,Sighat/n)
         W         df    p-value 
4.48649474 1.00000000 0.03416366 
\end{verbatim}
} % End size
Conclude more than predicted are employed outside their field of study.
\end{frame}

\begin{frame}
\frametitle{Beer study}
%\framesubtitle{}
Under carefully controlled conditions, 120 beer drinkers each tasted 6 beers and indicated which one they liked best. Here are the numbers preferring each beer.
\begin{center}
\begin{tabular}{|l|c|c|c|c|c|c|} \hline
 & \multicolumn{6}{c|}{Preferred Beer} \\ \hline
 & 1 & 2 & 3 & 4 & 5 & 6 \\ \hline
Frequency & 30 & 24 & 22 & 28 & 9 & 7 \\ \hline
\end{tabular}
\end{center}
% x2 = rmultinom(1,size=120,prob=c(2,2,2,2,1,1)); x2
\begin{itemize}
\item State a reasonable model.
\end{itemize}
\begin{displaymath}
\mathbf{Y}_1, \ldots, \mathbf{Y}_n \stackrel{i.i.d.}{\sim} M(1,(\theta_1,\theta_2, \ldots, \theta_6))
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{The first question}
%\framesubtitle{}
Are preferences for the 6 beers different in the population from which this sample was taken?
\begin{itemize}
\item State the null hypothesis.
\begin{displaymath}
\theta_1=\theta_2=\theta_3=\theta_4=\theta_5=\frac{1}{6}
\end{displaymath}
\item Give the null hypothesis in matrix form as $H_0:\mathbf{L}\boldsymbol{\theta}=\mathbf{h}$
\begin{displaymath}
\left( \begin{array}{c c c c c}
1 & 0 & 0 & 0 & 0 \\
0 & 1 & 0 & 0 & 0 \\
0 & 0 & 1 & 0 & 0 \\
0 & 0 & 0 & 1 & 0 \\
0 & 0 & 0 & 0 & 1
\end{array} \right)
\left( \begin{array}{c} \theta_1 \\ \theta_2 \\ \theta_3 \\ \theta_4 \\ \theta_5 \end{array} \right)
=
\left( \begin{array}{c} 1/6 \\ 1/6 \\ 1/6 \\ 1/6 \\ 1/6 \end{array} \right)
\end{displaymath}
\item What are the degrees of freedom of the test?
\begin{displaymath}
df=5
\end{displaymath}
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Test for equal probabilities}
%\framesubtitle{}
{\footnotesize
\begin{verbatim}
> # Estimated covariance matrix for a multinomial:
> # Omit one category.
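> # Sighat below is the estimated Sigma(theta) from the covariance matrix
> # slide, evaluated at theta-hat = ybar: ybar[j]*(1-ybar[j]) on the
> # diagonal and -ybar[i]*ybar[j] off the diagonal.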
> freq = c(30,24,22,28,9,7); n = sum(freq)
> ybar = (freq/n)[1:5] # Omit last category
> p = length(ybar)
> Sighat = matrix(nrow=p,ncol=p) # Empty matrix
> for(i in 1:p) { for (j in 1:p) Sighat[i,j] = -ybar[i]*ybar[j] }
> for(i in 1:p) Sighat[i,i] = ybar[i]*(1-ybar[i])
> 
> # L matrix is the 5x5 identity for this hypothesis
> alleq = numeric(5) + 1/6; alleq
[1] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
> test1 = Wtest(L=diag(5),Tn=ybar,Vn=Sighat/n,h=alleq); test1
           W           df      p-value 
4.405483e+01 5.000000e+00 2.257610e-08 
> 
\end{verbatim}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Compare Pearson chi-squared test for equal probabilities}
\framesubtitle{$X^2 = \sum \frac{(\mbox{Observed}-\mbox{Expected})^2}{\mbox{Expected}}$}
{\footnotesize % or scriptsize
\begin{verbatim}
> round(test1,3)
      W      df p-value 
 44.055   5.000   0.000 
> expected = numeric(6) + n/6
> rbind(freq,expected)
         [,1] [,2] [,3] [,4] [,5] [,6]
freq       30   24   22   28    9    7
expected   20   20   20   20   20   20
> X2 = sum((freq-expected)^2/expected)
> X2
[1] 23.7
> 
> 1-pchisq(X2,5)
[1] 0.0002479085
\end{verbatim}
} % End size
\end{frame}

\begin{frame}
\frametitle{Take a look}
\framesubtitle{Where is the lack of fit coming from?}
\begin{center}
\begin{tabular}{|l|r|r|r|r|r|r|} \hline
 & \multicolumn{6}{c|}{Preferred Beer} \\ \hline
 & 1~ & 2~ & 3~ & 4~ & 5~ & 6~ \\ \hline \hline
Frequency & 30 & 24 & 22 & 28 & 9 & 7 \\ \hline
Expected & 20 & 20 & 20 & 20 & 20 & 20 \\ \hline
Residual & 10 & 4 & 2 & 8 & -11 & -13 \\ \hline
St. Residual & 2.11 & 0.91 & 0.47 & 1.73 & -3.81 & -5.06 \\ \hline
\end{tabular}
\end{center}
To standardize a residual, divide it by its estimated standard deviation.
%# R code - Make this HW
%# Examine residuals
%expect = numeric(6) + n/6
%resid = freq-expect
%p = freq/n
%se = sqrt(n*p*(1-p))
%stresid = resid/se
%round(rbind(freq,expect,resid,stresid),2)
\end{frame}

\begin{frame}
\frametitle{A follow-up question}
%\framesubtitle{}
It seems that the first 4 beers are lagers and the last two are ales. The client says no one would expect preference for lagers and ales to be the same. So let's test whether preferences differ among the 4 lagers, and at the same time, whether preference differs between the 2 ales.
\begin{itemize}
\item Why not just look at the standardized residuals, which are $Z$-tests?
\item Give the null hypothesis in symbols.
\end{itemize}
\begin{displaymath}
\begin{array}{l}
\theta_1=\theta_2=\theta_3=\theta_4 \\ \\
\theta_5 = 1-\theta_1-\theta_2-\theta_3-\theta_4-\theta_5
\end{array}
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Null hypothesis in matrix form}
\framesubtitle{For $\theta_1=\theta_2=\theta_3=\theta_4$ and $\theta_1+\theta_2+\theta_3+\theta_4+2\theta_5 = 1$}
\begin{displaymath}
\left( \begin{array}{rrrrr}
1 & -1 & 0 & 0 & 0 \\
0 & 1 & -1 & 0 & 0 \\
0 & 0 & 1 & -1 & 0 \\
1 & 1 & 1 & 1 & 2
\end{array} \right)
\left( \begin{array}{c} \theta_1 \\ \theta_2 \\ \theta_3 \\ \theta_4 \\ \theta_5 \end{array} \right)
=
\left( \begin{array}{c} 0 \\ 0 \\ 0 \\ 1 \end{array} \right)
\end{displaymath}
\vspace{10mm}
There are infinitely many right answers.
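\end{frame}

\begin{frame}
\frametitle{Another right answer}
\framesubtitle{Illustrating that the matrix form is not unique}
For example, the first three rows could compare each of the other lagers to the first lager instead:
\begin{displaymath}
\left( \begin{array}{rrrrr}
1 & -1 & 0 & 0 & 0 \\
1 & 0 & -1 & 0 & 0 \\
1 & 0 & 0 & -1 & 0 \\
1 & 1 & 1 & 1 & 2
\end{array} \right)
\left( \begin{array}{c} \theta_1 \\ \theta_2 \\ \theta_3 \\ \theta_4 \\ \theta_5 \end{array} \right)
=
\left( \begin{array}{c} 0 \\ 0 \\ 0 \\ 1 \end{array} \right)
\end{displaymath}
\vspace{5mm}
Any full-rank $\mathbf{L}$ and $\mathbf{h}$ obtained by multiplying the original ones on the left by the same invertible matrix express the same constraints, and give exactly the same value of $W_n$.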
\end{frame}

\begin{frame}[fragile]
\frametitle{Test for differences among lagers and differences between ales, simultaneously}
%\framesubtitle{}
\begin{verbatim}
> # Test for diff among lagers and diff between ales
> L2 = rbind( c(1,-1, 0, 0, 0),
+             c(0, 1,-1, 0, 0),
+             c(0, 0, 1,-1, 0),
+             c(1, 1, 1, 1, 2) )
> h2 = cbind(c(0,0,0,1))
> Wtest(L2,ybar,Sighat/n,h2)
        W        df   p-value 
1.8240899 4.0000000 0.7680721 
\end{verbatim}
% Likelihood ratio test p = 0.7735209
\end{frame}

\begin{frame}[fragile]
\frametitle{Test mean preference for ales vs. mean preference for lagers, just for completeness}
The null hypothesis is
\begin{eqnarray*}
&& \frac{1}{4} \left(\theta_1+\theta_2+\theta_3+\theta_4 \right) = \frac{1}{2} \left(\theta_5 + 1 - \sum_{j=1}^5 \theta_j \right) \\
& \Leftrightarrow & \theta_1+\theta_2+\theta_3+\theta_4 = \frac{2}{3}
\end{eqnarray*}
\begin{verbatim}
> # Average ale vs. average lager
> L3 = rbind(c(1,1,1,1,0)); h3=2/3
> Wtest(L3,ybar,Sighat/n,h3)
           W           df      p-value 
4.153846e+01 1.000000e+00 1.155747e-10 
\end{verbatim}
\end{frame}

\begin{frame}
\frametitle{State the conclusion in plain language}
Consumers tend to prefer lagers over ales. Differences among the lagers and the difference between the ales are small enough to be attributed to chance.
\end{frame}

\begin{frame}
\frametitle{Another application: Mean index numbers}
In a study of consumers' opinions of 5 popular TV programmes, 240 consumers who watch all the shows at least once a month completed a computerized interview. On one of the screens, they indicated how much they enjoyed each programme by mouse-clicking on a 10cm line. One end of the line was labelled ``Like very much," and the other end was labelled ``Dislike very much." So each respondent contributed 5 ratings, on a continuous scale from zero to ten.
\vspace{5mm}

The study was commissioned by the producers of one of the shows, which will be called ``Programme $E$." Ratings of Programmes $A$ through $D$ were expressed as percentages of the rating for Programme $E$, and these were described as ``Liking indexed to programme $E$."
\end{frame}

\begin{frame}
\frametitle{In statistical language}
We have $X_{i,1}, \ldots, X_{i,5}$ for $i = 1, \ldots, n$, and we calculate
\begin{displaymath}
Y_{i,j} = 100 \frac{X_{i,j}}{X_{i,5}}
\end{displaymath}
\begin{itemize}
\item We want confidence intervals for the 4 mean index numbers, and tests of differences between means.
\item Observations from the same respondent are definitely not independent.
\item What is the distribution?
\item What is a reasonable model?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Model}
%\framesubtitle{}
Let $\mathbf{Y}_1, \ldots, \mathbf{Y}_n$ be a random sample from an unknown multivariate distribution $F$ with expected value $\boldsymbol{\mu}$ and covariance matrix $\boldsymbol{\Sigma}$.
\vspace{10mm}

One way to think about it is
\begin{itemize}
\item The parameter is the unknown distribution $F$.
\item The parameter space is a space of distribution functions.
\item $\boldsymbol{\mu}$ and $\boldsymbol{\Sigma}$ are \emph{functions} of $F$.
\item We're only interested in $\boldsymbol{\mu}$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{We have the tools we need}
%\framesubtitle{}
\begin{itemize}
\item $\sqrt{n}(\overline{\mathbf{Y}}_n-\boldsymbol{\mu}) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N\left(\mathbf{0}, \boldsymbol{\Sigma} \right)$ and
\item For $\widehat{\boldsymbol{\Sigma}}_n \stackrel{p}{\rightarrow} \boldsymbol{\Sigma}$, use the sample covariance matrix.
\item $H_0:\mathbf{L}\boldsymbol{\mu}=\mathbf{h}$
\end{itemize}
\begin{displaymath}
W_n = n \left( \mathbf{L}\overline{\mathbf{Y}}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\boldsymbol{\Sigma}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{L}\overline{\mathbf{Y}}_n - \mathbf{h} \right)
\end{displaymath}
\end{frame}

\begin{frame}[fragile]
\frametitle{Read the data}
%\framesubtitle{}
{\tiny
\begin{verbatim}
> Y = read.table("http://www.utstat.toronto.edu/~brunner/appliedf13/code_n_data/lecture/TVshows.data")
\end{verbatim}
} % End size
\begin{verbatim}
> Y[1:4,]
      A     B     C     D
1 101.3  81.0 101.8  89.6
2  94.0  85.3  76.3 100.8
3 145.4 138.7 151.0 148.3
4  72.0  86.1  96.1  96.3
> n = dim(Y)[1]; n
[1] 240
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Confidence intervals}
%\framesubtitle{}
{\footnotesize
\begin{verbatim}
> ave = apply(Y,2,mean); ave
        A         B         C         D 
101.65958  98.50167  99.39958 103.94167 
> v = apply(Y,2,var) # Sample variances with n-1
> stderr = sqrt(v/n)
> marginerror95 = 1.96*stderr
> lower95 = ave-marginerror95
> upper95 = ave+marginerror95
> Z = (ave-100)/stderr
> rbind(ave,marginerror95,lower95,upper95,Z)
                       A          B           C          D
ave           101.659583  98.501667  99.3995833 103.941667
marginerror95   1.585652   1.876299   1.7463047   1.469928
lower95       100.073931  96.625368  97.6532786 102.471739
upper95       103.245236 100.377966 101.1458880 105.411594
Z               2.051385  -1.565173  -0.6738897   5.255814
\end{verbatim}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{What if we ``assume" normality?}
%\framesubtitle{}
{\footnotesize
\begin{verbatim}
> rbind(ave,lower95,upper95,Z)
                 A          B           C          D
ave     101.659583  98.501667  99.3995833 103.941667
lower95 100.073931  96.625368  97.6532786 102.471739
upper95 103.245236 100.377966 101.1458880 105.411594
Z         2.051385  -1.565173  -0.6738897   5.255814
> attach(Y) # So A, B, C, D are available
> t.test(A,mu=100)

        One Sample t-test

data:  A 
t = 2.0514, df = 239, p-value = 0.04132
alternative hypothesis: true mean is not equal to 100 
95 percent confidence interval:
 100.0659 103.2533 
sample estimates:
mean of x 
 101.6596 
\end{verbatim}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Test equality of means}
%\framesubtitle{}
{\footnotesize
\begin{verbatim}
> S = var(Y); S
         A         B         C        D
A 157.0779 110.77831 106.56220 109.6234
B 110.7783 219.93950  95.66686 100.3585
C 106.5622  95.66686 190.51937 106.2501
D 109.6234 100.35851 106.25006 134.9867
> cor(Y)
          A         B         C         D
A 1.0000000 0.5959991 0.6159934 0.7528355
B 0.5959991 1.0000000 0.4673480 0.5824479
C 0.6159934 0.4673480 1.0000000 0.6625431
D 0.7528355 0.5824479 0.6625431 1.0000000
> 
> L4 = rbind( c(1,-1, 0, 0),
+             c(0, 1,-1, 0),
+             c(0, 0, 1,-1) )
> Wtest(L=L4,Tn=ave,Vn=S/n)
           W           df      p-value 
7.648689e+01 3.000000e+00 2.220446e-16 
\end{verbatim}
} % End size
\end{frame}

% \subsection{Between cases}

\begin{frame}
\frametitle{Between vs. within cases}
%\framesubtitle{}
We have been comparing means. The means represent the average of what happens under different conditions.
\begin{itemize}
\item {Within cases}: A case (individual, sampling unit) contributes data in all the conditions.
\item {Between cases}: Different cases in different conditions.
\item Think of a one-sample vs. two-sample $t$-test.
\item All the examples so far have been within cases.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Independent groups (Between cases)}
\framesubtitle{Like a one-factor ANOVA}
\begin{itemize}
\item Have $n$ cases, separated into $p$ groups: Maybe occupation of main wage earner in family.
\item $n_1 + n_2 + \cdots + n_p = n$ \item Response variable is either binary or quantity of something, like annual energy consumption. \item No reason to believe normality. \item No reason to believe equal variances. \item $H_0: \mathbf{L} \boldsymbol{\mu} = \mathbf{h}$ \item For example, $H_0: \mu_1 = \ldots = \mu_p$ \item Or $\mu_2 = \mu_7$ \end{itemize} \end{frame} \begin{frame} \frametitle{Basic Idea} %\framesubtitle{} The $p$ sample means are independent random variables. Asymptotically, \begin{itemize} \item $\overline{Y}_j \sim N(\mu_j,\frac{\sigma^2_j}{n_j})$ \item The $p \times 1$ random vector $\overline{\mathbf{Y}}_n \sim N(\boldsymbol{\mu},\mathbf{V}_n)$, \item Where $\mathbf{V}_n$ is a $p \times p$ diagonal matrix with $j$th diagonal element $\frac{\sigma^2_j}{n_j}$. \item $\mathbf{L}\overline{\mathbf{Y}}_n \sim N_r(\mathbf{L}\boldsymbol{\mu},\mathbf{LV}_n\mathbf{L}^\prime)$ \item ``Estimate" $\mathbf{V}_n$ with the diagonal matrix $\widehat{\mathbf{V}}_n$, $j$th diagonal element $\frac{\widehat{\sigma}^2_j}{n_j}$ \item And if $H_0: \mathbf{L}\boldsymbol{\mu} = \mathbf{h}$ is true, \end{itemize} \begin{displaymath} W_n = \left( \mathbf{L}\overline{\mathbf{Y}}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\mathbf{V}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{L}\overline{\mathbf{Y}}_n - \mathbf{h} \right) \sim \chi^2(r) % 24 \end{displaymath} \end{frame} \begin{frame} \frametitle{One little technical issue} %\framesubtitle{} \begin{itemize} \item More than one $n_j$ is going to infinity. \item The rates at which they go to infinity can't be too different. \item In particular, if $n = n_1 + n_2 + \cdots + n_p$, \item Then each $\frac{n_j}{n}$ must converge to a non-zero constant (in probability). \end{itemize} \end{frame} \begin{frame} \frametitle{Compare High School marks for students at 3 campuses} %\framesubtitle{} \begin{center} \begin{tabular}{lccc} Campus & $n$ & Mean & Standard Deviation \\ \hline SG & 3906 & 84.94 & 5.59 \\ UTM & 1583 & 79.68 & 5.82 \\ UTSC & 1849 & 79.96 & 5.98 \\ \hline \end{tabular} \end{center} \end{frame} \begin{frame}[fragile] \frametitle{Compute $W_n = \left( \mathbf{L}\overline{\mathbf{Y}}_n - \mathbf{h} \right)^\prime \left(\mathbf{L}\widehat{\mathbf{V}}_n \mathbf{L}^\prime \right)^{-1} \left(\mathbf{L}\overline{\mathbf{Y}}_n - \mathbf{h} \right)$} \framesubtitle{$H_0:\mu_1=\mu_2=\mu_3$} \begin{center} {\tiny \begin{tabular}{lccc} Campus & $n$ & Mean & Standard Deviation \\ \hline SG & 3906 & 84.94 & 5.59 \\ UTM & 1583 & 79.68 & 5.82 \\ UTSC & 1849 & 79.96 & 5.98 \\ \hline \end{tabular} } % End size \end{center} {\tiny \begin{verbatim} > source("http://www.utstat.utoronto.ca/~brunner/appliedf13/code_n_data/lecture/Wtest.txt") \end{verbatim} } % End size {\footnotesize % or scriptsize \begin{verbatim} > n = c(3906,1583,1849) > ybar = c(84.94,79.68,79.96) > Vhat = diag(c(5.59,5.82,5.98)^2/n); Vhat [,1] [,2] [,3] [1,] 0.008000026 0.0000000 0.0000000 [2,] 0.000000000 0.0213976 0.0000000 [3,] 0.000000000 0.0000000 0.0193404 > L1 = rbind(c(1,-1,0), + c(0,1,-1) ) > Wtest(L1,ybar,Vhat) W df p-value 1441.58 2.00 0.00 \end{verbatim} } % End size \end{frame} \begin{frame}[fragile] \frametitle{Test difference between UTM and UTSC} %\framesubtitle{} {\tiny \begin{tabular}{lccc} Campus & $n$ & Mean & Standard Deviation \\ \hline SG & 3906 & 84.94 & 5.59 \\ UTM & 1583 & 79.68 & 5.82 \\ UTSC & 1849 & 79.96 & 5.98 \\ \hline \end{tabular} } % End size \vspace{10mm} {\footnotesize % or scriptsize \begin{verbatim} > # UTM vs. 
UTSC > Wtest(rbind(c(0,1,-1)),ybar,Vhat) W df p-value 1.9244931 1.0000000 0.1653622 \end{verbatim} } % End size \end{frame} \section{Delta Method} \begin{frame} \frametitle{The Delta Method} If a sequence of random vectors converges to something nice in distribution, the Delta Method helps with the convergence of \emph{functions} of those random vectors. \end{frame} \begin{frame} \frametitle{The Jacobian} %\framesubtitle{} \begin{itemize} \item Univariate version of the Delta method says $\sqrt{n}\left( g(T_n)- g(\theta) \right) \stackrel{d}{\rightarrow} g^\prime(\theta) \, T $ \item In the multivariate version, the derivative is replaced by a matrix of partial derivatives, with $i,j$ element $\frac{\partial g_i}{\partial x_j}$. \item Say the function $g$ maps $\mathbb{R}^3$ into $\mathbb{R}^2$. Then \begin{displaymath} \mbox{\.{g}}(\boldsymbol{\theta}) = \left( \begin{array}{ccc} \frac{\partial }{\partial \theta_1}g_1(\boldsymbol{\theta}) & \frac{\partial }{\partial \theta_2}g_1(\boldsymbol{\theta}) & \frac{\partial }{\partial \theta_3}g_1(\boldsymbol{\theta}) \\ \\ \frac{\partial }{\partial \theta_1}g_2(\boldsymbol{\theta}) & \frac{\partial }{\partial \theta_2}g_2(\boldsymbol{\theta}) & \frac{\partial }{\partial \theta_3}g_2(\boldsymbol{\theta}) \end{array} \right) \end{displaymath} where $\boldsymbol{\theta}=(\theta_1,\theta_2,\theta_3)^\prime$. \item It's a Jacobian. \end{itemize} \end{frame} \begin{frame} \frametitle{The Multivariate Delta Method} Let $g: \mathbb{R}^d \rightarrow \mathbb{R}^k$ be such that the elements of \.{g}$(\mathbf{x}) = \left[ \frac{\partial g_i}{\partial x_j} \right]_{k \times d}$ are continuous in a neighborhood of $\boldsymbol{\theta} \in \mathbb{R}^d$. \begin{itemize} \item If $\mathbf{T}_n$ is a sequence of $d$-dimensional random vectors such that $\sqrt{n}(\mathbf{T}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{T}$, then $\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mbox{\.{g}} (\boldsymbol{\theta}) \mathbf{T}$. \item In particular, if $\sqrt{n}(\mathbf{T}_n-\boldsymbol{\theta}) \stackrel{d}{\rightarrow} \mathbf{T} \sim N(\mathbf{0},\mathbf{\Sigma})$, then $\sqrt{n}(g(\mathbf{T}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N(\mathbf{0}, \mbox{\.{g}}(\boldsymbol{\theta})\mathbf{\Sigma}\mbox{\.{g}}(\boldsymbol{\theta}) ^\prime)$. \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Example: The Japanese car study} In a study commissioned by the Honda motor company\footnote{Not really. All the numbers are made up, based loosely on figures I found on the Internet.}, a random sample of 1200 Canadians who purchased a new Japanese car within the past 12 months gave the make of the most recent one they bought. Company executives like to look at percent differences, compared to choice of their own brand. {\scriptsize \begin{verbatim} > car = c(316, 414, 138, 254, 28, 50) > names(car) = c("Honda", "Toyota", "Nissan", "Mazda", "Mitsubishi", "Suburu") > perdiff = round(100 * (car-car[1])/car[1],1) > car Honda Toyota Nissan Mazda Mitsubishi Suburu 316 414 138 254 28 50 > perdiff Honda Toyota Nissan Mazda Mitsubishi Suburu 0.0 31.0 -56.3 -19.6 -91.1 -84.2 \end{verbatim} } % End size \end{frame} \begin{frame} \frametitle{Multinomial model with 6 categories} %\framesubtitle{} \begin{itemize} % \item It's a multinomial model with 6 categories. \item Make Honda the ``other" category, for symmetry. \item Parameters are $\boldsymbol{\theta} = (\theta_1, \ldots, \theta_5)^\prime$. 
\item For compactness, let $\theta_0=1-\sum_{k=1}^5 \theta_k$, the probability of buying a Honda. \item Estimate $\boldsymbol{\theta}$ with the vector of sample proportions $\overline{\mathbf{Y}} = (\overline{Y}_1, \ldots, \overline{Y}_5)^\prime$. \item The \emph{functions} of $\boldsymbol{\theta}$ that interest us are the population percent differences: \begin{displaymath} g_i(\boldsymbol{\theta}) = 100\left(\frac{\theta_i-\theta_0}{\theta_0}\right) = 100\left(\frac{\theta_i}{\theta_0}-1\right) \end{displaymath} for $i=1, \ldots, 5$. \end{itemize} \end{frame} \begin{frame} \frametitle{The function $g(\boldsymbol{\theta})$} \framesubtitle{Written a better way} \begin{displaymath} g(\boldsymbol{\theta}) = \left( \begin{array}{c} g_1(\boldsymbol{\theta}) \\ \\ g_2(\boldsymbol{\theta}) \\ \\ g_3(\boldsymbol{\theta}) \\ \\ g_4(\boldsymbol{\theta}) \\ \\ g_5(\boldsymbol{\theta}) \end{array} \right) = \left( \begin{array}{c} 100\left(\frac{\theta_1}{\theta_0}-1\right) \\ \\ 100\left(\frac{\theta_2}{\theta_0}-1\right) \\ \\ 100\left(\frac{\theta_3}{\theta_0}-1\right) \\ \\ 100\left(\frac{\theta_4}{\theta_0}-1\right) \\ \\ 100\left(\frac{\theta_5}{\theta_0}-1\right) \end{array} \right) \end{displaymath} \end{frame} \begin{frame} \frametitle{The Jacobian \.{g}$(\boldsymbol{\theta})$ is a $5 \times 5$ matrix of partial derivatives.} %\framesubtitle{} That's only because $g: \mathbb{R}^5 \rightarrow \mathbb{R}^5$. Usually \.{g}$(\boldsymbol{\theta})$ has more columns than rows. \vspace{10mm} \vspace{10mm} Using $\theta_0=1-\sum_{k=1}^5 \theta_k$ and noting $\frac{\partial \theta_0}{\partial \theta_j} = -1$, \begin{displaymath} \frac{\partial g_i}{\partial \theta_j} = \frac{\partial}{\partial \theta_j} 100\left(\frac{\theta_i}{\theta_0}-1\right) = \left\{ \begin{array}{ll} 100 \frac{\theta_i+\theta_0}{\theta_0^2} & \mbox{ if } i = j \\ &\\ 100 \frac{\theta_i}{\theta_0^2} & \mbox{ if } i \neq j \end{array} \right. % Need that crazy invisible right period. 
\end{displaymath} \end{frame} \begin{frame} \frametitle{The Jacobian} \begin{displaymath} \mbox{\.{g}}(\boldsymbol{\theta}) = \frac{100}{\theta_0^2} \left( \begin{array}{ccccc} \theta_1+\theta_0 & \theta_1 & \theta_1 & \theta_1 & \theta_1 \\ \theta_2 & \theta_2+\theta_0 & \theta_2 & \theta_2 & \theta_2 \\ \theta_3 & \theta_3 & \theta_3+\theta_0 & \theta_3 & \theta_3 \\ \theta_4 & \theta_4 & \theta_4 & \theta_4+\theta_0 & \theta_4 \\ \theta_5 & \theta_5 & \theta_5 & \theta_5 & \theta_5+\theta_0 \end{array} \right) \end{displaymath} \end{frame} \begin{frame} \frametitle{The asymptotic covariance matrix} %\framesubtitle{Of the vector of percent differences} The Delta Method says \begin{displaymath} \sqrt{n}(g(\overline{\mathbf{Y}}_n)-g(\boldsymbol{\theta})) \stackrel{d}{\rightarrow} \mathbf{Y} \sim N(\mathbf{0}, \mbox{\.{g}}(\boldsymbol{\theta})\mathbf{\Sigma}(\boldsymbol{\theta})\mbox{\.{g}}(\boldsymbol{\theta}) ^\prime) \end{displaymath} So the asymptotic covariance matrix of $g(\overline{\mathbf{Y}}_n)$ (the percent differences from Honda) is $\frac{1}{n}$\.g$(\boldsymbol{\theta})\mathbf{\Sigma}(\boldsymbol{\theta})\mbox{\.{g}}(\boldsymbol{\theta}) ^\prime$, where {\tiny \begin{displaymath} \mbox{\.{g}}(\boldsymbol{\theta}) = \frac{100}{\theta_0^2} \left( \begin{array}{ccccc} \theta_1+\theta_0 & \theta_1 & \theta_1 & \theta_1 & \theta_1 \\ \theta_2 & \theta_2+\theta_0 & \theta_2 & \theta_2 & \theta_2 \\ \theta_3 & \theta_3 & \theta_3+\theta_0 & \theta_3 & \theta_3 \\ \theta_4 & \theta_4 & \theta_4 & \theta_4+\theta_0 & \theta_4 \\ \theta_5 & \theta_5 & \theta_5 & \theta_5 & \theta_5+\theta_0 \end{array} \right) \end{displaymath} } % End size and {\tiny \begin{displaymath} \boldsymbol{\Sigma}(\boldsymbol{\theta}) = \left( \begin{array}{ccccc} \theta_1(1-\theta_1) & -\theta_1\theta_2 & -\theta_1\theta_3 & -\theta_1\theta_4 & -\theta_1\theta_5 \\ -\theta_1\theta_2 & \theta_2(1-\theta_2) & -\theta_2\theta_3 & -\theta_2\theta_4 & -\theta_2\theta_5 \\ -\theta_1\theta_3 & -\theta_2\theta_3 & \theta_3(1-\theta_3) & -\theta_3\theta_4 & -\theta_3\theta_5 \\ -\theta_1\theta_4 & -\theta_2\theta_4 & -\theta_3\theta_4 & \theta_4(1-\theta_4) & -\theta_4\theta_5 \\ -\theta_1\theta_5 & -\theta_2\theta_5 & -\theta_3\theta_5 & -\theta_4\theta_5 & \theta_5(1-\theta_5) \\ \end{array} \right) \end{displaymath} } % End size Estimate $\theta_j$ with $\overline{Y}_j$. \end{frame} \begin{frame}[fragile] \frametitle{Calculate $\boldsymbol{\Sigma}(\widehat{\boldsymbol{\theta}})$} {\color{blue} {\footnotesize \begin{verbatim} > # Calculate estimated Sigma(theta). Did this kind of thing earlier. 
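> # Setup assumed here: ybar and n for the car data.
> n = sum(car)         # n = 1200
> ybar = (car/n)[2:6]  # Sample proportions, omitting Honda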
> p = length(ybar)
> Sighat = matrix(nrow=p,ncol=p) # Empty matrix
> for(i in 1:p) { for (j in 1:p) Sighat[i,j] = -ybar[i]*ybar[j] }
> for(i in 1:p) Sighat[i,i] = ybar[i]*(1-ybar[i])
> Sighat
\end{verbatim}
} % End size
} % End color
{\footnotesize
\begin{verbatim}
          [,1]         [,2]         [,3]          [,4]          [,5]
[1,]  0.225975 -0.039675000 -0.073025000 -0.0080500000 -0.0143750000
[2,] -0.039675  0.101775000 -0.024341667 -0.0026833333 -0.0047916667
[3,] -0.073025 -0.024341667  0.166863889 -0.0049388889 -0.0088194444
[4,] -0.008050 -0.002683333 -0.004938889  0.0227888889 -0.0009722222
[5,] -0.014375 -0.004791667 -0.008819444 -0.0009722222  0.0399305556
\end{verbatim}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Calculate \.g$(\widehat{\boldsymbol{\theta}})$}
{\color{blue}
{\footnotesize
\begin{verbatim}
> # Calculate estimated gdot
> ybar0 = car[1]/n # Proportion buying Honda
> gdot = matrix(nrow=p,ncol=p) # Empty matrix
> for(i in 1:p) gdot[i,] = numeric(p)+ybar[i] # Replace each row
> gdot = gdot + diag(numeric(p)+ybar0)
> gdot = 100/ybar0^2 * gdot
> gdot
\end{verbatim}
} % End size
} % End color
{\footnotesize
\begin{verbatim}
          [,1]      [,2]      [,3]      [,4]      [,5]
[1,] 877.26326 497.51642 497.51642 497.51642 497.51642
[2,] 165.83881 545.58564 165.83881 165.83881 165.83881
[3,] 305.23954 305.23954 684.98638 305.23954 305.23954
[4,]  33.64845  33.64845  33.64845 413.39529  33.64845
[5,]  60.08652  60.08652  60.08652  60.08652 439.83336
\end{verbatim}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Asymptotic covariance matrix of percent differences}
{\color{blue}
{\footnotesize
\begin{verbatim}
> # The approximate asymptotic covariance matrix of
> # percent differences will be called V
> 
> V = (1/n) * gdot %*% Sighat %*% t(gdot)
> carnames = c("Toyota", "Nissan", "Mazda", "Mitsubishi", "Suburu")
> rownames(V) = carnames; colnames(V) = carnames
> V
\end{verbatim}
} % End size
} % End color
{\footnotesize
\begin{verbatim}
              Toyota    Nissan     Mazda Mitsubishi    Suburu
Toyota     95.777160 18.105819 33.325203  3.6736445 6.5600794
Nissan     18.105819 19.855174 11.108401  1.2245482 2.1866931
Mazda      33.325203 11.108401 45.882527  2.2538785 4.0247830
Mitsubishi  3.673644  1.224548  2.253878  3.0524969 0.4436769
Suburu      6.560079  2.186693  4.024783  0.4436769 5.7994905
\end{verbatim}
} % End size
\end{frame}

\begin{frame}[fragile]
\frametitle{Confidence intervals}
{\color{blue}
{\footnotesize
\begin{verbatim}
> StdError = sqrt(diag(V))
> MarginError95 = 1.96*StdError
> PercentDiff = perdiff[2:6]
> Lower95 = PercentDiff - MarginError95
> Upper95 = PercentDiff + MarginError95
> Percent = round(100*car/n,2) # Percent of sample buying each make
> 
> # Report
> rbind(car,Percent); rbind(PercentDiff,MarginError95,Lower95,Upper95)
\end{verbatim}
} % End size
} % End color
{\footnotesize
\begin{verbatim}
         Honda Toyota Nissan  Mazda Mitsubishi Suburu
car     316.00  414.0  138.0 254.00      28.00  50.00
Percent  26.33   34.5   11.5  21.17       2.33   4.17
                Toyota     Nissan      Mazda Mitsubishi     Suburu
PercentDiff   31.00000 -56.300000 -19.600000 -91.100000 -84.200000
MarginError95 19.18170   8.733592  13.276382   3.424394   4.720098
Lower95       11.81830 -65.033592 -32.876382 -94.524394 -88.920098
Upper95       50.18170 -47.566408  -6.323618 -87.675606 -79.479902
\end{verbatim}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto.
It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}.
Use any part of it as you like and share the result freely.
The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf13}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf13}}
\end{frame}

\end{document}