% Stats intro for Applied Stat I
% Notes and comments at the end
\documentclass[serif]{beamer} % Serif for Computer Modern math font.
% \documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
% To create handout using article mode: Comment above and uncomment below (2 places)
%\documentclass[12pt]{article}
%\usepackage{beamerarticle}
%\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=red]{hyperref} % For live Web links with href in article mode
%\usepackage{fullpage}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}}
\title{Introduction\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2014}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Background Reading}
\framesubtitle{Optional}
\begin{itemize}
\item Chapter 1 of \emph{Linear models with R}
\item Chapter 1 of Davison's \emph{Statistical models}: Data, and probability models for data.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Goal of statistical analysis}

The goal of statistical analysis is to draw reasonable conclusions from noisy numerical data.

\end{frame}

\begin{frame}{Steps in the process of statistical analysis}{One approach}
\begin{itemize}
\item Consider a fairly realistic example or problem.
\item Decide on a statistical model.
\item Perhaps decide sample size.
\item Acquire data.
\item Examine and clean the data; generate displays and descriptive statistics.
\item Estimate model parameters, for example by maximum likelihood.
\item Carry out tests, compute confidence intervals, or both. \item Perhaps re-consider the model and go back to estimation. \item Based on the results of estimation and inference, draw conclusions about the example or problem. \end{itemize} \end{frame} \begin{frame} \frametitle{What is a statistical model?} \framesubtitle{You should always be able to state the model.} \pause A \emph{statistical model} is a set of assertions that partly specify the probability distribution of the observable data. The specification may be direct or indirect. \pause \begin{itemize} \item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. \pause \item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where \begin{itemize} \item[] $\beta_0, \ldots, \beta_k$ are unknown constants. \item[] $x_{ij}$ are known constants. \item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables. \item[] $\sigma^2$ is an unknown constant. \item[] $Y_1, \ldots, Y_n$ are observable random variables. \end{itemize} \end{itemize} \pause Is the model the same thing as the \emph{truth}? \end{frame} \begin{frame} \frametitle{Parameter Space} The \emph{parameter space} is the set of values that can be taken on by the parameter. \pause \begin{itemize} \item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. \pause The parameter space is $\{(\mu,\sigma^2): -\infty < \mu < \infty, \sigma^2 > 0\}$. \pause \item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where \begin{itemize} \item[] $\beta_0, \ldots, \beta_k$ are unknown constants. \item[] $x_{ij}$ are known constants. \item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables. \item[] $\sigma^2$ is an unknown constant. 
\item[] $Y_1, \ldots, Y_n$ are observable random variables. \end{itemize} \pause The parameter space is $\{(\beta_0, \ldots, \beta_k, \sigma^2): -\infty < \beta_j < \infty, \sigma^2 > 0\}$. \end{itemize} \end{frame} \begin{frame}{Coffee taste test} A fast food chain is considering a change in the blend of coffee beans they use to make their coffee. To determine whether their customers prefer the new blend, the company plans to select a random sample of $n=100$ coffee-drinking customers and ask them to taste coffee made with the new blend and with the old blend, in cups marked ``$A$" and ``$B$." Half the time the new blend will be in cup $A$, and half the time it will be in cup $B$. Management wants to know if there is a difference in preference for the two blends. \end{frame} \begin{frame}{Statistical model} Letting $\theta$ denote the probability that a consumer will choose the new blend, treat the data $X_1, \ldots, X_n$ as a random sample from a Bernoulli distribution. That is, independently for $i=1, \ldots, n$, \begin{displaymath} P(x_i|\theta) = \theta^{x_i} (1-\theta)^{1-x_i} \end{displaymath} for $x_i=0$ or $x_i=1$, and zero otherwise. % The conditional probability notation is not in the book (I believe). \vspace{5mm} \pause \begin{itemize} \item Parameter space is the interval from zero to one. \item $\theta$ could be estimated by maximum likelihood. \item Large-sample tests and confidence intervals are available. \end{itemize} % Note that $Y = \sum_{i=1}^n Y_i$ is the number of consumers who choose the new blend. Because $Y \sim B(n,\theta)$, the whole experiment could also be treated as a single observation from a Binomial. \end{frame} % Had a slide showing calculation of MLE -- deleted. 
\begin{frame}
\frametitle{Tests of statistical hypotheses}
%\framesubtitle{}
\begin{itemize}
\item Model: $X \sim F_\theta $
\item $X$ is the data vector, and $\mathcal{X}$ is the sample space: $X \in \mathcal{X}$
\item $\theta$ is the parameter, and $\Theta$ is the parameter space: $\theta \in \Theta$
\pause
\item Null and alternative hypotheses are $H_0: \theta \in \Theta_0 \mbox{ vs. } H_A: \theta \in \Theta \cap \Theta_0^c$
\item Meaning of the \emph{null} hypothesis is that \emph{nothing} interesting is happening.
\pause
\item $\mathcal{C} \subset \mathcal{X}$ is the \emph{critical region}. Reject $H_0$ in favour of $H_A$ when $X \in \mathcal{C}$.
\pause
\item Significance level $\alpha$ (\emph{size} of the test) is the maximum probability of rejecting $H_0$ when $H_0$ is true.
\pause
\item $p$-value is the smallest value of $\alpha$ for which $H_0$ can be rejected.
\item Small $p$-values are interpreted as providing stronger evidence against the null hypothesis.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Type I and Type II error}
\framesubtitle{A Neyman-Pearson idea rather than Fisher}
\begin{itemize}
\item Type I error is to reject $H_0$ when $H_0$ is true.
\item Type II error is to \emph{not} reject $H_0$ when $H_0$ is false.
\pause
% \item Can't minimize the probability of both types of error at once.
% \item So hold the maximum probability of a Type I error (the significance level $\alpha$) to a ``small" value like $\alpha=0.05$, and then seek tests that minimize the probability of Type II error.
\item $1-Pr\{$Type II Error$\}$ is called \emph{power}.
\item Power may be used to select sample size.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Carry out a test to determine which brand of coffee is preferred}
\framesubtitle{Recall the model is $X_1, \ldots, X_n \stackrel{i.i.d.}{\sim} B(1,\theta)$}

Start by stating the null hypothesis.
\pause
\begin{itemize}
\item $H_0: \theta=0.50$
\item $H_1: \theta \neq 0.50$
\pause
\item Could you make a case for a one-sided test?
\pause
\item $\alpha=0.05$ as usual.
\pause
\item Central Limit Theorem says $\widehat{\theta}=\overline{X}$ is approximately normal with mean $\theta$ and variance $\frac{\theta(1-\theta)}{n}$.
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Several valid test statistics for $H_0: \theta=\theta_0$ are available}
{Two of them are}
% Which one do you like more? Why?
\begin{displaymath}
Z_1 = \frac{\sqrt{n}(\overline{X}-\theta_0)}{\sqrt{\theta_0(1-\theta_0)}}
\end{displaymath}
and
\begin{displaymath}
Z_2 = \frac{\sqrt{n}(\overline{X}-\theta_0)}{\sqrt{\overline{X}(1-\overline{X})}}
\end{displaymath}
\vspace{10mm}
\pause
What is the critical value? Your answer is a number.
\begin{verbatim}
> alpha = 0.05
> qnorm(1-alpha/2)
[1] 1.959964
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Calculate the test statistic and the $p$-value for each test}
\framesubtitle{Suppose 60 out of 100 preferred the new blend}
\pause
$ Z_1 = \frac{\sqrt{n}(\overline{X}-\theta_0)}{\sqrt{\theta_0(1-\theta_0)}}$
\pause
\begin{verbatim}
> theta0 = .5; ybar = .6; n = 100
> Z1 = sqrt(n)*(ybar-theta0)/sqrt(theta0*(1-theta0)); Z1
[1] 2
> pval1 = 2 * (1-pnorm(Z1)); pval1
[1] 0.04550026
\end{verbatim}
\pause
$Z_2 = \frac{\sqrt{n}(\overline{X}-\theta_0)}{\sqrt{\overline{X}(1-\overline{X})}}$
\pause
\begin{verbatim}
> Z2 = sqrt(n)*(ybar-theta0)/sqrt(ybar*(1-ybar)); Z2
[1] 2.041241
> pval2 = 2 * (1-pnorm(Z2)); pval2
[1] 0.04122683
\end{verbatim}
\end{frame}

\begin{frame}
\frametitle{Conclusions}
%\framesubtitle{In symbols and words: Words are more important}
\begin{itemize}
\item Do you reject $H_0$?
\pause
\emph{Yes, just barely.}
\pause
\item Isn't the $\alpha=0.05$ significance level pretty arbitrary?
\pause
\emph{Yes, but if people insist on a Yes or No answer, this is what you give them.}
\pause
\item What do you conclude, in symbols?
\pause
$\theta \neq 0.50$. \emph{Specifically,} $\theta > 0.50$.
\pause
\item What do you conclude, in plain language? Your answer is a statement about coffee.
\pause
\emph{More consumers prefer the new blend of coffee beans.}
\pause
\item Can you really draw directional conclusions when all you did was reject a non-directional null hypothesis?
\pause
\emph{Yes. Decompose the two-sided size $\alpha$ test into two one-sided tests of size $\alpha/2$. This approach works in general.}
\end{itemize}
\pause
It is very important to state directional conclusions, and state them clearly in terms of the subject matter. \textbf{Say what happened!} If you are asked to state the conclusion in plain language, your answer \emph{must} be free of statistical mumbo-jumbo.
\end{frame}

\begin{frame}
\frametitle{What about negative conclusions?}
\framesubtitle{What would you say if $Z=1.84$?}
\pause
Here are two possibilities, in plain language.
\begin{itemize}
\item ``This study does not provide clear evidence that consumers prefer one blend of coffee beans over the other.''
\pause
\item ``The results are consistent with no difference in preference for the two coffee bean blends.''
\end{itemize}
\vspace{5mm}
\pause
In this course, we will not just casually \emph{accept} the null hypothesis.
% We are taking the side of Fisher over Neyman and Pearson in an old and very nasty theoretical dispute.
\end{frame}

\begin{frame}
\frametitle{Confidence intervals}
\framesubtitle{Usually for individual parameters}
\begin{itemize}
\item Point estimates may give a false sense of precision.
\item We should provide a margin of probable error as well.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Give a 95\% confidence interval for the taste test data.}
\framesubtitle{The answer is a pair of numbers.
Show some work.}
\pause
\begin{eqnarray*}
& & \left(\overline{x} - z_{\alpha/2}\sqrt{\frac{\overline{x}(1-\overline{x})}{n}} ~~,~~ \overline{x} + z_{\alpha/2}\sqrt{\frac{\overline{x}(1-\overline{x})}{n}} \right) \\
& & \\
&=& \left(0.60 - 1.96\sqrt{\frac{0.6\times 0.4}{100}} ~~,~~ 0.60 + 1.96\sqrt{\frac{0.6\times 0.4}{100}} \right) \\
& & \\
&=& (0.504,0.696)
\end{eqnarray*}
\pause
In a report, you could say
\begin{itemize}
\item The estimated proportion preferring the new coffee bean blend is $0.60 \pm 0.096$, or
\item ``Sixty percent of consumers preferred the new blend. These results are expected to be accurate within 10 percentage points, 19 times out of 20.''
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Meaning of the confidence interval}
\begin{itemize}
\item We calculated a 95\% confidence interval of $(0.504,0.696)$ for $\theta$.
\item Does this mean $Pr\{ 0.504 < \theta < 0.696 \}=0.95$?
\pause
\item No! The quantities $0.504$, $0.696$ and $\theta$ are all constants, so $Pr\{ 0.504 < \theta < 0.696 \}$ is either zero or one.
\pause
\item The endpoints of the confidence interval are random variables, and the numbers $0.504$ and $0.696$ are \emph{realizations} of those random variables, arising from a particular random sample.
\pause
\item Meaning of the probability statement: If we were to calculate an interval in this manner for a large number of random samples, the interval would contain the true parameter around $95\%$ of the time.
\pause
\item The confidence interval is a guess, and the guess is either right or wrong. But the guess is constructed by a method that is right 95\% of the time.
% Take it or leave it.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{More on confidence intervals}
%\framesubtitle{Usually, confidence \emph{intervals} for single parameters}
\begin{itemize}
\item Can have confidence \emph{regions} for the entire parameter vector or multi-dimensional functions of the parameter vector.
\item Confidence regions correspond to tests. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Copyright Information} This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US} {Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website: \href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf14} {\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf14}} \end{frame} \end{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} %\framesubtitle{} \begin{itemize} \item \item \item \end{itemize} \end{frame} \begin{itemize} \item $R=R(X)$ is a function of the sample data, and \emph{not} of any unknown parameters. Otherwise you could not compute it. \item Probability calculation applies \emph{before} data are collected \item Once a particular data vector $x$ is obtained, can calculate a particular region $r=R(x)$. \item The region $r$ \item \item \pause \item \end{itemize} \begin{frame} \frametitle{Meaning of the confidence interval} \begin{itemize} \item We calculated a 95\% confidence interval of $(0.504,0.696)$ for $\theta$. \item Does this mean $Pr\{ 0.504 < \theta < 0.696 \}=0.95$? \pause \item No! The quantities $0.504$, $0.696$ and $\theta$ are all constants, so $Pr\{ 0.504 < \theta < 0.696 \}$ is either zero or one. \pause \item The endpoints of the confidence interval are random variables, and the numbers $0.504$ and $0.696$ are \emph{realizations} of those random variables, arising from a particular random sample. 
\pause \item Meaning of the probability statement: If we were to calculate an interval in this manner for a large number of random samples, the interval would contain the true parameter around $95\%$ of the time. \pause \item So we sometimes say that we are ``$95\%$ confident" that $0.504 < \theta < 0.696$. \end{itemize} \end{frame} \begin{frame} \frametitle{Confidence intervals (regions) correspond to tests} \framesubtitle{Recall $Z_1 = \frac{\sqrt{n}(\overline{Y}-\theta_0)} {\sqrt{\theta_0(1-\theta_0)}}$ and $Z_2 = \frac{\sqrt{n} (\overline{Y}-\theta_0)}{\sqrt{\overline{Y}(1-\overline{Y})}}$.} \pause From the derivation of the confidence interval, \begin{displaymath} -z_{\alpha/2} < Z_2 < z_{\alpha/2} \end{displaymath} if and only if \begin{displaymath} \overline{Y} - z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}} < \theta_0 < \overline{Y} + z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}} \end{displaymath} \pause \begin{itemize} \item So the confidence interval consists of those parameter values $\theta_0$ for which $H_0: \theta=\theta_0$ is \emph{not} rejected. \pause \item That is, the null hypothesis is rejected at significance level $\alpha$ if and only if the value given by the null hypothesis is outside the $(1-\alpha)\times 100\%$ confidence interval. \pause \item There is a confidence interval corresponding to $Z_1$ too. \item In general, any test can be inverted to obtain a confidence region. \end{itemize} \end{frame} \begin{displaymath} \begin{array}{l} Y_1, \ldots, Y_n \stackrel{i.i.d.}{\sim} F_\theta, \, \theta \in \Theta \\ \Theta_0 = \{\theta \in \Theta: \theta_1=h_1, \ldots, \theta_r=h_r \} \\ H_0: \theta \in \Theta_0 \mbox{ v.s. } H_A: \theta \in \Theta \cap \Theta_0^c, \end{array} \end{displaymath} %%%%%%%%%%%%%% \begin{frame}[fragile] % Note use of fragile to make verbatim work. \frametitle{Numerical estimate} Suppose 60 of the 100 consumers prefer the new blend. Give a point estimate of the parameter $\theta$. 
Your answer is a number. \vspace{10mm} \begin{verbatim} > p = 60/100; p [1] 0.6 \end{verbatim} \end{frame} \begin{frame} \frametitle{Confidence Intervals} \framesubtitle{Approximately for large $n$,} \pause \begin{eqnarray*} 1-\alpha & = & Pr\{ -z_{\alpha/2} < Z_2 < z_{\alpha/2} \} \\ \pause & \approx & Pr\left\{ -z_{\alpha/2} < \frac{\sqrt{n}(\overline{Y}-\theta)}{\sqrt{\overline{Y}(1-\overline{Y})}} < z_{\alpha/2} \right\} \\ \pause & = & Pr\left\{ \overline{Y} - z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}} < \theta < \overline{Y} + z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}} \right\} \end{eqnarray*} \pause \begin{itemize} \item Could express this as $\overline{Y} \pm z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}}$ \pause \item $z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}}$ is sometimes called the \emph{margin of error}. \item If $\alpha=0.05$, it's the 95\% margin of error. \end{itemize} \end{frame} \begin{frame} \frametitle{Give a 95\% confidence interval for the taste test data.} \framesubtitle{The answer is a pair of numbers. Show some work.} \pause \begin{eqnarray*} & & \left(\overline{x} - z_{\alpha/2}\sqrt{\frac{\overline{x}(1-\overline{x})}{n}} ~~,~~ \overline{x} + z_{\alpha/2}\sqrt{\frac{\overline{x}(1-\overline{x})}{n}} \right) \\ & & \\ &=& \left(0.60 - 1.96\sqrt{\frac{0.6\times 0.4}{100}} ~~,~~ 0.60 + 1.96\sqrt{\frac{0.6\times 0.4}{100}} \right) \\ & & \\ &=& (0.504,0.696) \end{eqnarray*} \pause In a report, you could say \begin{itemize} \item The estimated proportion preferring the new coffee bean blend is $0.60 \pm 0.096$, or \item ``Sixty percent of consumers preferred the new blend. These results are expected to be accurate within 10 percentage points, 19 times out of 20." \end{itemize} \end{frame} \begin{frame} \frametitle{Meaning of the confidence interval} \begin{itemize} \item We calculated a 95\% confidence interval of $(0.504,0.696)$ for $\theta$. 
\item Does this mean $Pr\{ 0.504 < \theta < 0.696 \}=0.95$? \pause \item No! The quantities $0.504$, $0.696$ and $\theta$ are all constants, so $Pr\{ 0.504 < \theta < 0.696 \}$ is either zero or one. \pause \item The endpoints of the confidence interval are random variables, and the numbers $0.504$ and $0.696$ are \emph{realizations} of those random variables, arising from a particular random sample. \pause \item Meaning of the probability statement: If we were to calculate an interval in this manner for a large number of random samples, the interval would contain the true parameter around $95\%$ of the time. \pause \item So we sometimes say that we are ``$95\%$ confident" that $0.504 < \theta < 0.696$. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Notes and comments In 2013, Cut out non-central chisq part, added nice what's a model, parameter space Non-central chisq is in 2012 version. In 2014, based on a one-vs. 2-sample t-test rather than binomial. Version 2014a wandered back into the taste test example.