% Use Beamer to produce an outline-style document instead of slides.
% \documentclass[serif]{beamer} % Serif means Computer Modern math font.
% \hypersetup{colorlinks,linkcolor=,urlcolor=red}
% To create handout using article mode: Comment above and uncomment below (2 places)
\documentclass[12pt]{article}
\usepackage{beamerarticle}
\usepackage{comment}
\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=red]{hyperref} % For live Web links with href in article mode
\usepackage{fullpage}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Categorical Data Analysis\footnote{See last slide for copyright information.}}
\subtitle{STA 312: Fall 2022}
\date{} % To suppress date

\begin{document}

% More material required for handout in article mode
\title{Categorical Data Analysis\footnote{See last page for copyright information.}}
\subtitle{STA 312: Fall 2022}
\date{} % To suppress date
\maketitle

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Variables and Cases}
%\framesubtitle{Optional sub-title}
\begin{itemize}
\item There are $n$ \textbf{cases} (people, rats, factories, wolf packs) in a data set.
\item A \textbf{variable} is a characteristic or piece of information that can be recorded for each case in the data set.
\item For example, cases could be patients in a hospital, and variables could be Age, Sex, Diagnosis, Have family doctor (Yes-No), Family history of heart disease (Yes-No), etc.
\end{itemize}
\end{frame}

\begin{frame}{Variables can be Categorical, or Continuous}
\begin{itemize}
\item Categorical: Gender, Diagnosis, Job category, Have family doctor, Family history of heart disease, 5-year survival (Y-N)
\item Some categories are ordered (birth order, health status).
\item Continuous: Height, Weight, Blood pressure
\item Some questions:
\begin{itemize}
\item Are all normally distributed variables continuous?
\item Are all continuous variables quantitative?
\item Are all quantitative variables continuous?
\item Are there really any data sets with continuous variables?
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}{Variables can be Explanatory, or Response}
\begin{itemize}
\item Explanatory variables are sometimes called ``independent variables.''
\item The $x$ variables in regression are explanatory variables.
\item Response variables are sometimes called ``dependent variables.''
\item The $Y$ variable in regression is the response variable.
\item Sometimes the distinction is not useful: Does each twin get cancer, Yes or No?
\end{itemize}
\end{frame}

\begin{frame}{Our main interest is in categorical variables}
\begin{itemize}
\item Especially categorical response variables.
\item In ordinary regression, outcomes are normally distributed, and so continuous.
\item But often, the outcomes of interest are categorical:
\begin{itemize}
\item Buy the product, or not
\item Marital status 5 years after graduation
\item Survive the operation, or not
\end{itemize}
\item Ordered categorical response variables, too: for example, the highest level of hockey ever played.
\end{itemize}
\end{frame}

\begin{frame}{Distributions}{We will mostly use}
\begin{itemize}
\item Bernoulli
\item Binomial
\item Multinomial
\item Poisson
\end{itemize}
\end{frame}

\begin{frame}{The Poisson process}{Why the Poisson distribution is such a useful model for count data}
\begin{itemize}
\item Events happening randomly in space or time
\item Independent increments
\item For a small region or interval,
\begin{itemize}
\item Chance of 2 or more events is negligible
\item Chance of an event is roughly proportional to the size of the region or interval
\end{itemize}
\item Then (solving a system of differential equations), the probability of observing $x$ events in a region of size $t$ is
\begin{displaymath}
\frac{e^{-\lambda t}(\lambda t)^x }{x!} \mbox{ for } x=0,1,\ldots
\end{displaymath}
\end{itemize}
\end{frame}

\begin{frame}{Poisson process examples}{Some variables that have a Poisson distribution}
\begin{itemize}
\item Calls coming in to an emergency number
\item Customers arriving in a given time period
\item Number of raisins in a loaf of raisin bread
\item Number of bomb craters in a region after a bombing raid, London WWII
\item In a jar of peanut butter \ldots
\end{itemize}
\end{frame}
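For a quick numerical illustration of the formula (the example is made up, not from the slides), suppose calls arrive at an emergency number at an average rate of $\lambda=3$ per hour, and take $t=1$ hour. R's built-in \texttt{dpois} function evaluates $e^{-\lambda t}(\lambda t)^x/x!$ directly:

\begin{verbatim}
> lambda = 3 # Hypothetical rate: 3 calls per hour, with t = 1
> dpois(0:4, lambda) # P(x calls) for x = 0, 1, 2, 3, 4
[1] 0.04978707 0.14936121 0.22404181 0.22404181 0.16803136
> exp(-lambda) * lambda^2 / factorial(2) # Check x = 2 against the formula
[1] 0.2240418
\end{verbatim}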
\begin{frame}{Steps in the process of statistical analysis}{One possible approach}
\begin{itemize}
\item Consider a fairly realistic example or problem
\item Decide on a statistical model
\item Perhaps decide on sample size
\item Acquire data
\item Examine and clean the data; generate displays and descriptive statistics
\item Estimate parameters, perhaps by maximum likelihood
\item Carry out tests, compute confidence intervals, or both
\item Perhaps reconsider the model and go back to estimation
\item Based on the results of inference, draw conclusions about the example or problem
\end{itemize}
\end{frame}

\begin{frame}{Coffee taste test}
A fast food chain is considering a change in the blend of coffee beans they use to make their coffee. To determine whether their customers prefer the new blend, the company plans to select a random sample of $n=100$ coffee-drinking customers and ask them to taste coffee made with the new blend and with the old blend, in cups marked ``$A$'' and ``$B$.'' Half the time the new blend will be in cup $A$, and half the time it will be in cup $B$. Management wants to know if there is a difference in preference for the two blends.
\end{frame}

\begin{frame}{Statistical model}
Letting $\pi$ denote the probability that a consumer will choose the new blend, treat the data $Y_1, \ldots, Y_n$ as a random sample from a Bernoulli distribution. That is, independently for $i=1, \ldots, n$,
\begin{displaymath}
P(y_i|\pi) = \pi^{y_i} (1-\pi)^{1-y_i}
\end{displaymath}
for $y_i=0$ or $y_i=1$, and zero otherwise.
% The conditional probability notation is not in the book (I believe).
\vspace{10mm}

Note that $Y = \sum_{i=1}^n Y_i$ is the number of consumers who choose the new blend. Because $Y \sim B(n,\pi)$, the whole experiment could also be treated as a single observation from a Binomial.
\end{frame}
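The Bernoulli--Binomial connection is easy to see by simulation. Here is a minimal sketch (the seed and the value $\pi=0.6$ are arbitrary choices for illustration): summing $n=100$ simulated Bernoulli trials gives one draw from $B(100,\pi)$.

\begin{verbatim}
set.seed(312) # Arbitrary seed, for reproducibility
y = rbinom(100, size=1, prob=0.6) # Y1, ..., Y100: a Bernoulli sample
sum(y) # Y = number choosing the new blend: one draw from B(100, 0.6)
rbinom(1, size=100, prob=0.6) # A single Binomial draw -- same distribution
\end{verbatim}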
\begin{frame}{Find the MLE of $\pi$}{Show your work}
Maximize the log likelihood.
\begin{eqnarray*}
\frac{\partial}{\partial\pi} \log \ell
& = & \frac{\partial}{\partial\pi} \log\left(\prod_{i=1}^n P(y_i|\pi) \right) \\
& = & \frac{\partial}{\partial\pi} \log\left(\prod_{i=1}^n \pi^{y_i} (1-\pi)^{1-y_i} \right) \\
& = & \frac{\partial}{\partial\pi} \log\left(\pi^{\sum_{i=1}^n y_i} (1-\pi)^{n-\sum_{i=1}^n y_i}\right) \\
& = & \frac{\partial}{\partial\pi}\left((\sum_{i=1}^n y_i)\log\pi + (n-\sum_{i=1}^n y_i)\log (1-\pi) \right) \\
& = & \frac{\sum_{i=1}^n y_i}{\pi} - \frac{n-\sum_{i=1}^n y_i}{1-\pi}
\end{eqnarray*}
\end{frame}

\begin{frame}{Setting the derivative to zero,}
\begin{eqnarray*}
\frac{\sum_{i=1}^n y_i}{\pi} = \frac{n-\sum_{i=1}^n y_i}{1-\pi}
& \Rightarrow & (1-\pi)\sum_{i=1}^n y_i = \pi (n-\sum_{i=1}^n y_i) \\
& \Rightarrow & \sum_{i=1}^n y_i-\pi\sum_{i=1}^n y_i = n\pi - \pi\sum_{i=1}^n y_i \\
& \Rightarrow & \sum_{i=1}^n y_i = n\pi \\
& \Rightarrow & \pi = \frac{\sum_{i=1}^n y_i}{n} = \overline{y} = p
\end{eqnarray*}
So it looks like the MLE is the sample proportion. To be sure, carry out the second derivative test.
\end{frame}

\begin{frame}{Second derivative test}
\begin{eqnarray*}
\frac{\partial^2\log \ell}{\partial\pi^2}
& = & \frac{\partial}{\partial\pi} \left(\frac{\sum_{i=1}^n y_i}{\pi} - \frac{n-\sum_{i=1}^n y_i}{1-\pi} \right) \\
& = & \frac{-\sum_{i=1}^n y_i}{\pi^2} - \frac{n-\sum_{i=1}^n y_i}{(1-\pi)^2} \\
& = & -n\left(\frac{1-\overline{y}}{(1-\pi)^2} + \frac{\overline{y}}{\pi^2} \right) < 0
\end{eqnarray*}
\vspace{5mm}

Concave down, so it is a maximum, and $\widehat{\pi}=\overline{y}=p$.
\end{frame}

\begin{frame}[fragile]{Numerical estimate}
% Note use of fragile to make verbatim work.
Suppose 60 of the 100 consumers prefer the new blend. Give a point estimate of the parameter $\pi$. Your answer is a number.
\vspace{10mm}

\begin{verbatim}
> p = 60/100; p
[1] 0.6
\end{verbatim}
%\verb:f(x):
\end{frame}
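As a numerical check on the calculus (illustrative code, not part of the original example), the Bernoulli log likelihood can also be maximized directly with R's \texttt{optimize} function. The maximum should land at the sample proportion.

\begin{verbatim}
# Log likelihood of a Bernoulli sample, from the derivation above
loglike = function(pi, y) sum(y)*log(pi) + (length(y)-sum(y))*log(1-pi)
y = c(rep(1,60), rep(0,40)) # 60 of 100 prefer the new blend
optimize(loglike, interval=c(0.01,0.99), y=y, maximum=TRUE)
# The reported $maximum should be very close to p = 0.6
\end{verbatim}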
\begin{frame}
\frametitle{Carry out a test to answer the question}
\framesubtitle{Is there a difference in preference for the two blends?}
Start by stating the null hypothesis.
\begin{itemize}
\item $H_0: \pi=0.50$
\item $H_1: \pi \neq 0.50$
\item A case could be made for a one-sided test, but we'll stick with two-sided.
\item $\alpha=0.05$ as usual.
\item The Central Limit Theorem says $\widehat{\pi}=\overline{Y}$ is approximately normal with mean $\pi$ and variance $\frac{\pi(1-\pi)}{n}$.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Several valid test statistics for $H_0: \pi=\pi_0$ are available}{Two of them are}
% Which one do you like more? Why?
\begin{displaymath}
Z_1 = \frac{\sqrt{n}(p-\pi_0)}{\sqrt{\pi_0(1-\pi_0)}}
\end{displaymath}
and
\begin{displaymath}
Z_2 = \frac{\sqrt{n}(p-\pi_0)}{\sqrt{p(1-p)}}
\end{displaymath}
\vspace{10mm}

What is the critical value? Your answer is a number.
\begin{verbatim}
> alpha = 0.05
> qnorm(1-alpha/2)
[1] 1.959964
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Calculate the test statistic(s)}
\framesubtitle{and the $p$-value(s)}
\begin{verbatim}
> pi0 = .5; p = .6; n = 100
> Z1 = sqrt(n)*(p-pi0)/sqrt(pi0*(1-pi0)); Z1
[1] 2
> pval1 = 2 * (1-pnorm(Z1)); pval1
[1] 0.04550026
>
> Z2 = sqrt(n)*(p-pi0)/sqrt(p*(1-p)); Z2
[1] 2.041241
> pval2 = 2 * (1-pnorm(Z2)); pval2
[1] 0.04122683
\end{verbatim}
\end{frame}

\begin{frame}
\frametitle{Conclusions}
%\framesubtitle{In symbols and words: Words are more important}
\begin{itemize}
\item Do you reject $H_0$? \emph{Yes, just barely.}
\item Isn't the $\alpha=0.05$ significance level pretty arbitrary? \emph{Yes, but if people insist on a Yes or No answer, this is what you give them.}
\item What do you conclude, in symbols? $\pi \neq 0.50$. \emph{Specifically,} $\pi > 0.50$.
\item What do you conclude, in plain language? Your answer is a statement about coffee. \emph{More consumers prefer the new blend of coffee beans.}
\item Can you really draw directional conclusions when all you did was reject a non-directional null hypothesis? \emph{Yes. Decompose the two-sided size $\alpha$ test into two one-sided tests of size $\alpha/2$. This approach works in general.}
\end{itemize}
It is very important to state directional conclusions, and to state them clearly in terms of the subject matter. \textbf{Say what happened!} If you are asked to state the conclusion in plain language, your answer \emph{must} be free of statistical mumbo-jumbo.
\end{frame}

\begin{frame}
\frametitle{What about negative conclusions?}
\framesubtitle{What would you say if $Z=1.84$?}
Here are two possibilities, in plain language.
\begin{itemize}
\item ``This study does not provide clear evidence that consumers prefer one blend of coffee beans over the other.''
\item ``The results are consistent with no difference in preference for the two coffee bean blends.''
\end{itemize}
\vspace{5mm}

In this course, we will not just casually \emph{accept} the null hypothesis.
% We are taking the side of Fisher over Neyman and Pearson in an old and very nasty theoretical dispute.
\end{frame}

\begin{frame}
\frametitle{Confidence Intervals}
\framesubtitle{Approximately for large $n$,}
\begin{eqnarray*}
1-\alpha & \approx & Pr\{ -z_{\alpha/2} < Z_2 < z_{\alpha/2} \} \\
& = & Pr\left\{ -z_{\alpha/2} < \frac{\sqrt{n}(p-\pi)}{\sqrt{p(1-p)}} < z_{\alpha/2} \right\} \\
& = & Pr\left\{ p - z_{\alpha/2}\sqrt{\frac{p(1-p)}{n}} < \pi < p + z_{\alpha/2}\sqrt{\frac{p(1-p)}{n}} \right\}
\end{eqnarray*}
\begin{itemize}
\item Could express this as $p \pm z_{\alpha/2}\sqrt{\frac{p(1-p)}{n}}$.
\item $z_{\alpha/2}\sqrt{\frac{p(1-p)}{n}}$ is sometimes called the \emph{margin of error}.
\item If $\alpha=0.05$, it's the 95\% margin of error.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Give a 95\% confidence interval for the taste test data.}
\framesubtitle{The answer is a pair of numbers. Show some work.}
\begin{eqnarray*}
& & \left(p - z_{\alpha/2}\sqrt{\frac{p(1-p)}{n}} ~~,~~ p + z_{\alpha/2}\sqrt{\frac{p(1-p)}{n}} \right) \\
& & \\
&=& \left(0.60 - 1.96\sqrt{\frac{0.6\times 0.4}{100}} ~~,~~ 0.60 + 1.96\sqrt{\frac{0.6\times 0.4}{100}} \right) \\
& & \\
&=& (0.504,0.696)
\end{eqnarray*}
In a report, you could say
\begin{itemize}
\item The estimated proportion preferring the new coffee bean blend is $0.60 \pm 0.096$, or
\item ``Sixty percent of consumers preferred the new blend. These results are expected to be accurate within 10 percentage points, 19 times out of 20.''
\end{itemize}
\end{frame}
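For comparison (an aside, not part of the original analysis), R's built-in \texttt{prop.test} function packages a test and a confidence interval into one call. With \texttt{correct=FALSE} (no continuity correction), the chi-squared statistic it reports should be $Z_1^2 = 4$, with the same $p$-value of 0.0455 obtained earlier. Its confidence interval is a score-type interval, so expect it to be close to, but not exactly equal to, the $(0.504, 0.696)$ above.

\begin{verbatim}
prop.test(x=60, n=100, p=0.5, correct=FALSE)
# X-squared = Z1^2 = 4, two-sided p-value = 0.0455
# The reported interval is a score interval, not the Z2 (Wald) interval
\end{verbatim}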
\begin{frame}
\frametitle{Meaning of the confidence interval}
\begin{itemize}
\item We calculated a 95\% confidence interval of $(0.504,0.696)$ for $\pi$.
\item Does this mean $Pr\{ 0.504 < \pi < 0.696 \}=0.95$?
\item No! The quantities $0.504$, $0.696$ and $\pi$ are all constants, so $Pr\{ 0.504 < \pi < 0.696 \}$ is either zero or one.
\item The endpoints of the confidence interval are random variables, and the numbers $0.504$ and $0.696$ are \emph{realizations} of those random variables, arising from a particular random sample.
\item Meaning of the probability statement: if we were to calculate an interval in this manner for a large number of random samples, the interval would contain the true parameter around $95\%$ of the time. (A small simulation illustrating this interpretation appears near the end of the handout.)
\item So we sometimes say that we are ``$95\%$ confident'' that $0.504 < \pi < 0.696$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Confidence intervals (regions) correspond to tests}
\framesubtitle{Recall $Z_1 = \frac{\sqrt{n}(p-\pi_0)}{\sqrt{\pi_0(1-\pi_0)}}$ and $Z_2 = \frac{\sqrt{n}(p-\pi_0)}{\sqrt{p(1-p)}}$.}
From the derivation of the confidence interval,
\begin{displaymath}
-z_{\alpha/2} < Z_2 < z_{\alpha/2}
\end{displaymath}
if and only if
\begin{displaymath}
p - z_{\alpha/2}\sqrt{\frac{p(1-p)}{n}} < \pi_0 < p + z_{\alpha/2}\sqrt{\frac{p(1-p)}{n}}
\end{displaymath}
\begin{itemize}
\item So the confidence interval consists of those parameter values $\pi_0$ for which $H_0: \pi=\pi_0$ is \emph{not} rejected.
\item That is, the null hypothesis is rejected at significance level $\alpha$ if and only if the value given by the null hypothesis is outside the $(1-\alpha)\times 100\%$ confidence interval.
\item There is a confidence interval corresponding to $Z_1$ too; maybe it's better -- see Chapter~One, and the numerical sketch below.
\item In general, any test can be inverted to obtain a confidence region.
\end{itemize}
\end{frame}
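The numerical sketch promised above: the test based on $Z_1$ can be inverted by brute force, collecting the values of $\pi_0$ that are \emph{not} rejected. This grid search is only an illustration, and the grid spacing of 0.001 is arbitrary.

\begin{verbatim}
p = 0.6; n = 100
pi0 = seq(0.001, 0.999, by=0.001) # A grid of null hypothesis values
Z1 = sqrt(n)*(p-pi0)/sqrt(pi0*(1-pi0))
range(pi0[abs(Z1) < qnorm(0.975)]) # The pi0 values that are not rejected
# Expect an interval close to, but not identical to, (0.504, 0.696)
\end{verbatim}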
\begin{comment}
Cutting out power in the whole course!

\begin{frame}
\frametitle{Selecting sample size}
\begin{itemize}
\item Where did that $n=100$ come from?
\item Probably off the top of someone's head.
\item We can (and should) be more systematic.
\item Sample size can be selected
\begin{itemize}
\item To achieve a desired margin of error
\item To achieve a desired statistical power
\item In other reasonable ways
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Power}
The power of a test is the probability of rejecting $H_0$ when $H_0$ is false.
\begin{itemize}
\item More power is good.
\item Power is not just one number. It is a \emph{function} of the parameter.
\item Usually,
\begin{itemize}
\item For any $n$, the more incorrect $H_0$ is, the greater the power.
\item For any parameter value satisfying the alternative hypothesis, the larger $n$ is, the greater the power.
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Statistical power analysis}
\framesubtitle{To select sample size}
\begin{itemize}
\item Pick an effect you'd like to be able to detect -- a parameter value such that $H_0$ is false. It should be just over the boundary of interesting and meaningful.
\item Pick a desired power, a probability with which you'd like to be able to detect the effect by rejecting the null hypothesis.
\item Start with a fairly small $n$ and calculate the power. Increase the sample size until the desired power is reached.
\end{itemize}
\vspace{5mm}

There are two main issues.
\vspace{5mm}

\begin{itemize}
\item What is an ``interesting'' or ``meaningful'' parameter value?
\item How do you calculate the probability of rejecting $H_0$?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Calculating power for the test of a single proportion}
\framesubtitle{True parameter value is $\pi$}
\begin{displaymath}
Z_1 = \frac{\sqrt{n}(p-\pi_0)}{\sqrt{\pi_0(1-\pi_0)}}
\end{displaymath}
{\tiny
\begin{eqnarray*}
\mbox{Power} &=& 1-Pr\{-z_{\alpha/2} < Z_1 < z_{\alpha/2} \} \\
&=& 1-Pr\left\{-z_{\alpha/2} < \frac{\sqrt{n}(p-\pi_0)}{\sqrt{\pi_0(1-\pi_0)}} < z_{\alpha/2} \right\} \\
&=& \ldots \\
&=& 1-Pr\left\{\frac{\sqrt{n}(\pi_0-\pi)}{\sqrt{\pi(1-\pi)}} - z_{\alpha/2}\sqrt{\frac{\pi_0(1-\pi_0)}{\pi(1-\pi)}} < {\color{red} \frac{\sqrt{n}(p-\pi)}{\sqrt{\pi(1-\pi)}} } \right. \\
& & \hspace{8mm} < \left. \frac{\sqrt{n}(\pi_0-\pi)}{\sqrt{\pi(1-\pi)}} + z_{\alpha/2}\sqrt{\frac{\pi_0(1-\pi_0)}{\pi(1-\pi)}} \right\}\\
&\approx& 1-Pr\left\{\frac{\sqrt{n}(\pi_0-\pi)}{\sqrt{\pi(1-\pi)}} - z_{\alpha/2}\sqrt{\frac{\pi_0(1-\pi_0)}{\pi(1-\pi)}} < {\color{red} Z } < \frac{\sqrt{n}(\pi_0-\pi)}{\sqrt{\pi(1-\pi)}} + z_{\alpha/2}\sqrt{\frac{\pi_0(1-\pi_0)}{\pi(1-\pi)}} \right\} \\
&=& 1 - \Phi\left( \frac{\sqrt{n}(\pi_0-\pi)}{\sqrt{\pi(1-\pi)}} + z_{\alpha/2}\sqrt{\frac{\pi_0(1-\pi_0)}{\pi(1-\pi)}} \right) + \Phi\left( \frac{\sqrt{n}(\pi_0-\pi)}{\sqrt{\pi(1-\pi)}} - z_{\alpha/2}\sqrt{\frac{\pi_0(1-\pi_0)}{\pi(1-\pi)}} \right),
\end{eqnarray*}
}
where $\Phi(\cdot)$ is the cumulative distribution function of the standard normal.
\end{frame}

\begin{frame}[fragile]
\frametitle{An R function to calculate approximate power}
\framesubtitle{For the test of a single proportion}
{\tiny
\begin{displaymath}
\mbox{Power} = 1 - \Phi\left( \frac{\sqrt{n}(\pi_0-\pi)}{\sqrt{\pi(1-\pi)}} + z_{\alpha/2}\sqrt{\frac{\pi_0(1-\pi_0)}{\pi(1-\pi)}} \right) + \Phi\left( \frac{\sqrt{n}(\pi_0-\pi)}{\sqrt{\pi(1-\pi)}} - z_{\alpha/2}\sqrt{\frac{\pi_0(1-\pi_0)}{\pi(1-\pi)}} \right)
\end{displaymath}
}
\begin{verbatim}
Z1power = function(pi,n,pi0=0.50,alpha=0.05)
    {
    a = sqrt(n)*(pi0-pi)/sqrt(pi*(1-pi))
    b = qnorm(1-alpha/2) * sqrt(pi0*(1-pi0)/(pi*(1-pi)))
    Z1power = 1 - pnorm(a+b) + pnorm(a-b)
    Z1power
    } # End of function Z1power
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Some numerical examples}
\begin{verbatim}
> Z1power(0.50,100)
[1] 0.05
>
> Z1power(0.55,100)
[1] 0.168788
> Z1power(0.60,100)
[1] 0.5163234
> Z1power(0.65,100)
[1] 0.8621995
> Z1power(0.40,100)
[1] 0.5163234
> Z1power(0.55,500)
[1] 0.6093123
> Z1power(0.55,1000)
[1] 0.8865478
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Find the smallest sample size needed to detect $\pi=0.55$ as different from $\pi_0=0.50$ with probability at least 0.80}
\begin{verbatim}
> samplesize = 50
> power=Z1power(pi=0.55,n=samplesize); power
[1] 0.1076602
> while(power < 0.80)
+ {
+ samplesize = samplesize+1
+ power = Z1power(pi=0.55,n=samplesize)
+ }
> samplesize; power
[1] 783
[1] 0.8002392
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Find the smallest sample size needed to detect $\pi=0.60$ as different from $\pi_0=0.50$ with probability at least 0.80}
\begin{verbatim}
> samplesize = 50
> power=Z1power(pi=0.60,n=samplesize); power
[1] 0.2890491
> while(power < 0.80)
+ {
+ samplesize = samplesize+1
+ power = Z1power(pi=0.60,n=samplesize)
+ }
> samplesize; power
[1] 194
[1] 0.8003138
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Conclusions from the power analysis}
%\framesubtitle{}
\begin{itemize}
\item Detecting a true $\pi=0.60$ as different from $0.50$ is a reasonable goal.
\item Power with $n=100$ is barely above one half -- pathetic.
\item As Fisher said, ``To call in the statistician after the experiment is done may be no more than asking him to perform a postmortem examination: he may be able to say what the experiment died of.''
\item $n=200$ is much better.
\item How about $n=250$?
\end{itemize}
\begin{verbatim}
> Z1power(pi=0.60,n=250)
[1] 0.8901088
\end{verbatim}
It depends on what you can afford, but I like $n=250$.
\end{frame}

\begin{frame}
\frametitle{What is required of the scientist}
\framesubtitle{Who wants to select sample size by power analysis}
The scientist must specify
\begin{itemize}
\item Parameter values that he or she wants to be able to detect as different from the $H_0$ value.
\item Desired power (probability of detection)
\end{itemize}
\vspace{20mm}

It's not always easy for the scientist to think in terms of the parameters of a statistical model.
\end{frame}

End commenting out power
\end{comment}
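Before closing, here is the promised simulation check of the ``95\% of the time'' interpretation of the confidence interval. The code is an illustrative sketch; the seed, the true value $\pi=0.60$, and the $m=10000$ replications are arbitrary choices.

\begin{verbatim}
set.seed(2022) # Arbitrary seed, for reproducibility
m = 10000; n = 100; pi = 0.60 # True parameter, chosen for illustration
p = rbinom(m, n, pi)/n # m simulated sample proportions
me = qnorm(0.975) * sqrt(p*(1-p)/n) # Margin of error for each sample
mean( (p - me < pi) & (pi < p + me) ) # Estimated coverage
# Expect a value near, often slightly below, the nominal 0.95
\end{verbatim}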
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}
This document was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/312f22}{\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/312f22}}
\end{frame}

\end{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

These two additional slides may be good for STA2101 downtown.

\begin{frame}[fragile]
\frametitle{Check power calculations by simulation}
\framesubtitle{First develop and illustrate the code}
{\small
\begin{verbatim}
> pi = 0.50; pi0 = 0.50; n = 100; m = 10
> p = rbinom(m,n,pi)/n; p
 [1] 0.49 0.49 0.56 0.51 0.48 0.53 0.51 0.40 0.54 0.51
> Z1 = sqrt(n)*(p-pi0)/sqrt(pi0*(1-pi0))
> rbind(p,Z1)
    [,1]  [,2] [,3] [,4]  [,5] [,6] [,7] [,8] [,9] [,10]
p   0.49  0.49 0.56 0.51  0.48 0.53 0.51  0.4 0.54  0.51
Z1 -0.20 -0.20 1.20 0.20 -0.40 0.60 0.20 -2.0 0.80  0.20
> sig = (abs(Z1)>1.96); sig
 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
\end{verbatim}
}
\end{frame}

\begin{frame}[fragile]
\frametitle{Now a real simulation}
\framesubtitle{The first estimated probability should be about 0.05, because $\pi=\pi_0$}
{\small
\begin{verbatim}
> pi = 0.50; pi0 = 0.50; n = 100; m = 10000
> p = rbinom(m,n,pi)/n
> Z1 = sqrt(n)*(p-pi0)/sqrt(pi0*(1-pi0))
> sig = (abs(Z1)>1.96)
> sum(sig)/m
[1] 0.055
> # Power calculation for pi=0.55 said power = 0.168788
> pi = 0.55; pi0 = 0.50; n = 100; m = 10000
> p = rbinom(m,n,pi)/n
> Z1 = sqrt(n)*(p-pi0)/sqrt(pi0*(1-pi0))
> sig = (abs(Z1)>1.96)
> sum(sig)/m
[1] 0.1837
\end{verbatim}
}
\end{frame}
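One more piece of scratch material: if the power slides are restored, \texttt{uniroot} could replace the while loop for selecting sample size, treating $n$ as continuous in the power formula. This is a sketch, not part of the course materials; rounding up should reproduce the $n = 194$ found by the search for $\pi=0.60$.

\begin{verbatim}
# Assumes the Z1power function from the (commented-out) power slides
Z1power = function(pi,n,pi0=0.50,alpha=0.05)
    {
    a = sqrt(n)*(pi0-pi)/sqrt(pi*(1-pi))
    b = qnorm(1-alpha/2) * sqrt(pi0*(1-pi0)/(pi*(1-pi)))
    1 - pnorm(a+b) + pnorm(a-b)
    }
# Find where power crosses 0.80, then round up to a whole sample size
froot = function(n) Z1power(pi=0.60, n=n) - 0.80
ceiling(uniroot(froot, interval=c(50,1000))$root) # Should give 194
\end{verbatim}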