% Stats intro for Applied Stat I
% Notes and comments at the end
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}

% To create handout using article mode: Comment above and uncomment below (2 places)
%\documentclass[12pt]{article}
%\usepackage{beamerarticle}
%\usepackage[colorlinks=true, pdfstartview=FitV, linkcolor=blue, citecolor=blue, urlcolor=red]{hyperref} % For live Web links with href in article mode
%\usepackage{fullpage}

\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Introduction Based on a Simple Example\footnote{See last slide for copyright information.}}
\subtitle{STA442/2101 Fall 2013}
\date{} % To suppress date

\begin{document}

% More material required for handout in article mode
%\title{Introduction Based on a Simple Example\footnote{See last slide for copyright information.}}
%\subtitle{STA442/2101 Fall 2013}
%\date{} % To suppress date
%\maketitle

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Background Reading}
\framesubtitle{Optional}
\begin{itemize}
\item Chapter 1 of \emph{Data analysis with SAS}: What's going on, and how would you say it to a client?
\item Chapter 1 of Davison's \emph{Statistical models}: Data, and probability models for data.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Goal of statistical analysis}
The goal of statistical analysis is to draw reasonable conclusions from noisy numerical data.
\end{frame}

\begin{frame}{Steps in the process of statistical analysis}{One approach}
\begin{itemize}
\item Consider a fairly realistic example or problem
\item Decide on a statistical model
\item Perhaps decide on sample size
\item Acquire data
\item Examine and clean the data; generate displays and descriptive statistics
\item Estimate model parameters, perhaps by maximum likelihood
\item Carry out tests, compute confidence intervals, or both
\item Perhaps reconsider the model and go back to estimation
\item Based on the results of estimation and inference, draw conclusions about the example or problem
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{What is a statistical model?}
\framesubtitle{You should always be able to state the model.}
\pause
A \emph{statistical model} is a set of assertions that directly or indirectly specify the probability distribution of the observable data.
\pause
\begin{itemize}
\item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$.
\pause
\item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where
\begin{itemize}
\item[] $\beta_0, \ldots, \beta_k$ are unknown constants.
\item[] $x_{ij}$ are known constants.
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables.
\item[] $\sigma^2$ is an unknown constant.
\item[] $Y_1, \ldots, Y_n$ are observable random variables.
\end{itemize}
\end{itemize}
\pause
Is the model the same thing as the \emph{truth}?
\end{frame}
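
% Added illustration (not part of the original example): simulating data from a model.
\begin{frame}[fragile]
\frametitle{A model tells you how to simulate the data}
\framesubtitle{An added sketch in R; the parameter values are made up}
Since a statistical model specifies the probability distribution of the observable data, you can always simulate data from it once you fill in values for the unknown constants. A minimal sketch for the regression model with $k=1$, assuming $n=50$, $\beta_0=1$, $\beta_1=2$ and $\sigma=3$:
\begin{verbatim}
> n = 50
> x = seq(1,n)                     # x_i are known constants
> epsilon = rnorm(n,mean=0,sd=3)   # Independent N(0,sigma^2)
> y = 1 + 2*x + epsilon            # Observable random variables
\end{verbatim}
Data simulated this way are useful for checking calculations, and later, for estimating power.
\end{frame}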
\begin{frame}
\frametitle{Parameter Space}
The \emph{parameter space} is the set of values that can be taken on by the parameter.
\pause
\begin{itemize}
\item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$.
\pause
The parameter space is $\{(\mu,\sigma^2): -\infty < \mu < \infty, \sigma^2 > 0\}$.
\pause
\item For $i=1, \ldots, n$, let $Y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_k x_{ik} + \epsilon_i$, where
\begin{itemize}
\item[] $\beta_0, \ldots, \beta_k$ are unknown constants.
\item[] $x_{ij}$ are known constants.
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables.
\item[] $\sigma^2$ is an unknown constant.
\item[] $Y_1, \ldots, Y_n$ are observable random variables.
\end{itemize}
\pause
The parameter space is $\{(\beta_0, \ldots, \beta_k, \sigma^2): -\infty < \beta_j < \infty \mbox{ for } j=0, \ldots, k, \mbox{ and } \sigma^2 > 0\}$.
\end{itemize}
\end{frame}

\begin{frame}{Coffee taste test}
A fast food chain is considering a change in the blend of coffee beans they use to make their coffee. To determine whether their customers prefer the new blend, the company plans to select a random sample of $n=100$ coffee-drinking customers and ask them to taste coffee made with the new blend and with the old blend, in cups marked ``$A$'' and ``$B$.'' Half the time the new blend will be in cup $A$, and half the time it will be in cup $B$. Management wants to know if there is a difference in preference for the two blends.
\end{frame}

\begin{frame}{Statistical model}
Letting $\theta$ denote the probability that a consumer will choose the new blend, treat the data $Y_1, \ldots, Y_n$ as a random sample from a Bernoulli distribution. That is, independently for $i=1, \ldots, n$,
\begin{displaymath}
P(y_i|\theta) = \theta^{y_i} (1-\theta)^{1-y_i}
\end{displaymath}
for $y_i=0$ or $y_i=1$, and zero otherwise.
% The conditional probability notation is not in the book (I believe).
\vspace{10mm}
\pause
Note that $Y = \sum_{i=1}^n Y_i$ is the number of consumers who choose the new blend. Because $Y \sim B(n,\theta)$, the whole experiment could also be treated as a single observation from a Binomial.
\end{frame}

\begin{frame}{Find the MLE of $\theta$}{Show your work}
\pause
Denoting the likelihood by $L(\theta)$ and the log likelihood by $\ell(\theta) = \log L(\theta)$, maximize the log likelihood.
\pause
\begin{eqnarray*}
\frac{\partial\ell}{\partial\theta} & = & \frac{\partial}{\partial\theta}
\log\left(\prod_{i=1}^n P(y_i|\theta) \right) \\ \pause
& = & \frac{\partial}{\partial\theta}
\log\left(\prod_{i=1}^n \theta^{y_i} (1-\theta)^{1-y_i} \right) \\ \pause
& = & \frac{\partial}{\partial\theta}
\log\left(\theta^{\sum_{i=1}^n y_i} (1-\theta)^{n-\sum_{i=1}^n y_i}\right) \\ \pause
& = & \frac{\partial}{\partial\theta}\left((\sum_{i=1}^n y_i)\log\theta
+ (n-\sum_{i=1}^n y_i)\log (1-\theta) \right) \\ \pause
& = & \frac{\sum_{i=1}^n y_i}{\theta} - \frac{n-\sum_{i=1}^n y_i}{1-\theta}
\end{eqnarray*}
\end{frame}

\begin{frame}{Setting the derivative to zero and solving}
\begin{itemize}
\item $\theta = \frac{\sum_{i=1}^n y_i}{n} = \overline{y}$
\pause
\item Second derivative test:
$\frac{\partial^2 \ell}{\partial\theta^2} =
-n\left(\frac{1-\overline{y}}{(1-\theta)^2} + \frac{\overline{y}}{\theta^2} \right) < 0$
\pause
\item Concave down, so this is a maximum, and the MLE is the sample proportion: $\widehat{\theta} = \overline{y}$.
\end{itemize}
\end{frame}
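
% Added illustration (not in the original slides): a numerical check of the MLE.
\begin{frame}[fragile]
\frametitle{Checking the MLE numerically}
\framesubtitle{An added sketch; the data are the hypothetical taste test results}
As a check on the calculus, maximize the log likelihood numerically with R's \texttt{optimize} function, assuming 60 of $n=100$ consumers choose the new blend (as on the next slide).
\begin{verbatim}
> y = c(rep(1,60),rep(0,40))   # 60 ones and 40 zeros
> loglike = function(theta)
+     sum(y)*log(theta) + (length(y)-sum(y))*log(1-theta)
> optimize(loglike, interval=c(0.01,0.99), maximum=TRUE)
\end{verbatim}
The \texttt{\$maximum} component of the output should agree with $\overline{y} = 0.6$ to several decimal places.
\end{frame}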
\begin{frame}[fragile] % Note use of fragile to make verbatim work.
\frametitle{Numerical estimate}
Suppose 60 of the 100 consumers prefer the new blend. Give a point estimate of the parameter $\theta$. Your answer is a number.
\vspace{10mm}
\begin{verbatim}
> p = 60/100; p
[1] 0.6
\end{verbatim}
\end{frame}

\begin{frame}
\frametitle{Carry out a test to answer the question}
\framesubtitle{Is there a difference in preference for the two blends?}
\pause
Start by stating the null hypothesis.
\pause
\begin{itemize}
\item $H_0: \theta=0.50$
\item $H_1: \theta \neq 0.50$
\pause
\item A case could be made for a one-sided test, but we'll stick with two-sided.
\pause
\item $\alpha=0.05$ as usual.
\pause
\item The Central Limit Theorem says $\widehat{\theta}=\overline{Y}$ is approximately normal with mean $\theta$ and variance $\frac{\theta(1-\theta)}{n}$.
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Several valid test statistics for $H_0: \theta=\theta_0$ are available}
\framesubtitle{Two of them are}
% Which one do you like more? Why?
\begin{displaymath}
Z_1 = \frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\theta_0(1-\theta_0)}}
\end{displaymath}
and
\begin{displaymath}
Z_2 = \frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\overline{Y}(1-\overline{Y})}}
\end{displaymath}
\vspace{10mm}
\pause
What is the critical value? Your answer is a number.
\begin{verbatim}
> alpha = 0.05
> qnorm(1-alpha/2)
[1] 1.959964
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Calculate the test statistic and the $p$-value for each test}
% \framesubtitle{Note: The R code uses $p$ for the sample proportion}
\begin{verbatim}
> theta0 = .5; ybar = .6; n = 100
> Z1 = sqrt(n)*(ybar-theta0)/sqrt(theta0*(1-theta0)); Z1
[1] 2
> pval1 = 2 * (1-pnorm(Z1)); pval1
[1] 0.04550026
\end{verbatim}
\pause
\begin{verbatim}
> Z2 = sqrt(n)*(ybar-theta0)/sqrt(ybar*(1-ybar)); Z2
[1] 2.041241
> pval2 = 2 * (1-pnorm(Z2)); pval2
[1] 0.04122683
\end{verbatim}
\end{frame}

\begin{frame}
\frametitle{Conclusions}
%\framesubtitle{In symbols and words: Words are more important}
\begin{itemize}
\item Do you reject $H_0$? \pause \emph{Yes, just barely.}
\pause
\item Isn't the $\alpha=0.05$ significance level pretty arbitrary? \pause \emph{Yes, but if people insist on a Yes or No answer, this is what you give them.}
\pause
\item What do you conclude, in symbols? \pause $\theta \neq 0.50$. \emph{Specifically,} $\theta > 0.50$.
\pause
\item What do you conclude, in plain language? Your answer is a statement about coffee. \pause \emph{More consumers prefer the new blend of coffee beans.}
\pause
\item Can you really draw directional conclusions when all you did was reject a non-directional null hypothesis? \pause \emph{Yes. Decompose the two-sided size $\alpha$ test into two one-sided tests of size $\alpha/2$. This approach works in general.}
\end{itemize}
\pause
It is very important to state directional conclusions, and to state them clearly in terms of the subject matter. \textbf{Say what happened!} If you are asked to state the conclusion in plain language, your answer \emph{must} be free of statistical mumbo-jumbo.
\end{frame}

\begin{frame}
\frametitle{What about negative conclusions?}
\framesubtitle{What would you say if $Z=1.84$?}
\pause
Here are two possibilities, in plain language.
\begin{itemize}
\item ``This study does not provide clear evidence that consumers prefer one blend of coffee beans over the other.''
\pause
\item ``The results are consistent with no difference in preference for the two coffee bean blends.''
\end{itemize}
\vspace{5mm}
\pause
In this course, we will not just casually \emph{accept} the null hypothesis.
% We are taking the side of Fisher over Neyman and Pearson in an old and very nasty theoretical dispute.
\end{frame}

\begin{frame}
\frametitle{Confidence Intervals}
\framesubtitle{Approximately for large $n$,}
\pause
\begin{eqnarray*}
1-\alpha & = & Pr\{ -z_{\alpha/2} < Z_2 < z_{\alpha/2} \} \\ \pause
& \approx & Pr\left\{ -z_{\alpha/2} <
\frac{\sqrt{n}(\overline{Y}-\theta)}{\sqrt{\overline{Y}(1-\overline{Y})}}
< z_{\alpha/2} \right\} \\ \pause
& = & Pr\left\{ \overline{Y} - z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}}
< \theta <
\overline{Y} + z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}} \right\}
\end{eqnarray*}
\pause
\begin{itemize}
\item Could express this as $\overline{Y} \pm z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}}$
\pause
\item $z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}}$ is sometimes called the \emph{margin of error}.
\item If $\alpha=0.05$, it's the 95\% margin of error.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Give a 95\% confidence interval for the taste test data.}
\framesubtitle{The answer is a pair of numbers. Show some work.}
\pause
\begin{eqnarray*}
& & \left(\overline{y} - z_{\alpha/2}\sqrt{\frac{\overline{y}(1-\overline{y})}{n}}
~~,~~
\overline{y} + z_{\alpha/2}\sqrt{\frac{\overline{y}(1-\overline{y})}{n}} \right) \\
& & \\
&=& \left(0.60 - 1.96\sqrt{\frac{0.6\times 0.4}{100}}
~~,~~
0.60 + 1.96\sqrt{\frac{0.6\times 0.4}{100}} \right) \\
& & \\
&=& (0.504,0.696)
\end{eqnarray*}
\pause
In a report, you could say
\begin{itemize}
\item The estimated proportion preferring the new coffee bean blend is $0.60 \pm 0.096$, or
\item ``Sixty percent of consumers preferred the new blend. These results are expected to be accurate within 10 percentage points, 19 times out of 20.''
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Meaning of the confidence interval}
\begin{itemize}
\item We calculated a 95\% confidence interval of $(0.504,0.696)$ for $\theta$.
\item Does this mean $Pr\{ 0.504 < \theta < 0.696 \}=0.95$?
\pause
\item No! The quantities $0.504$, $0.696$ and $\theta$ are all constants, so $Pr\{ 0.504 < \theta < 0.696 \}$ is either zero or one.
\pause
\item The endpoints of the confidence interval are random variables, and the numbers $0.504$ and $0.696$ are \emph{realizations} of those random variables, arising from a particular random sample.
\pause
\item Meaning of the probability statement: If we were to calculate an interval in this manner for a large number of random samples, the interval would contain the true parameter around $95\%$ of the time.
\pause
\item So we sometimes say that we are ``$95\%$ confident'' that $0.504 < \theta < 0.696$.
\end{itemize}
\end{frame}
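
% Added illustration (not in the original slides): simulating the coverage of the interval.
\begin{frame}[fragile]
\frametitle{Checking the coverage by simulation}
\framesubtitle{An added sketch; a true $\theta=0.60$ is assumed}
The long-run meaning of ``$95\%$ confident'' can be checked by simulation: generate many random samples, compute the interval for each, and count how often it contains the true parameter. A minimal sketch with $n=100$:
\begin{verbatim}
> set.seed(9999); m = 10000; cover = numeric(m)
> for(j in 1:m)
+     {
+     ybar = mean(rbinom(100,1,0.60))
+     me = qnorm(0.975)*sqrt(ybar*(1-ybar)/100) # Margin of error
+     cover[j] = (ybar-me < 0.60) & (0.60 < ybar+me)
+     }
> mean(cover)   # Should be roughly 0.95
\end{verbatim}
Because the interval is based on a large-sample approximation, the observed coverage will be close to, but not exactly, 0.95.
\end{frame}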
\begin{frame}
\frametitle{Confidence intervals (regions) correspond to tests}
\framesubtitle{Recall $Z_1 = \frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\theta_0(1-\theta_0)}}$ and
$Z_2 = \frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\overline{Y}(1-\overline{Y})}}$.}
\pause
From the derivation of the confidence interval,
\begin{displaymath}
-z_{\alpha/2} < Z_2 < z_{\alpha/2}
\end{displaymath}
if and only if
\begin{displaymath}
\overline{Y} - z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}}
< \theta_0 <
\overline{Y} + z_{\alpha/2}\sqrt{\frac{\overline{Y}(1-\overline{Y})}{n}}
\end{displaymath}
\pause
\begin{itemize}
\item So the confidence interval consists of those parameter values $\theta_0$ for which $H_0: \theta=\theta_0$ is \emph{not} rejected.
\pause
\item That is, the null hypothesis is rejected at significance level $\alpha$ if and only if the value given by the null hypothesis is outside the $(1-\alpha)\times 100\%$ confidence interval.
\pause
\item There is a confidence interval corresponding to $Z_1$ too.
\item In general, any test can be inverted to obtain a confidence region.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Selecting sample size}
\begin{itemize}
\item Where did that $n=100$ come from?
\item Probably off the top of someone's head.
\item We can (and should) be more systematic.
\pause
\item Sample size can be selected
\begin{itemize}
\item To achieve a desired margin of error
\item To achieve a desired statistical power
\item In other reasonable ways
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Power}
The power of a test is the probability of rejecting $H_0$ when $H_0$ is false.
\pause
\begin{itemize}
\item More power is good.
\pause
\item Power is not just one number. It is a \emph{function} of the parameter(s).
\pause
\item Usually,
\begin{itemize}
\item For any $n$, the more incorrect $H_0$ is, the greater the power.
\item For any parameter value satisfying the alternative hypothesis, the larger $n$ is, the greater the power.
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Statistical power analysis}
\framesubtitle{To select sample size}
\pause
\begin{itemize}
\item Pick an effect you'd like to be able to detect -- a parameter value such that $H_0$ is false. It should be just over the boundary between trivial and meaningful.
\pause
\item Pick a desired power: a probability with which you'd like to be able to detect the effect by rejecting the null hypothesis.
\pause
\item Start with a fairly small $n$ and calculate the power. Increase the sample size until the desired power is reached.
\end{itemize}
\pause
\vspace{5mm}
There are two main issues.
\vspace{5mm}
\begin{itemize}
\item What is an ``interesting'' or ``meaningful'' parameter value?
\item How do you calculate the probability of rejecting $H_0$? One way is by simulation, as sketched on the next slide; an approximate formula follows.
\end{itemize}
\end{frame}
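
% Added illustration (not in the original slides): estimating power by simulation.
\begin{frame}[fragile]
\frametitle{Estimating power by simulation}
\framesubtitle{An added sketch for the $Z_1$ test; a true $\theta=0.60$ is assumed}
Simulate many data sets with the parameter set to the interesting value, carry out the test on each, and record the proportion of rejections. A minimal sketch with $n=100$ and $\theta_0=0.50$:
\begin{verbatim}
> set.seed(4444)
> ybar = rbinom(10000,100,0.60)/100   # Simulated sample proportions
> Z1 = sqrt(100)*(ybar-0.50)/sqrt(0.50*(1-0.50))
> mean(abs(Z1) > qnorm(0.975))        # Proportion of rejections
\end{verbatim}
The result should be close to the value of about $0.52$ given by the approximate formula on the following slides.
\end{frame}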
\begin{frame}
\frametitle{Calculating power for the $Z_1$ test of a single proportion}
\framesubtitle{True parameter value is $\theta$}
\pause
%\begin{displaymath}
% Z_2 = \frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\overline{Y}(1-\overline{Y})}}
%\end{displaymath}
{\tiny
\begin{eqnarray*}
\mbox{Power} &=& 1-Pr\{-z_{\alpha/2} < Z_1 < z_{\alpha/2} \} \\ \pause
&=& 1-Pr\left\{-z_{\alpha/2} <
\frac{\sqrt{n}(\overline{Y}-\theta_0)}{\sqrt{\theta_0(1-\theta_0)}}
< z_{\alpha/2} \right\} \\ \pause
&& \\
&=& \ldots \\ \pause
&& \\
&=& 1-Pr\left\{\frac{\sqrt{n}(\theta_0-\theta)}{\sqrt{\theta(1-\theta)}}
- z_{\alpha/2}\sqrt{\frac{\theta_0(1-\theta_0)}{\theta(1-\theta)}}
~~~<~~~ {\color{red}\frac{\sqrt{n}(\overline{Y}-\theta)}{\sqrt{\theta(1-\theta)}} } \right. \\
& & \hspace{8mm} < \left. \frac{\sqrt{n}(\theta_0-\theta)}{\sqrt{\theta(1-\theta)}}
+ z_{\alpha/2}\sqrt{\frac{\theta_0(1-\theta_0)}{\theta(1-\theta)}} \right\} \\ \pause
&\approx& 1 - \Phi\left( \frac{\sqrt{n}(\theta_0-\theta)}{\sqrt{\theta(1-\theta)}}
+ z_{\alpha/2}\sqrt{\frac{\theta_0(1-\theta_0)}{\theta(1-\theta)}} \right)
+ \Phi\left( \frac{\sqrt{n}(\theta_0-\theta)}{\sqrt{\theta(1-\theta)}}
- z_{\alpha/2}\sqrt{\frac{\theta_0(1-\theta_0)}{\theta(1-\theta)}} \right),
\end{eqnarray*}
}
where $\Phi(\cdot)$ is the cumulative distribution function of the standard normal.
\end{frame}

\begin{frame}[fragile]
\frametitle{An R function to calculate approximate power}
\framesubtitle{For the $Z_1$ test of a single proportion}
{\scriptsize
\begin{displaymath}
\mbox{Power} = 1 - \Phi\left( \frac{\sqrt{n}(\theta_0-\theta)}{\sqrt{\theta(1-\theta)}}
+ z_{\alpha/2}\sqrt{\frac{\theta_0(1-\theta_0)}{\theta(1-\theta)}} \right)
+ \Phi\left( \frac{\sqrt{n}(\theta_0-\theta)}{\sqrt{\theta(1-\theta)}}
- z_{\alpha/2}\sqrt{\frac{\theta_0(1-\theta_0)}{\theta(1-\theta)}} \right)
\end{displaymath}
}
{\footnotesize
\begin{verbatim}
Z1power = function(theta,n,theta0=0.50,alpha=0.05)
    {
    a = sqrt(n)*(theta0-theta)/sqrt(theta*(1-theta))
    b = qnorm(1-alpha/2) * sqrt(theta0*(1-theta0)/(theta*(1-theta)))
    Z1power = 1 - pnorm(a+b) + pnorm(a-b)
    Z1power
    } # End of function Z1power

> Z1power(theta=0.50,n=100) # Should be alpha = 0.05
[1] 0.05
\end{verbatim}
}
\end{frame}

\begin{frame}[fragile]
\frametitle{Some numerical examples}
\begin{verbatim}
> Z1power(0.55,100)
[1] 0.168788
\end{verbatim}
\pause
\begin{verbatim}
> Z1power(0.60,100)
[1] 0.5163234
\end{verbatim}
\pause
\begin{verbatim}
> Z1power(0.65,100)
[1] 0.8621995
\end{verbatim}
\pause
\begin{verbatim}
> Z1power(0.40,100)
[1] 0.5163234
\end{verbatim}
\pause
\begin{verbatim}
> Z1power(0.55,500)
[1] 0.6093123
\end{verbatim}
\pause
\begin{verbatim}
> Z1power(0.55,1000)
[1] 0.8865478
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Find smallest sample size needed to detect $\theta=0.60$ as different from $\theta_0=0.50$ with probability at least 0.80}
\begin{verbatim}
> samplesize = 1; power=0
> while(power < 0.80)
+ {
+ samplesize = samplesize+1
+ power = Z1power(theta=0.60,n=samplesize)
+ }
>
> samplesize
[1] 194
> power
[1] 0.8003138
\end{verbatim}
\end{frame}

\begin{frame}[fragile]
\frametitle{Conclusions from the power analysis}
\framesubtitle{Assuming true $\theta=0.60$}
\begin{itemize}
\item Power of 0.52 with $n=100$ is pathetic.
\item As Fisher said, ``To call in the statistician after the experiment is done may be no more than asking him to perform a postmortem examination: he may be able to say what the experiment died of.''
\item $n=200$ is better.
\begin{verbatim}
> Z1power(theta=0.60,n=200)
[1] 0.8122918
\end{verbatim}
\item What sample size is required for power of 90\%?
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{What sample size is needed for a power of 90\%?}
\begin{verbatim}
> # Find sample size needed for power = 0.90
> samplesize = 1; power=0
> while(power < 0.90)
+ {
+ samplesize = samplesize+1
+ power = Z1power(theta=0.60,n=samplesize)
+ }
>
> samplesize
[1] 259
> power
[1] 0.9005493
\end{verbatim}
\end{frame}

\begin{frame}
\frametitle{What is required of the scientist}
\framesubtitle{Who wants to select sample size by power analysis}
The scientist must specify
\begin{itemize}
\item Parameter values that he or she wants to be able to detect as different from the $H_0$ value.
\item The desired power (probability of detection).
\end{itemize}
\vspace{20mm}
It's not always easy for a scientist to think in terms of the parameters of a statistical model.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}
This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/appliedf13}{\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/appliedf13}}
\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Scratch R code below is ignored by LaTeX.

Z1power = function(theta,n,theta0=0.50,alpha=0.05)
    {
    a = sqrt(n)*(theta0-theta)/sqrt(theta*(1-theta))
    b = qnorm(1-alpha/2) * sqrt(theta0*(1-theta0)/(theta*(1-theta)))
    Z1power = 1 - pnorm(a+b) + pnorm(a-b)
    Z1power
    } # End of function Z1power

# Z2power approximates power for the Z2 test. It uses the same quantity a as
# Z1power, but with b = z_{alpha/2}, because the denominator of Z2 estimates
# sqrt(theta*(1-theta)) rather than using sqrt(theta0*(1-theta0)).
Z2power = function(theta,n,theta0=0.50,alpha=0.05)
    {
    effect = sqrt(n)*(theta0-theta)/sqrt(theta*(1-theta))
    z = qnorm(1-alpha/2)
    Z2power = 1 - pnorm(effect+z) + pnorm(effect-z)
    Z2power
    } # End of function Z2power

# Find sample size needed for power = 0.90
samplesize = 1; power = 0
while(power < 0.90)
    {
    samplesize = samplesize+1
    power = Z2power(theta=0.60,n=samplesize)
    }
samplesize
power

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Notes and comments:
Cut out non-central chisq part; added nice what's a model, parameter space.
Non-central chisq is in the 2012 version.
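
# Added sketch (not in the original file): numerically check the correspondence
# between the Z2 test and the confidence interval. Testing a theta0 equal to
# either endpoint of the 95% CI should give |Z2| = z_{alpha/2} = 1.959964,
# right on the boundary of rejection.
ybar = 0.6; n = 100
me = qnorm(0.975)*sqrt(ybar*(1-ybar)/n)        # Margin of error
endpoints = c(ybar-me, ybar+me)                # 95% CI: about (0.504, 0.696)
sqrt(n)*(ybar-endpoints)/sqrt(ybar*(1-ybar))   # Should be 1.959964 and -1.959964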