% Likelihood: Has some stuff from Applied Stat I
% Notes and comments at the end
% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{AnnArbor} % CambridgeUS Blue and yellow, Shows current section title
% \usetheme{Berlin} % Blue: Displays section titles on top
% \usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
% \usetheme{Berkeley}
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Maximum Likelihood Part One\footnote{See last slide for copyright information.}}
\subtitle{STA312 Spring 2019}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Background Reading}
%\framesubtitle{}
\begin{itemize}
\item STA256/260 text on maximum likelihood.
\item STA258 text or lecture slides on confidence intervals and hypothesis tests.
\item Chapter One from \emph{Data analysis with SAS}.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Background}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Statistical Estimation and Inference} \pause
%\framesubtitle{}
\begin{itemize}
\item You want to learn from data. \pause
\item Adopt a probability model for the data. \pause
\item Often, pretend your data are sampled randomly from some population. \pause
\item In rare cases, this may even be true. \pause
\item What you wish you knew is represented by one or more \emph{unknown parameters}. \pause
\item Estimate the parameters, or draw conclusions about the parameters. \pause
\item Interpret the results in terms of the data.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Examples of probability models} \pause
\framesubtitle{Also called \emph{statistical models}}
\begin{itemize}
\item Let $X_1, \ldots, X_n$ be a random sample from a normal distribution with expected value $\mu$ and variance $\sigma^2$. \pause \linebreak
The parameters $\mu$ and $\sigma^2$ are unknown. \pause
\item[]
\item For $i=1, \ldots, n$, let $y_i = \beta_0 + \beta_1 x_i + \epsilon_i$, where \pause
\begin{itemize}
\item[] $\beta_0$ and $\beta_1$ are unknown constants. \pause
\item[] $x_1, \ldots, x_n$ are known, observable constants. \pause
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables. \pause
\item[] $\sigma^2$ is an unknown constant.
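\pause
\item[] Taken together, these assumptions say that $y_1, \ldots, y_n$ are independent, with $y_i \sim N(\beta_0 + \beta_1 x_i, \, \sigma^2)$.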
\end{itemize}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Meaning of the regression model} \pause
%\framesubtitle{}
{\small
For $i=1, \ldots, n$, let $y_i = \beta_0 + \beta_1 x_i + \epsilon_i$, where
\begin{itemize}
\item[] $\beta_0$ and $\beta_1$ are unknown constants.
\item[] $x_1, \ldots, x_n$ are known, observable constants.
\item[] $\epsilon_1, \ldots, \epsilon_n$ are independent $N(0,\sigma^2)$ random variables.
\item[] The parameters $\beta_0, \beta_1, \sigma^2$ are unknown constants.
\end{itemize}
The regression model means \pause
\begin{itemize}
\item The predictor $x$ has a rough linear connection to the outcome $y$. \pause
\item If $\beta_1>0$, low $x$ goes with low $y$ and high $x$ goes with high $y$. \pause
\item If $\beta_1<0$, low $x$ goes with high $y$ and high $x$ goes with low $y$. \pause
\item If $\beta_1=0$, then $x$ and $y$ are independent.
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Maximum Likelihood}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Maximum Likelihood}
\framesubtitle{Thank you, Mr. Fisher} \pause
\begin{itemize}
\item Denote the unknown parameter by $\theta$. \pause
\item How should we estimate $\theta$ based on the sample data? \pause
\item Choose the value of $\theta$ that yields the greatest probability of getting the observed data.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Likelihood}
\framesubtitle{Assuming independent observations (a ``random sample")} \pause
\begin{displaymath}
L(\theta) = \prod_{i=1}^n p(y_i|\theta) \pause \mbox{ or } \prod_{i=1}^n f(y_i|\theta)
\end{displaymath} \pause
{\footnotesize
\begin{itemize}
\item The likelihood is the probability of obtaining the observed data \pause -- expressed as a function of the parameter. \pause
\item If the assumed distribution of the data is discrete\pause, this statement is exactly correct. \pause
\item If the assumed distribution of the data is continuous, the likelihood is roughly proportional to the probability of observing the data. \pause
\item Maximizing the likelihood is a standard calculus problem. \pause
\item It is usually more convenient to maximize the natural log of the likelihood. \pause
\item The answer is the same because $\log(x)$ is an increasing function. \pause
\item The greater the likelihood, the greater the log likelihood.
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Mechanics}
\framesubtitle{Really basic math} \pause
I have noticed that a major obstacle for many students when doing maximum likelihood calculations is a set of basic mathematical operations that they actually know, but the mechanics are rusty, or the notation used in statistics is troublesome. So, with sincere apologies to those who don't need this, here are some basic rules.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The distributive law} \pause
%\framesubtitle{}
$a(b+c)=ab+ac$.
\pause You may see this in a form like \pause \begin{displaymath} \theta \sum_{i=1}^n x_i = \sum_{i=1}^n \theta x_i \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Power of a product is the product of powers} \pause %\framesubtitle{} $(ab)^c = a^c \, b^c$. \pause You may see this in a form like \pause \begin{displaymath} \left(\prod_{i=1}^n x_i\right)^\alpha = \prod_{i=1}^n x_i^\alpha \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Multiplication is addition of exponents} \pause %\framesubtitle{} $a^b a^c = a^{b+c}$. \pause You may see this in a form like \pause \begin{displaymath} \prod_{i=1}^n \theta e^{-\theta x_i} = \theta^n \exp(-\theta \sum_{i=1}^n x_i) \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Powering is multiplication of exponents} \pause %\framesubtitle{} $(a^b)^c = a^{bc}$. \pause You may see this in a form like \pause \begin{displaymath} (e^{\mu t + \frac{1}{2}\sigma^2 t^2})^n = e^{n\mu t + \frac{1}{2}n\sigma^2 t^2} \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Log of a product is sum of logs} \framesubtitle{$\log$ means \emph{natural} log, base $e$, possibly denoted $\ln$ on your calculator} \pause $\log(ab) = \log(a)+\log(b)$. \pause You may see this in a form like \pause \begin{displaymath} \log \prod_{i=1}^n x_i = \sum_{i=1}^n \log x_i \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Log of a power is the exponent times the log} \pause %\framesubtitle{} $\log(a^b)=b\,\log(a)$. \pause You may see this in a form like \pause \begin{displaymath} \log(\theta^n) = n \log \theta \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{The log is the inverse of the exponential function} \pause %\framesubtitle{} $\log(e^a) = a$. \pause You may see this in a form like \pause \begin{displaymath} \log\left( \theta^n \exp(-\theta \sum_{i=1}^n x_i) \right) = n \log \theta - \theta \sum_{i=1}^n x_i \end{displaymath} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{Example: Coffee taste test} \pause A fast food chain is considering a change in the blend of coffee beans they use to make their coffee. To determine whether their customers prefer the new blend, the company plans to select a random sample of $n=100$ coffee-drinking customers and ask them to taste coffee made with the new blend and with the old blend, in cups marked ``$A$" and ``$B$." Half the time the new blend will be in cup $A$, and half the time it will be in cup $B$. Management wants to know if there is a difference in preference for the two blends. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}{Statistical model} Letting $\theta$ denote the probability that a consumer will choose the new blend, treat the data $Y_1, \ldots, Y_n$ as a random sample from a Bernoulli distribution. 
\pause
That is, independently for $i=1, \ldots, n$, \pause
\begin{displaymath}
p(y_i|\theta) = \theta^{y_i} (1-\theta)^{1-y_i}
\end{displaymath} \pause
for $y_i=0$ or $y_i=1$, and zero otherwise.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Find the MLE of $\theta$}{Show your work} \pause
Denoting the likelihood by $L(\theta)$ and the log likelihood by $\ell(\theta) = \log L(\theta)$, maximize the log likelihood. \pause
\begin{eqnarray*}
\frac{\partial\ell}{\partial\theta} \pause
& = & \frac{\partial}{\partial\theta} \log\left(\prod_{i=1}^n p(y_i|\theta) \right) \\ \pause
& = & \frac{\partial}{\partial\theta} \log\left(\prod_{i=1}^n \theta^{y_i} (1-\theta)^{1-y_i} \right) \\ \pause
& = & \frac{\partial}{\partial\theta} \log\left(\theta^{\sum_{i=1}^n y_i} (1-\theta)^{n-\sum_{i=1}^n y_i}\right) \\ \pause
& = & \frac{\partial}{\partial\theta}\left((\sum_{i=1}^n y_i)\log\theta + (n-\sum_{i=1}^n y_i)\log (1-\theta) \right) \\ \pause
& = & \frac{\sum_{i=1}^n y_i}{\theta} - \frac{n-\sum_{i=1}^n y_i}{1-\theta}
\end{eqnarray*}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Setting the derivative to zero and solving} \pause
\begin{itemize}
\item $\theta = \frac{\sum_{i=1}^n y_i}{n} = \overline{y}$ \pause
\item Second derivative test: $\frac{\partial^2 \ell}{\partial\theta^2} = -n\left(\frac{1-\overline{y}}{(1-\theta)^2} + \frac{\overline{y}}{\theta^2} \right) < 0$ \pause
\item Concave down, maximum, and the MLE is the sample proportion: \pause $\widehat{\theta} = \overline{y} = p$
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile] % Note use of fragile to make verbatim work.
\frametitle{Numerical estimate}
Suppose 60 of the 100 consumers prefer the new blend. Give a point estimate of the parameter $\theta$. Your answer is a number.
\vspace{10mm} \pause
\begin{verbatim}
> p = 60/100; p
[1] 0.6
\end{verbatim}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Minus log likelihood measures lack of model fit} \pause
%\framesubtitle{}
\begin{itemize}
\item $-\ell(\theta) = -\log\prod_{i=1}^n p(x_i|\theta) \pause = \sum_{i=1}^n -\log p(x_i|\theta)$ \pause
\item The best possible fit for observation $x_i$ is $p(x_i|\theta) = P(X_i=x_i|\theta) = 1$. \pause
\item Then the log is zero. \pause
\item If $p(x_i|\theta) < 1$, then $\log p(x_i|\theta)$ is negative and $-\log p(x_i|\theta)$ is positive. \pause
\item The lower the probability (bad fit), the greater $-\log p(x_i|\theta)$ becomes. \pause
\item So maximum likelihood is minimizing the total (or average) badness of fit. \pause
\item In machine learning, the minus log likelihood would be called a \emph{loss function}. \pause
\item And estimating $\theta$ by minimizing the loss function would be called \emph{learning} about $\theta$.
% \pause
% \item Very strange.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Confidence Intervals and Tests}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Large-sample Normality}
\framesubtitle{Leading to confidence intervals and tests} \pause
\begin{itemize}
\item For the taste test example, the MLE is $\widehat{\theta} = \overline{y}$, the sample mean.
\pause
\item The Central Limit Theorem says %\footnote{Okay, it actually says something else, but this is the practical meaning.}
that if $y_1, \ldots, y_n$ are independent random variables from a distribution with expected value $\mu$ and variance $\sigma^2$, then \pause
\item The distribution of $\overline{y}_n$ is approximately normal for large samples. \pause
\item Regardless of sample size, $E(\overline{y}_n) = \mu$ and $Var(\overline{y}_n) = \frac{\sigma^2}{n}$. \pause
\item Here, the data are Bernoulli, with $\mu=\theta$ \pause and $\sigma^2 = \theta(1-\theta)$. \pause
\item Write
\begin{displaymath}
\widehat{\theta}_n \stackrel{.}{\sim} N\left(\theta,\frac{\theta(1-\theta)}{n}\right).
\end{displaymath} \pause
\item Vocabulary: $\widehat{\theta}_n$ is ``asymptotically normal," \pause with asymptotic mean $\theta$ \pause and asymptotic variance $\theta(1-\theta)/n$.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Large-sample Normality}
\framesubtitle{Still for the taste test example} \pause
\begin{itemize}
\item $\widehat{\theta}_n \stackrel{.}{\sim} N\left(\theta,\frac{\theta(1-\theta)}{n}\right)$. \pause
\item This means $Z_n = \frac{\widehat{\theta}_n - \theta}{\sqrt{\frac{\theta(1-\theta)}{n}}} \stackrel{.}{\sim} N(0,1)$. \pause
\item Also, $Z_n = \frac{\widehat{\theta}_n - \theta}{\sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}}} \stackrel{.}{\sim} N(0,1)$. \pause
\item In general, substitute the MLE for the parameter in the formula for the variance, and the Central Limit Theorem still holds. \pause
\item Substituting the sample variance $s^2$ for $\sigma^2$ also works. \pause
\end{itemize}
\begin{displaymath}
s^2 = \frac{\sum_{i=1}^n(y_i-\overline{y})^2}{n-1}
\end{displaymath}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Getting the picture}
%\framesubtitle{}
\begin{center}
\includegraphics[width=4in]{NormalCurve}
\end{center} \pause
{\small
$Z_n = \frac{\widehat{\theta}_n - \theta}{\sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}}} \stackrel{.}{\sim} N(0,1)$ means $P\{-z_{\alpha/2} < Z_n < z_{\alpha/2}\} \approx 1-\alpha$.
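\pause
Here $z_{\alpha/2}$ is the value that cuts off the upper $\alpha/2$ of the standard normal distribution; for $\alpha = 0.05$, $z_{\alpha/2} \approx 1.96$.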
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Confidence interval using $Z_n = \frac{\widehat{\theta}_n - \theta}{\sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}}} \stackrel{.}{\sim} N(0,1)$} \pause
%\framesubtitle{Using }
{\small
\begin{eqnarray*}
1-\alpha & \approx & P\{-z_{\alpha/2} < Z_n < z_{\alpha/2} \} \\ \pause
& = & P\{-z_{\alpha/2} < \frac{\widehat{\theta}_n - \theta}{\sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}}} < z_{\alpha/2} \} \\ \pause
& = & P\{-z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} < \widehat{\theta}_n - \theta < z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} \} \\ \pause
& = & P\{-\widehat{\theta}_n - z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} < -\theta < -\widehat{\theta}_n + z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} \} \\ \pause
& = & P\{\widehat{\theta}_n + z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} > \theta > \widehat{\theta}_n - z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} \} \\ \pause
& = & P\{\widehat{\theta}_n - z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} < \theta < \widehat{\theta}_n + z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} \}
\end{eqnarray*}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile] % Note use of fragile to make verbatim work.
\frametitle{Numerical confidence interval for the taste test}
\framesubtitle{Using $1-\alpha \approx P\{\widehat{\theta}_n - z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} < \theta < \widehat{\theta}_n + z_{\alpha/2} \sqrt{\frac{\widehat{\theta}_n(1-\widehat{\theta}_n)}{n}} \}$} \pause
{\color{blue}
\begin{verbatim}
> thetahat = 60/100; n = 100
> zcrit = qnorm(0.975); zcrit
\end{verbatim} \pause
} % End color
\begin{verbatim}
[1] 1.959964
\end{verbatim} \pause
{\color{blue}
\begin{verbatim}
> se = sqrt(thetahat*(1-thetahat)/n)
> c(thetahat - zcrit*se, thetahat + zcrit*se)
\end{verbatim} \pause
} % End color
\begin{verbatim}
[1] 0.5039818 0.6960182
\end{verbatim} \pause
The confidence interval is $\widehat{\theta} \pm z_{\alpha/2} \times$ standard error.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Tests of statistical hypotheses}
%\framesubtitle{}
\pause
\begin{itemize}
\item Model: $y \sim F_\theta$ \pause
\item $y$ is the data vector, and $\mathcal{Y}$ is the sample space: $y \in \mathcal{Y}$ \pause
\item $\theta$ is the parameter, and $\Theta$ is the parameter space: $\theta \in \Theta$ \pause
\item Null and alternative hypotheses: $H_0: \theta \in \Theta_0$ versus $H_1: \theta \in \Theta \cap \Theta_0^c$. \pause
\item The meaning of the \emph{null} hypothesis is that \emph{nothing} interesting is happening. \pause
\item $\mathcal{C} \subset \mathcal{Y}$ is the \emph{critical region}. Reject $H_0$ in favour of $H_1$ when $y \in \mathcal{C}$. \pause
\item The significance level $\alpha$ (the \emph{size} of the test) is the maximum probability of rejecting $H_0$ when $H_0$ is true. \pause Conventionally, $\alpha=0.05$. \pause
\item The $p$-value is the smallest value of $\alpha$ for which $H_0$ can be rejected. \pause
\item Small $p$-values are interpreted as providing stronger evidence against the null hypothesis.
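\pause
\item For the two-sided $Z$ tests used later, the $p$-value is $2\left(1-\Phi(|z|)\right)$, where $\Phi$ is the standard normal cumulative distribution function. In R, that is \texttt{2*(1-pnorm(abs(z)))}.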
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Carry out a test to determine which blend of coffee is preferred}
\framesubtitle{Recall the model is $y_1, \ldots, y_n \stackrel{i.i.d.}{\sim} B(1,\theta)$} \pause
Start by stating the null hypothesis. \pause
\begin{itemize}
\item $H_0: \theta=0.50$
\item $H_1: \theta \neq 0.50$ \pause
\item Could you make a case for a one-sided test? \pause
\item $\alpha=0.05$ as usual. \pause
\item Reject $H_0$ if $p < 0.05$.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Several valid test statistics for $H_0: \theta=\theta_0$ are available}
\framesubtitle{Based on $\overline{y} \stackrel{.}{\sim} N(\theta,\frac{\theta(1-\theta)}{n})$} \pause
Two of them are % Which one do you like more? Why?
\begin{displaymath}
Z_1 = \frac{\sqrt{n}(\overline{y}-\theta_0)}{\sqrt{\theta_0(1-\theta_0)}}
\end{displaymath}
and \pause
\begin{displaymath}
Z_2 = \frac{\sqrt{n}(\overline{y}-\theta_0)}{\sqrt{\overline{y}(1-\overline{y})}}
\end{displaymath}
\vspace{5mm} \pause
What is the critical value? Your answer is a number. \pause
\begin{verbatim}
> alpha = 0.05
> qnorm(1-alpha/2)
[1] 1.959964
\end{verbatim}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Calculate the test statistic and the $p$-value for each test}
\framesubtitle{Suppose 60 out of 100 preferred the new blend} \pause
$Z_1 = \frac{\sqrt{n}(\overline{y}-\theta_0)}{\sqrt{\theta_0(1-\theta_0)}}$ \pause
\begin{verbatim}
> theta0 = .5; ybar = .6; n = 100
> Z1 = sqrt(n)*(ybar-theta0)/sqrt(theta0*(1-theta0)); Z1
[1] 2
> pval1 = 2 * (1-pnorm(Z1)); pval1
[1] 0.04550026
\end{verbatim} \pause
$Z_2 = \frac{\sqrt{n}(\overline{y}-\theta_0)}{\sqrt{\overline{y}(1-\overline{y})}}$ \pause
\begin{verbatim}
> Z2 = sqrt(n)*(ybar-theta0)/sqrt(ybar*(1-ybar)); Z2
[1] 2.041241
> pval2 = 2 * (1-pnorm(Z2)); pval2
[1] 0.04122683
\end{verbatim}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Conclusions}
%\framesubtitle{In symbols and words: Words are more important}
\begin{itemize}
\item Do you reject $H_0$? \pause \emph{Yes, just barely.} \pause
\item Isn't the $\alpha=0.05$ significance level pretty arbitrary? \pause \linebreak
\emph{Yes, but if people insist on a Yes or No answer, this is what you give them.} \pause
\item What do you conclude, in symbols? \pause $\theta \neq 0.50$. \emph{Specifically,} $\theta > 0.50$. \pause
\item What do you conclude, in plain language? Your answer is a statement about coffee. \pause \emph{More consumers prefer the new blend of coffee beans.} \pause
\item Can you really draw directional conclusions when all you did was reject a non-directional null hypothesis? \pause \emph{Yes.}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{A technical issue}
%\framesubtitle{}
{\small
\begin{itemize}
\item In this class we will mostly avoid one-tailed tests. \pause
\item Why? Ask what would happen if the results were strong and in the opposite direction to what was predicted (dental example). \pause
\item But when $H_0$ is rejected, we still draw directional conclusions.
\pause
\item For example, if $x$ is income and $y$ is credit card debt, we test $H_0: \beta_1=0$ with a two-sided $t$-test. \pause
\item Say $p = 0.0021$ and $\widehat{\beta}_1 = 1.27$. \pause We say ``Consumers with higher incomes tend to have more credit card debt." \pause
\item Is this justified? We'd better hope so, or all we can say is ``There is a connection between income and average credit card debt." \pause
\item Then they ask: ``What's the connection? Do people with lower income have more debt?" \pause
\item And you have to say ``Sorry, I don't know." \pause
\item It's a good way to get fired, or at least look silly.
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The technical resolution}
%\framesubtitle{}
Decompose the two-sided test into a set of two one-sided tests, each with significance level $\alpha/2$. Together, the two one-sided tests are equivalent to the two-sided test.
\end{frame}

\begin{frame}
\frametitle{Two-sided test}
%\framesubtitle{}
\begin{center}
{\Large $H_0: \theta=\frac{1}{2}$ versus $H_1: \theta \neq \frac{1}{2}$, $\alpha=0.05$ }
\vspace{10mm}
\includegraphics[width=4.5in]{bothtails}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Left-sided test}
%\framesubtitle{}
\begin{center}
{\Large $H_0: \theta\geq \frac{1}{2}$ versus $H_1: \theta < \frac{1}{2}$, $\alpha=0.025$ }
\vspace{10mm}
\includegraphics[width=4.5in]{lefttail}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Right-sided test}
%\framesubtitle{}
\begin{center}
{\Large $H_0: \theta\leq \frac{1}{2}$ versus $H_1: \theta > \frac{1}{2}$, $\alpha=0.025$ }
\vspace{10mm}
\includegraphics[width=4.5in]{righttail}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Decomposing the 2-sided test into two 1-sided tests} \pause
%\framesubtitle{}
\begin{center}
\begin{tabular}{lc}
\raisebox{0.17in}{\small $H_0: \theta=\frac{1}{2}$ vs. $H_1: \theta \neq \frac{1}{2}$, $\alpha=0.05$} &
\includegraphics[width=1.5in]{bothtails} \\
\raisebox{0.17in}{\small $H_0: \theta\geq \frac{1}{2}$ vs. $H_1: \theta < \frac{1}{2}$, $\alpha=0.025$} &
\includegraphics[width=1.5in]{lefttail} \\
\raisebox{0.17in}{\small $H_0: \theta\leq \frac{1}{2}$ vs. $H_1: \theta > \frac{1}{2}$, $\alpha=0.025$} &
\includegraphics[width=1.5in]{righttail} \\
\end{tabular}
\end{center} \pause
\begin{itemize}
\item Clearly, the 2-sided test rejects $H_0$ if and only if exactly \emph{one} of the 1-sided tests rejects $H_0$. \pause
\item Carry out \emph{both} of the one-sided tests. \pause
\item Draw a directional conclusion if $H_0$ is rejected.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{That was a review of confidence intervals and tests}
%\framesubtitle{}
{\LARGE Getting back to maximum likelihood \ldots}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Continuous Models}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Continuous Random Variable $X$} \pause
%\framesubtitle{}
\begin{itemize}
\item Probability is area under a curve. \pause
\begin{center}
\includegraphics[width=2in]{area}
\end{center} \pause
\item The curve is called the \emph{probability density function}. \pause
\item It is denoted by $f(x)$ or $f_X(x)$.
\pause
\item $P(X \leq x) = F(x)$ or $F_X(x)$ is the \emph{cumulative distribution function}. \pause
\begin{center}
\includegraphics[width=2in]{bigF}
\end{center} \pause
\item $\frac{d}{dx}F(x) = f(x)$ \pause
\item And $F(x) = \int_{-\infty}^x f(t) \, dt$
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$f(x) = \frac{d}{dx}F(x)$ is not a probability}
\framesubtitle{Recall $g^\prime(x) = \lim_{h \rightarrow 0} \frac{g(x+h)-g(x)}{h}$} \pause
\begin{displaymath}
f(x) = \lim_{h \rightarrow 0} \frac{F(x+h)-F(x)}{h}
\end{displaymath}
\begin{center}
\includegraphics[width=2in]{slopeA}
\end{center}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Another way to write $f(x)$}
\framesubtitle{Instead of $\lim_{h \rightarrow 0} \frac{F(x+h)-F(x)}{h}$} \pause
\begin{displaymath}
f(x) = \lim_{h \rightarrow 0} \frac{F(x+\frac{h}{2})-F(x-\frac{h}{2})}{h}
\end{displaymath}
\begin{center}
\includegraphics[width=2in]{slopeB}
\end{center} \pause
The limiting slope is the same, if it exists.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Interpretation}
%\framesubtitle{}
{\Large
\begin{displaymath}
f(x) = \lim_{h \rightarrow 0} \frac{F(x+\frac{h}{2})-F(x-\frac{h}{2})}{h}
\end{displaymath} \pause
\vspace{5mm}
\begin{itemize}
\item $F(x+\frac{h}{2})-F(x-\frac{h}{2}) = P(x-\frac{h}{2} < X < x+\frac{h}{2})$ \pause
\item[]
\item So $f(x)$ is roughly proportional to the probability that $X$ is in a tiny interval surrounding $x$.
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Example: Exponential data} \pause
%\framesubtitle{}
\begin{itemize}
\item The lifetime of an electronic component has an exponential distribution with parameter $\lambda>0$. \pause
\item That is, $f(x|\lambda) = \lambda e^{-\lambda x}$ for $x>0$, and zero for $x \leq 0$. \pause
\item Let $X_1, \ldots, X_n$ be a random sample of lifetimes. \pause
\item What is the likelihood function? Simplify. \pause
\begin{displaymath}
L(\lambda) = \pause \prod_{i=1}^n \lambda e^{-\lambda x_i} \pause = \lambda^n e^{-\lambda \sum_{i=1}^n x_i}
\end{displaymath} \pause
\item Note that $x_1, \ldots, x_n$ are the observed data values. \pause
\item The likelihood is roughly proportional to the probability of obtaining a set of data values in a tiny neighbourhood of the observed sample data.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Find the MLE} \pause
\framesubtitle{Differentiate the log likelihood}
\begin{eqnarray*}
\frac{d}{d\lambda}\ell(\lambda) & = & \frac{d}{d\lambda}\log L(\lambda) \\ \pause
& = & \frac{d}{d\lambda}\log\left( \lambda^n e^{-\lambda \sum_{i=1}^n x_i} \right) \\ \pause
& = & \frac{d}{d\lambda}\left( n\log\lambda -\lambda \sum_{i=1}^n x_i \right) \\ \pause
& = & \frac{n}{\lambda} - \sum_{i=1}^n x_i \pause \stackrel{set}{=} 0 \\ \pause
& \Rightarrow & \lambda = \frac{n}{\sum_{i=1}^n x_i}
\end{eqnarray*} \pause
So $\widehat{\lambda} = \frac{n}{\sum_{i=1}^n x_i} = 1/\overline{x}$. \pause
The second derivative is $\ell^{\prime\prime}(\lambda) = -n/\lambda^2 < 0$, so this is indeed a maximum.
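\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Checking $\widehat{\lambda} = 1/\overline{x}$ numerically}
\framesubtitle{A sketch in R; the data values are made up just for illustration}
\pause
One way to check the formula is to maximize $\ell(\lambda)$ numerically and compare.
Here \texttt{optimize} searches an interval that is assumed to contain $\widehat{\lambda}$.
\pause
{\footnotesize
\begin{verbatim}
> x = c(2, 3, 7)     # Made-up lifetimes, so xbar = 4
> 1/mean(x)          # MLE from the formula
[1] 0.25
> loglike = function(lambda) length(x)*log(lambda) - lambda*sum(x)
> # Numerical maximum: should agree with 0.25 to several decimals
> lambdahat = optimize(loglike, c(0.001, 10), maximum=TRUE)$maximum
\end{verbatim}
} % End size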
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Large Samples}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Large-sample normality of the MLE} \pause
%\framesubtitle{}
\begin{itemize}
\item For the coffee taste test (Bernoulli) example, the MLE $\widehat{\theta}$ was approximately normal \pause because (in that example) $\widehat{\theta} = \overline{y}$, \pause and the Central Limit Theorem says $\overline{y}$ is approximately normal for large samples. \pause
\item But the result holds more generally. \pause
\item Under some technical conditions that are satisfied in this class, the distribution of the maximum likelihood estimate is approximately normal for large samples. \pause
\item The distribution of a \emph{vector} of MLEs is approximately multivariate normal. \pause
\item Thank you, Mr. Wald.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{A Central Limit Theorem for the MLE}
\framesubtitle{Based indirectly on the usual Central Limit Theorem} \pause
\begin{displaymath}
\widehat{\theta}_n \stackrel{.}{\sim} N\left(\theta,\frac{1}{n \, I(\theta)}\right)
\end{displaymath} \pause
where $I(\theta)$ is the \emph{Fisher Information} in one observation. \pause
\begin{displaymath}
I(\theta) = E\left[\frac{\partial^2}{\partial\theta^2} \left(-\log f(X|\theta)\right)\right] \pause
= -E\left[\frac{\partial^2}{\partial\theta^2} \log f(X|\theta)\right]
\end{displaymath} \pause
Here's the idea. \pause
\begin{itemize}
\item You are finding the MLE by \emph{minimizing} the \emph{minus} log likelihood function. \pause
\item And doing the second derivative test to see if it's really a minimum. \pause
\item But the likelihood is a random quantity, because the $X_i$ values are random variables. \pause
\item So take the expected value.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Fisher Information in the whole sample}
\framesubtitle{$I(\theta) = -E \frac{\partial^2}{\partial\theta^2} \log f(X|\theta)$ is the information in one observation.} \pause
%{\small
\begin{eqnarray*}
-E\frac{\partial^2}{\partial\theta^2} \log L(\theta) \pause
& = & -E\frac{\partial^2}{\partial\theta^2} \log \prod_{i=1}^n f(X_i|\theta) \\ \pause
& = & -E\frac{\partial^2}{\partial\theta^2} \sum_{i=1}^n \log f(X_i|\theta) \\ \pause
& = & \sum_{i=1}^n -E\frac{\partial^2}{\partial\theta^2} \log f(X_i|\theta) \\ \pause
& = & n \, I(\theta)
\end{eqnarray*}
%} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Variance and curvature of the log likelihood} \pause
%\framesubtitle{}
{\small
\begin{itemize}
\item Fisher observed that some likelihood functions are almost flat at the MLE, while others have a lot of curvature (big second derivative). \pause
\item Likelihoods with more curvature contain more information about the location of the parameter. \pause
\item The Fisher information in the whole sample \pause (that's $nI(\theta)$) \pause is \pause the expected curvature of the minus log likelihood, at the true parameter value. \pause
\item Fisher's great insight was that the curvature is deeply related to the variance of the MLE. \pause
\item The more the curvature, the smaller the variance. \pause
\item The asymptotic variance of the MLE is $v_n = \frac{1}{nI(\theta)}$.
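\pause
\item For example, in the coffee taste test (Bernoulli) model, $I(\theta) = \frac{1}{\theta(1-\theta)}$, so that $v_n = \frac{\theta(1-\theta)}{n}$, the asymptotic variance we saw earlier.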
\pause
\item For many examples it's exactly the variance. \pause
\item Fisher discovered this. Wald proved asymptotic normality under general conditions.
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Estimating the asymptotic variance $v_n = \frac{1}{nI(\theta)}$} \pause
% \framesubtitle{}
\begin{itemize}
\item For tests and confidence intervals, we need to \emph{estimate} the asymptotic variance of the MLE. \pause
\item There are (at least) two good ways. \pause
\item The first is to use $\frac{1}{nI(\widehat{\theta})}$. \pause
\item The other estimate is based on the Fisher information in the whole sample: \pause
$nI(\theta) = -E\frac{\partial^2}{\partial\theta^2} \log L(\theta) \pause = -E\frac{\partial^2}{\partial\theta^2} \ell(\theta)$. \pause
\item Instead of calculating the expected value and then substituting $\theta=\widehat{\theta}$, \pause just substitute $\theta=\widehat{\theta}$ in the first place. \pause
\item The result is sometimes called the \emph{observed} Fisher information: \pause
\begin{displaymath}
\widehat{nI(\theta)} = \left. -\frac{\partial^2}{\partial\theta^2} \log L(\theta) \right|_{\theta=\widehat{\theta}} \pause
= -\ell^{\prime\prime}\left(\widehat{\theta}\right)
\end{displaymath} \pause
\item Often, the two estimates are identical. They are always close for large samples.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Observed Fisher Information: $-\ell^{\prime\prime}(\widehat{\theta})$}
\framesubtitle{The second derivative of the minus log likelihood, evaluated at the MLE} \pause
We now have a convenient recipe for the standard error (estimated standard deviation) of the MLE. \pause
\begin{itemize}
\item Differentiate the log likelihood function and set it to zero; solve for the MLE. \pause
\item Carry out the second derivative test to make sure it's a maximum. \pause
\item That is, differentiate again and substitute $\theta=\widehat{\theta}$. \pause
\item Multiply by minus one and invert it (one over). That's the estimated variance of the MLE. \pause
\item Take the square root, and you have the standard error.
\end{itemize}
\begin{displaymath}
S_{\widehat{\theta}} = \frac{1}{\sqrt{-\ell^{\prime\prime}(\widehat{\theta})}}
\end{displaymath}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{What do you need to be able to do?} \pause
%\framesubtitle{}
Given a model and a set of numerical data, \pause
\begin{itemize}
\item Derive a formula for $\widehat{\theta}$. \pause
\item Calculate a numerical point estimate of $\theta$ from the sample data. \pause The answer is a number. \pause
\item Give a formula for the estimated variance of $\widehat{\theta}$. \pause Use $\widehat{v}_n = \frac{1}{-\ell^{\prime\prime}(\widehat{\theta})}$. \pause
\item Calculate a 95\% confidence interval for $\theta$. \pause \\ Use $\widehat{\theta} \pm z_{\alpha/2} \times$ standard error, with $z_{\alpha/2} = 1.96$. \pause The answer is a pair of numbers. \pause
\item Test $H_0:\theta = \theta_0$. \pause Use
\begin{displaymath}
Z_n = \frac{\widehat{\theta}-\theta_0}{\sqrt{\widehat{v}_n}}.
\end{displaymath} \pause
\item We need an example.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/312s19}
{\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/312s19}}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}