\documentclass[serif]{beamer} % Serif for Computer Modern math font.
% \documentclass[serif, handout]{beamer} % Handout to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
\usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Introduction to Bayesian Statistics\footnote{
This slide show is an open-source document. See last slide for copyright information.}}
\subtitle{STA 2453: Winter 2016}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}
\frametitle{Thomas Bayes (1701--1761)}
\framesubtitle{Image from Wikipedia}
\begin{center}
\includegraphics[width=2.8in]{Thomas_Bayes}
\end{center}
\end{frame}

\section{Probability}

\begin{frame}
\frametitle{Bayes' Theorem}
%\framesubtitle{}
\begin{itemize}
\item Bayes' Theorem is about conditional probability.
\item It has statistical applications.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Conditional Probability}
%\framesubtitle{}
\begin{center}
\includegraphics[width=3in]{Venn}
\end{center}
\begin{displaymath}
P(A|B) = \frac{P(A\cap B)}{P(B)}
\end{displaymath}
\end{frame}

\begin{frame}
\frametitle{Multiplication Rule}\pause
%{\Large
From $P(A|B) = \frac{P(A\cap B)}{P(B)}$, \pause get $P(A\cap B) = P(A|B)P(B)$. \pause

\vspace{10mm}

From $P(B|A) = \frac{P(B\cap A)}{P(A)}$, \pause get $P(A\cap B) = P(B|A)P(A)$.
%} % End size
\end{frame}

\begin{frame}
\frametitle{Bayes' Theorem}
\framesubtitle{The most elementary version} \pause
\begin{center}
\includegraphics[width=1.5in]{Venn}
\end{center} \pause
\begin{eqnarray*}
P(A|B) & = & \frac{P(A\cap B)}{P(B)} \\ \pause
& = & \frac{P(A\cap B)}{P(A\cap B) + P(A^c\cap B)} \\ \pause
& = & \frac{P(B|A)P(A)}{P(B|A)P(A) + P(B|A^c)P(A^c)}
\end{eqnarray*}
\end{frame}

\begin{frame}
\frametitle{Define ``events'' in terms of random variables}
\framesubtitle{Instead of $A$, $B$, etc.} \pause
{\LARGE
\begin{displaymath}
P(Y=y|X=x) = \frac{P(X=x,Y=y)}{P(X=x)}
\end{displaymath}
} % End size
\end{frame}

\begin{frame}
\frametitle{For continuous random variables} \pause
We have conditional densities:
\vspace{10mm} \pause
%\framesubtitle{}
{\LARGE
\begin{displaymath}
f_{y|x}(y|x) = \frac{f_{xy}(x,y)}{f_x(x)}
\end{displaymath}
} % End size
\end{frame}

\begin{frame}
\frametitle{There are many versions of Bayes' Theorem} \pause
%\framesubtitle{}
For discrete random variables,
\vspace{10mm} \pause
{\Large
\begin{eqnarray*}
P(X=x|Y=y) & = & \frac{P(X=x,Y=y)}{P(Y=y)} \\ \pause
&& \\
& = & \frac{P(Y=y|X=x)P(X=x)}{\sum_t P(Y=y|X=t)P(X=t)}
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{For continuous random variables} \pause
%\framesubtitle{}
{\LARGE
\begin{eqnarray*}
f_{x|y}(x|y) & = & \frac{f_{xy}(x,y)}{f_y(y)} \\ \pause
&& \\
& = & \frac{f_{y|x}(y|x)f_x(x)}{\int f_{y|x}(y|t)f_x(t) \, dt}
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Compare}
%\framesubtitle{}
%{\LARGE
\begin{eqnarray*}
P(A|B) & = & \frac{P(B|A)P(A)}{P(B|A)P(A) + P(B|A^c)P(A^c)} \\ \pause
&& \\
P(X=x|Y=y) & = & \frac{P(Y=y|X=x)P(X=x)}{\sum_t P(Y=y|X=t)P(X=t)} \\ \pause
&& \\
f_{x|y}(x|y) & = & \frac{f_{y|x}(y|x)f_x(x)}{\int f_{y|x}(y|t)f_x(t) \, dt}
\end{eqnarray*}
%} % End size
\end{frame}
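\begin{frame}[fragile]
\frametitle{Bayes' Theorem with numbers}
\framesubtitle{A quick numerical check; the probabilities below are invented for illustration} \pause
With $P(A) = 0.01$, $P(B|A) = 0.98$ and $P(B|A^c) = 0.05$,
\begin{displaymath}
P(A|B) = \frac{P(B|A)P(A)}{P(B|A)P(A) + P(B|A^c)P(A^c)} \approx 0.165.
\end{displaymath} \pause
In R,
{\color{blue}
\begin{verbatim}
> 0.98*0.01 / (0.98*0.01 + 0.05*0.99)
\end{verbatim}
} % End color
\begin{verbatim}
[1] 0.1652614
\end{verbatim}
\end{frame}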
\section{Philosophy}

\begin{frame}
\frametitle{Philosophy}
\framesubtitle{Bayesian versus Frequentist} \pause
\begin{itemize}
\item What is probability? \pause
\item Probability is a formal axiomatic system (thank you, Mr. Kolmogorov). \pause
\item \emph{Of what} is probability a model?
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{\emph{Of what} is probability a model?}
\framesubtitle{Two answers} \pause
\begin{itemize}
\item Frequentist: Probability is long-run relative frequency. \pause
\item Bayesian: Probability is degree of subjective belief.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Statistical inference}
\framesubtitle{How it works} \pause
\begin{itemize}
\item Adopt a probability model for data $X$. \pause
\item The distribution of $X$ depends on a parameter $\theta$. \pause
\item Use the observed value $X=x$ to decide about $\theta$. \pause
\item Translate the decision into a statement about the process that generated the data. \pause
\item Bayesians and Frequentists agree so far, mostly. \pause
\item The description above is limited to what frequentists can do. \pause
\item Bayesian methods can generate more specific recommendations.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{What is a parameter?} \pause
%\framesubtitle{}
\begin{itemize}
\item To the frequentist, it is an unknown constant. \pause
\item To the Bayesian, since we don't know the value of the parameter, it's a random variable.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Unknown parameters are random variables}
\framesubtitle{To the Bayesian} \pause
\begin{itemize}
\item That's because probability is subjective belief. \pause
\item We model our uncertainty with a probability distribution, \pause $\pi(\theta)$. \pause
\item $\pi(\theta)$ is called the \emph{prior} distribution. \pause
\item Prior because it represents the statistician's belief about $\theta$ \emph{before} observing the data. \pause
\item The distribution of $\theta$ after seeing the data is called the \emph{posterior} distribution. \pause
\item The posterior is the conditional distribution of the parameter given the data.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Bayesian Inference} \pause
%\framesubtitle{}
\begin{itemize}
\item The model is $p(x|\theta)$ or $f(x|\theta)$. \pause
\item The prior distribution $\pi(\theta)$ is based on the best available information. \pause
\item But yours might be different from mine. It's subjective. \pause
\item Use Bayes' Theorem to obtain the posterior distribution $\pi(\theta|x)$. \pause
\item As the notation indicates, $\pi(\theta|x)$ might be the prior for the next experiment.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Subjectivity}
%\framesubtitle{}
\begin{itemize}
\item Subjectivity is the most frequent objection to Bayesian methods. \pause
\item The prior distribution influences the conclusions. \pause
\item Two scientists may arrive at different conclusions from the same data, \emph{based on the same statistical analysis}. \pause
\item The influence of the prior goes to zero as the sample size increases, \pause
\item for all but the most bone-headed priors \pause (illustration on the next slide).
\end{itemize}
\end{frame}
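\begin{frame}[fragile]
\frametitle{Same data, two different priors}
\framesubtitle{A rough sketch with invented data: 600 successes in $n=1000$ Bernoulli trials} \pause
The two posteriors are computed numerically on a grid, one from a flat prior and one from a prior that favours large $\theta$. \pause
{\color{blue}
\begin{verbatim}
theta = seq(0.001,0.999,by=0.001)        # Grid on (0,1)
like = dbinom(600,size=1000,prob=theta)  # Likelihood
prior1 = rep(1,length(theta))            # Flat prior
prior2 = 4*theta^3                       # Favours large theta
post1 = like*prior1/sum(like*prior1*0.001)
post2 = like*prior2/sum(like*prior2*0.001)
sum(theta*post1*0.001)  # Posterior mean, roughly 0.600
sum(theta*post2*0.001)  # Posterior mean, roughly 0.601
\end{verbatim}
} % End color
With this much data, the two posterior means barely differ.
\end{frame}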
\begin{frame}
\frametitle{Bayes' Theorem} \pause
\framesubtitle{Continuous case}
{\LARGE
\begin{eqnarray*}
\pi(\theta|x) & = & \frac{f(x|\theta)\pi(\theta)}{\int f(x|t) \pi(t) \, dt} \\ \pause
&& \\
& \propto & f(x|\theta)\pi(\theta)
\end{eqnarray*}
} % End size
\end{frame}

\begin{frame}
\frametitle{Once you have the posterior distribution, you can \ldots} \pause
%\framesubtitle{}
\begin{itemize}
\item Give a point estimate of $\theta$. \pause Why not $E(\theta|X=x)$? \pause
\item Test hypotheses, like $H_0: \theta \in H$. \pause
\item Reject $H_0$ if $P(\theta \in H|X=x)$ is small.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Example: Taste test}
\framesubtitle{Do consumers prefer the new blend of coffee beans?} \pause
\begin{itemize}
\item $n=100$ consumers taste coffee made from two blends of beans, the current one and a new one. \pause
\item Let $X_i=1$ if consumer $i$ picks the new blend, and $X_i=0$ otherwise. \pause
\item Model: $X_1, \ldots, X_n$ independent Bernoulli$(\theta)$, so that
\begin{displaymath}
p(x|\theta) = \theta^{\sum_{i=1}^nx_i} (1-\theta)^{n-\sum_{i=1}^nx_i}.
\end{displaymath} \pause
\item Prior: $\pi(\theta)$ is Beta$(\alpha,\beta)$, with $\alpha>0$ and $\beta>0$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Beta prior: $\pi(\theta) = \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)} \theta^{\alpha-1} (1-\theta)^{\beta-1}$}
%\framesubtitle{}
\begin{itemize}
\item Supported on $[0,1]$. \pause
\item $E(\theta) = \frac{\alpha}{\alpha+\beta}$ \pause
\item $Var(\theta) = \frac{\alpha\beta}{(\alpha+\beta)^2(\alpha+\beta+1)}$. \pause
\item Can assume a variety of shapes depending on $\alpha$ and $\beta$ (see the next slide). \pause
\item When $\alpha=\beta=1$, it's uniform. \pause
\item Bayes used a Bernoulli model and a uniform prior in his posthumous paper.
\end{itemize}
\end{frame}
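\begin{frame}[fragile]
\frametitle{Some Beta density shapes}
\framesubtitle{A quick sketch in R; these $(\alpha,\beta)$ values are chosen arbitrarily} \pause
{\color{blue}
\begin{verbatim}
theta = seq(0.001,0.999,by=0.001)
plot(theta,dbeta(theta,1,1),type='l',ylim=c(0,4),
     xlab=expression(theta),ylab="Density")
lines(theta,dbeta(theta,1/2,1/2),lty=2) # U-shaped
lines(theta,dbeta(theta,5,2),lty=3)     # Most mass near one
lines(theta,dbeta(theta,10,10),lty=4)   # Peaked at 1/2
\end{verbatim}
} % End color
Flat when $\alpha=\beta=1$, U-shaped when $\alpha=\beta=\frac{1}{2}$, and so on.
\end{frame}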
\begin{frame}
\frametitle{Posterior distribution} \pause
%\framesubtitle{}
%{\LARGE
\begin{eqnarray*}
\pi(\theta|x) & \propto & p(x|\theta)~\pi(\theta) \\ \pause
&& \\
&=& \theta^{\sum_{i=1}^nx_i} (1-\theta)^{n-\sum_{i=1}^nx_i} ~ \frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)} \theta^{\alpha-1} (1-\theta)^{\beta-1} \\ \pause
&& \\
& \propto & \theta^{(\alpha + \sum_{i=1}^nx_i) - 1} (1-\theta)^{(\beta + n-\sum_{i=1}^nx_i) - 1} \pause
\end{eqnarray*}
Proportional to the density of a Beta$(\alpha^\prime,\beta^\prime)$, with \pause
\begin{itemize}
\item[] $\alpha^\prime = \alpha + \sum_{i=1}^nx_i$
\item[] $\beta^\prime = \beta + n-\sum_{i=1}^nx_i$
\end{itemize} \pause
So that's it!
%} % End size
\end{frame}

\begin{frame}
\frametitle{Conjugate Priors} \pause
%\framesubtitle{}
\begin{itemize}
\item Prior was Beta$(\alpha,\beta)$. \pause
\item Posterior is Beta$(\alpha^\prime,\beta^\prime)$. \pause
\item Prior and posterior are in the same family of distributions. \pause
\item The Beta is a \emph{conjugate prior} for the Bernoulli model. \pause
\item Posterior was obtained by inspection. \pause
\item Conjugate priors are very convenient. \pause
\item There are conjugate priors for many models. \pause
\item There are also important models for which conjugate priors do not exist.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Picture of the posterior}
%\framesubtitle{}
{\footnotesize
Suppose 60 out of 100 consumers picked the new blend of coffee beans. \pause Posterior is Beta, with $\alpha^\prime = \alpha + \sum_{i=1}^nx_i = 61$ and $\beta^\prime = \beta + n-\sum_{i=1}^nx_i = 41$.
} % End size
\pause
\begin{center}
\includegraphics[width=2.5in]{Posterior}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Estimation}
%\framesubtitle{}
\begin{itemize}
\item Question: How should I estimate $\theta$? \pause
\item Answer to the question is another question: \pause What is your loss function? \pause
\item First, what is the decision space? \pause
\item $\mathcal{D} = (0,1)$, same as the parameter space. \pause
\item $d \in \mathcal{D}$ is a guess about the value of $\theta$. \pause
\item The loss function is up to you, but surely the more you are wrong, the more you lose. \pause
\item How about squared error loss?
\item $L(d,\theta) = k(d-\theta)^2$ \pause
\item We can omit the proportionality constant $k$.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Minimize expected loss}
\framesubtitle{$L(d,\theta) = (d-\theta)^2$} \pause
Denote $E(\theta|X=x)$ by $\mu$. \pause Then
\begin{eqnarray*}
E\left(L(d,\theta)|X=x\right) \pause & = & E\left((d-\theta)^2|X=x\right) \\ \pause
& = & E\left((d{\color{red}-\mu+\mu}-\theta)^2|X=x\right) \\ \pause
& = & \ldots \\ \pause
& = & E\left((d-\mu)^2|X=x\right) + E\left((\theta-\mu)^2|X=x\right) \\ \pause
& = & (d-\mu)^2 \pause + Var(\theta|X=x) \pause
\end{eqnarray*}
\begin{itemize}
\item Minimal when $d = \mu = E(\theta|X=x)$, the posterior mean. \pause
\item This was general. \pause
\item The Bayes estimate under squared error loss is the posterior mean.
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Back to the example}
\framesubtitle{Give the Bayes estimate of $\theta$ under squared error loss.} \pause
Posterior distribution of $\theta$ is Beta, with $\alpha^\prime = \alpha + \sum_{i=1}^nx_i = 61$ and $\beta^\prime = \beta + n-\sum_{i=1}^nx_i = 41$. \pause
%{\footnotesize % or scriptsize
{\color{blue}
\begin{verbatim}
> 61/(61+41)
\end{verbatim}
} % End color
\begin{verbatim}
[1] 0.5980392
\end{verbatim}
%} % End size
\end{frame}
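\begin{frame}[fragile]
\frametitle{Checking the posterior mean numerically}
\framesubtitle{A side check with R's \texttt{integrate}, which approximates $\int_0^1 t \, \pi(t|x) \, dt$} \pause
{\color{blue}
\begin{verbatim}
integrate(function(t) t*dbeta(t,61,41), 0, 1)
# Returns approximately 0.5980392
\end{verbatim}
} % End color
Agrees with $\alpha^\prime/(\alpha^\prime+\beta^\prime) = 61/102$.
\end{frame}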
\begin{frame}
\frametitle{Hypothesis Testing}
\framesubtitle{$\theta>\frac{1}{2}$ means consumers tend to prefer the new blend of coffee.}
Test $H_0:\theta\leq \theta_0$ versus $H_1:\theta> \theta_0$. \pause
\begin{itemize}
\item What is the loss function? \pause
\item When you are wrong, you lose. \pause
\item Try zero-one loss. \pause
\end{itemize}
\begin{center}
\begin{tabular}{|c|c|c|} \hline
 & \multicolumn{2}{c|}{Loss $L(d_j,\theta)$} \\ \hline
Decision & When $\theta\leq \theta_0$ & When $\theta> \theta_0$ \\ \hline
$d_0: \theta\leq \theta_0$ & 0 & 1 \\ \hline
$d_1: \theta> \theta_0$ & 1 & 0 \\ \hline
\end{tabular}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Compare expected loss for $d_0$ and $d_1$} \pause
%\framesubtitle{$\theta>\frac{1}{2}$ means consumers tend to prefer the new blend of coffee.}
\begin{center}
\begin{tabular}{|c|c|c|} \hline
 & \multicolumn{2}{c|}{Loss $L(d_j,\theta)$} \\ \hline
Decision & When $\theta\leq \theta_0$ & When $\theta>\theta_0$ \\ \hline
$d_0: \theta\leq \theta_0$ & 0 & 1 \\ \hline
$d_1: \theta> \theta_0$ & 1 & 0 \\ \hline
\end{tabular}
\end{center} \pause
Note $L(d_0,\theta) = I(\theta>\theta_0)$ and $L(d_1,\theta) = I(\theta\leq\theta_0)$. \pause
\begin{eqnarray*}
E\left( I(\theta>\theta_0)|X=x\right) & = & P\left( \theta>\theta_0|X=x\right) \\ \pause
E\left( I(\theta\leq\theta_0)|X=x\right) & = & P\left( \theta\leq\theta_0|X=x\right) \pause
\end{eqnarray*}
\begin{itemize}
\item Choose the decision with the smaller posterior probability of being wrong. \pause
\item Equivalently, reject $H_0$ if $P(H_0|X=x) < \frac{1}{2}$.
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Back to the example}
\framesubtitle{Decide between $H_0:\theta\leq 1/2$ and $H_1:\theta> 1/2$ under zero-one loss.} \pause
Posterior distribution of $\theta$ is Beta, with $\alpha^\prime = \alpha + \sum_{i=1}^nx_i = 61$ and $\beta^\prime = \beta + n-\sum_{i=1}^nx_i = 41$. \pause

\vspace{10mm}

Want $P(\theta>\frac{1}{2}|X=x)$. \pause
%{\footnotesize % or scriptsize
{\color{blue}
\begin{verbatim}
> 1 - pbeta(1/2,61,41) # P(theta > theta0|X=x)
\end{verbatim}
} % End color
\begin{verbatim}
[1] 0.976978
\end{verbatim}
%} % End size
\end{frame}

\begin{frame}
\frametitle{How much worse is a Type I error?}
\begin{center}
\begin{tabular}{|c|c|c|} \hline
 & \multicolumn{2}{c|}{Loss $L(d_j,\theta)$} \\ \hline
Decision & When $\theta\leq \theta_0$ & When $\theta> \theta_0$ \\ \hline
$d_0: \theta\leq \theta_0$ & 0 & 1 \\ \hline
$d_1: \theta> \theta_0$ & $k$ & 0 \\ \hline
\end{tabular}
\end{center} \pause
To conclude $H_1$, the posterior probability of $H_1$ must be at least $k$ times as big as the posterior probability of $H_0$. \pause $k=19$ is attractive. \pause

\vspace{5mm}

A realistic loss function for the taste test would be more complicated.
\end{frame}

\section{Computation}

\begin{frame}
\frametitle{Computation}
%\framesubtitle{}
\begin{itemize}
\item Inference will be based on the posterior. \pause
\item Must be able to calculate $E(g(\theta)|X=x)$. \pause
\item For example, $E(L(d,\theta)|X=x)$. \pause
\item Or at least
\begin{displaymath}
\int L(d,\theta) f(x|\theta)\pi(\theta) \, d\theta.
\end{displaymath} \pause
\item If $\theta$ is of low dimension, numerical integration usually works. \pause
\item For high dimension, it can be tough.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Monte Carlo Integration to get $E(g(\theta)|X=x)$}
\framesubtitle{Based on simulation from the posterior} \pause
Sample $\theta_1, \ldots, \theta_m$ independently from the posterior distribution \pause and calculate
{\LARGE
\begin{displaymath}
\frac{1}{m} \sum_{j=1}^m g(\theta_j) \pause \stackrel{a.s.}{\rightarrow} E(g(\theta)|X=x)
\end{displaymath}
} % End size
\begin{itemize}
\item[] By the Law of Large Numbers. \pause
\item[] A large-sample confidence interval is helpful.
\end{itemize}
\end{frame}
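\begin{frame}[fragile]
\frametitle{Monte Carlo integration for the coffee example}
\framesubtitle{A small sketch; the simulation size $m=100{,}000$ and the seed are arbitrary} \pause
Sample from the Beta$(61,41)$ posterior and average. \pause
{\color{blue}
\begin{verbatim}
set.seed(9999)         # Arbitrary seed
thetasim = rbeta(100000,61,41)
mean(thetasim)         # Approx 0.598 = E(theta|X=x)
mean(thetasim > 1/2)   # Approx 0.977 = P(theta>1/2|X=x)
\end{verbatim}
} % End color
Close to the exact values $0.5980392$ and $0.976978$ obtained earlier.
\end{frame}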
\begin{frame}
\frametitle{Sometimes it's Hard}
%\framesubtitle{}
\begin{itemize}
\item If the posterior is a familiar distribution (and you know what it is), simulating values from the posterior should be routine. \pause
\item If the posterior is unknown or very unfamiliar, it's a challenge.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The Gibbs Sampler}
\framesubtitle{Geman and Geman (1984)} \pause
\begin{itemize}
\item $\theta = (\theta_1, \ldots, \theta_k)$ is a random vector with a (posterior) joint distribution. \pause
\item It is relatively easy to sample from the conditional distribution of each component given the others. \pause
\item Algorithm, \pause say for $\theta = (\theta_1,\theta_2,\theta_3)$: \linebreak \pause
First choose starting values of $\theta_2$ and $\theta_3$ somehow. Then, \pause
\begin{itemize}
\item Sample from the conditional distribution of $\theta_1$ given $\theta_2$ and $\theta_3$. \pause Set $\theta_1$ to the resulting number. \pause
\item Sample from the conditional distribution of $\theta_2$ given $\theta_1$ and $\theta_3$. \pause Set $\theta_2$ to the resulting number. \pause
\item Sample from the conditional distribution of $\theta_3$ given $\theta_1$ and $\theta_2$. \pause Set $\theta_3$ to the resulting number. \pause
\end{itemize}
Repeat.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Output}
\begin{itemize}
\item The Gibbs sampler produces a sequence of random $(\theta_1,\theta_2,\theta_3)$ vectors. \pause
\item Each one depends on the past only through the most recent one. \pause
\item It's a Markov process. \pause
\item Under technical conditions (ergodicity), it has a stationary distribution that is the desired (posterior) distribution. \pause
\item Stationarity is a limiting concept (as the number of iterations $\rightarrow \infty$). \pause
\item In practice, a ``burn-in'' period is used. \pause
\item The random vectors are sequentially dependent. \pause
\item Time series diagnostics may be helpful. \pause
\item Retain one parameter vector every ``$n$'' iterations, and discard the rest.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:

\vspace{5mm}

\href{http://www.utstat.toronto.edu/~brunner/oldclass/2453y15-16}
{\small\texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/2453y15-16}}

\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Coffee posterior
alpha=1; beta = 1
n = 100; sumx = 60
a = alpha+sumx; b = beta + n - sumx
theta = seq(from=0,to=1,by=0.005)
Density = dbeta(theta,a,b)
posterior = expression(paste(pi,"(",theta,"|x)",sep=""))
plot(theta,Density,type='l',xlab = expression(theta),ylab = posterior)
title("Posterior Density")
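# Toy Gibbs sampler, just to illustrate the algorithm on the Gibbs Sampler
# slide. The target is an invented example: a bivariate standard normal with
# correlation rho, for which theta1|theta2 ~ N(rho*theta2, 1-rho^2) and
# theta2|theta1 ~ N(rho*theta1, 1-rho^2).
rho = 0.9; m = 10000
theta1 = numeric(m); theta2 = numeric(m)
t2 = 0                                               # Starting value
for(j in 1:m)
    {
    t1 = rnorm(1, mean = rho*t2, sd = sqrt(1-rho^2)) # theta1 | theta2
    t2 = rnorm(1, mean = rho*t1, sd = sqrt(1-rho^2)) # theta2 | theta1
    theta1[j] = t1; theta2[j] = t2
    }
cor(theta1,theta2)    # Should be close to rho = 0.9
plot(theta1,theta2)   # Cloud of sampled (theta1,theta2) pairs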