% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{AnnArbor} % Blue and yellow, shows current section title
% \usetheme{Berlin} % Displays sections on top
\usepackage[english]{babel}
% \definecolor{links}{HTML}{2A1B81}
\setbeamertemplate{footline}[frame number]
% \mode<presentation>{\setbeamercolor{background canvas}{bg=black!5}}

\title{Proportional Hazards Regression\footnote{See last slide for copyright information.}}
\subtitle{STA312 Spring 2019}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Background Reading}
%\framesubtitle{}
Chapter 5 in \emph{Applied Survival Analysis Using R} by Dirk Moore
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Overview}
\tableofcontents
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Model}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Proportional Hazards} \pause
%\framesubtitle{}
\begin{itemize}
\item Suppose two individuals have different $\mathbf{x}$ vectors of explanatory variable values. \pause
\item They have different hazard functions because their $\lambda$ values are different. \pause
\item But the \emph{hazard ratio} $\displaystyle \frac{h_1(t)}{h_2(t)}$ \pause does not depend on time $t$. \pause
\item Exponential regression and Weibull regression fit this pattern. \pause
\item Proportional hazards regression is a generalization.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Proportional Hazards Regression}
\framesubtitle{Also called Cox regression after Sir David Cox} \pause

Write the hazard function
{\LARGE
\begin{eqnarray*}
h_i(t|\boldsymbol{\beta}) & = & h_0(t) \, \psi_i(\boldsymbol{\beta}) \\ \pause
& = & h_0(t) \, e^{\mathbf{x}_i^\top \boldsymbol{\beta}} \pause
\end{eqnarray*}
} % End size
\begin{itemize}
\item $h_0(t)$ is called the \emph{baseline hazard function}. \pause
\item Baseline because it's the hazard function when $\psi_i(\boldsymbol{\beta}) = 1$. \pause
\item Maybe the patient is in the reference category, and the quantitative explanatory variables are centered. \pause
\item In theory, $\psi_i(\boldsymbol{\beta})$ could be almost anything, as long as the resulting hazard function is positive. \pause
\item But in practice it's almost always $e^{\mathbf{x}_i^\top \boldsymbol{\beta}}$, Cox's original suggestion.
\end{itemize}
\end{frame}
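%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{A Sketch in R}
\framesubtitle{Not part of the original development}
As a minimal sketch (assuming the \texttt{survival} package and its \texttt{lung} data set, which are not discussed in these slides), fitting a proportional hazards model in R looks like this:
{\footnotesize
\begin{verbatim}
library(survival)  # Provides Surv(), coxph() and the lung data
# Fits h_i(t) = h_0(t) exp(beta_1 age + beta_2 sex)
fit <- coxph(Surv(time, status) ~ age + sex, data = lung)
summary(fit)  # Coefficients, exp(coef), Wald/score/LR tests
\end{verbatim}
} % End size
\end{frame}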
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Exponential and Weibull Regression}
\framesubtitle{$h_i(t|\boldsymbol{\beta}) = h_0(t) \, \psi_i(\boldsymbol{\beta}) = h_0(t) \, e^{\mathbf{x}_i^\top \boldsymbol{\beta}}$} \pause
\begin{itemize}
\item Exponential regression: $h_i(t|\boldsymbol{\beta}) = \lambda = e^{-\mathbf{x}_i^\top \boldsymbol{\beta}}$ \pause
\begin{itemize}
\item $h_0(t)=1$
\item $\psi_i(\boldsymbol{\beta}) = e^{-\mathbf{x}_i^\top \boldsymbol{\beta}}$
\end{itemize} \pause
\item Weibull regression: $h_i(t|\boldsymbol{\beta}) = \frac{1}{\sigma} \exp\{-\frac{1}{\sigma}\mathbf{x}_i^\top \boldsymbol{\beta}\} \, t^{\frac{1}{\sigma}-1}$ \pause
\begin{itemize}
\item $h_0(t)=\frac{1}{\sigma}t^{\frac{1}{\sigma}-1}$
\item $\psi_i(\boldsymbol{\beta}) = \exp\{-\frac{1}{\sigma}\mathbf{x}_i^\top \boldsymbol{\beta}\}$
\end{itemize} \pause
\item Are these really special cases of the proportional hazards model, with $\psi_i(\boldsymbol{\beta}) = e^{\mathbf{x}_i^\top \boldsymbol{\beta}}$? \pause
\item Yes, by a re-parameterization. \pause $\beta_j$ of proportional hazards $= -\beta_j$ of exponential regression. \pause
\item $\beta_j$ of proportional hazards $= -\beta_j/\sigma$ of Weibull regression. \pause
\item The main implication is that in proportional hazards regression, the coefficients mean the opposite of what you are used to. \pause
\item Anything that makes $\mathbf{x}_i^\top \boldsymbol{\beta}$ bigger will increase the hazard, and make the chances of survival \emph{smaller}.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The Hazard Ratio} \pause
%\framesubtitle{}
Form a ratio of hazard functions. \pause In the numerator, increase $x_{i,k}$ by one unit while holding all other $x_{i,j}$ values constant. \pause
\begin{eqnarray*}
\frac{h_1(t)}{h_2(t)} & = & \frac{h_0(t) \exp\{\beta_0 + \beta_1 x_{i,1} + \cdots + \beta_k (x_{i,k}+1) + \cdots + \beta_{p-1} x_{i,p-1} \}}
{h_0(t) \exp\{\beta_0 + \beta_1 x_{i,1} + \cdots + \beta_k x_{i,k} + \cdots + \beta_{p-1} x_{i,p-1} \}} \\ \pause
& = & e^{\beta_k}
\end{eqnarray*} \pause
\begin{itemize}
\item Holding the other $x_{i,j}$ values constant is the meaning of ``controlling'' for explanatory variables. \pause
\item If $\beta_k>0$, increasing $x_{i,k}$ increases the hazard. \pause
\item If $\beta_k<0$, increasing $x_{i,k}$ decreases the hazard.
\end{itemize}
\end{frame}
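%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{A Sketch in R: Estimated Hazard Ratios}
Continuing the sketch with the assumed \texttt{lung} example (not part of the original slides), the estimated hazard ratio $e^{\widehat{\beta}_k}$ for a one-unit increase in $x_{i,k}$, holding the other variables constant, is
{\footnotesize
\begin{verbatim}
library(survival)
fit <- coxph(Surv(time, status) ~ age + sex, data = lung)
exp(coef(fit))     # Hazard ratios e^{beta-hat_k}
exp(confint(fit))  # Confidence intervals for the hazard ratios
\end{verbatim}
} % End size
A value greater than one means increasing that variable increases the hazard; a value less than one means it decreases the hazard.
\end{frame}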
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Semi-parametric}
\framesubtitle{$h_i(t|\boldsymbol{\beta}) = h_0(t) \, e^{\mathbf{x}_i^\top \boldsymbol{\beta}}$} \pause
\begin{itemize}
\item The unknown quantities in the model are the vector of regression parameters $\boldsymbol{\beta}$, \pause and the unknown baseline hazard function $h_0(t)$. \pause
\item We can avoid making any assumptions about $h_0(t)$. \pause
\item But because of $\boldsymbol{\beta}$, the model is at least partly parametric.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Estimation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Estimation: Using Ideas From Kaplan-Meier} \pause
%\framesubtitle{}
\begin{itemize}
\item As in the Kaplan-Meier estimate, we focus on the uncensored observations, for which the failure time is known. \pause
\item The censored observations will have their influence by disappearing from the set of individuals at risk. \pause
\item There are $D = \sum_{i=1}^n \delta_i$ uncensored observations. \pause
\item Denote the ordered times at which failures occur by $t_1, \ldots, t_D$. \pause
\item This notation can be confusing, because the entire set of times, including censoring times, is usually denoted $t_1, \ldots, t_n$. \pause
\item Some books (for example Chapter 3 in \emph{Applied Survival Analysis} by Hosmer and Lemeshow, available from \href{https://b-ok.org}{\texttt{https://b-ok.org}}) use the notation $t_{(1)}, \ldots, t_{(D)}$ for the ordered failure times. \pause
\item The index set of individuals at risk at failure time $t_j$ is $R_j$. \pause
\item One of them fails.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Hazard} \pause
%\framesubtitle{}
\begin{itemize}
\item The hazard function $h(t_j) = \lim_{\Delta \rightarrow 0} \frac{P\left(t_j \leq T < t_j + \Delta \,|\, T \geq t_j\right)}{\Delta}$ \pause is roughly proportional to the probability of failure at time $t_j$, given survival to that point.
\item Make it an actual probability. \pause Normalize it, dividing by the total hazard of all the individuals at risk: \pause
\begin{displaymath}
q_i = 1-p_i = \frac{h_0(t_i) \, e^{\mathbf{x}_i^\top \boldsymbol{\beta}}}
{\displaystyle \sum_{j \in R_i} h_0(t_i) \, e^{\mathbf{x}_j^\top \boldsymbol{\beta}}} \pause
= \frac{e^{\mathbf{x}_i^\top \boldsymbol{\beta}}}
{\displaystyle \sum_{j \in R_i} e^{\mathbf{x}_j^\top \boldsymbol{\beta}}}
\end{displaymath} \pause
\item Notice that the baseline hazard cancels. \pause
\item These really are like the $p_i$ and $q_i$ in Kaplan-Meier estimation. \pause
\item Except that instead of dividing by the \emph{number} of individuals at risk, \pause the individuals at risk are weighted by their hazards. \pause
\item And those hazards depend on the explanatory variable values through $\boldsymbol{\beta}$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Estimating $\boldsymbol{\beta}$} \pause
%\framesubtitle{}
Now we have failure probabilities $q_i = \frac{e^{\mathbf{x}_i^\top \boldsymbol{\beta}}}
{\displaystyle \sum_{j \in R_i} e^{\mathbf{x}_j^\top \boldsymbol{\beta}}}$. \pause

\vspace{2mm}
How can these be used to estimate $\boldsymbol{\beta}$? \pause Cox suggested \pause
\begin{itemize}
\item Multiply them together and treat them as a likelihood. \pause
\item Take minus the log, and minimize (a brute-force sketch in R follows). \pause
\item He suggested that all the usual likelihood theory should hold. \pause
\item Fisher information, asymptotic normality, likelihood ratio tests: everything. \pause
\item He called it \emph{partial} likelihood. \pause
\item \textbf{Why?!}
\end{itemize}
\end{frame}
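%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{A Sketch in R: Cox's Recipe by Brute Force}
Before answering, here is Cox's recipe carried out directly, on simulated data. This is only a sketch, assuming no tied failure times (with no ties, \texttt{coxph} maximizes the same function), and the censoring mechanism is deliberately crude.
{\scriptsize
\begin{verbatim}
library(survival)
# Minus log of prod q_i: minus the sum over failures of
#   x_i' beta - log( sum_{j in R_i} exp(x_j' beta) )
nlpl <- function(beta, time, status, X) {
  eta <- as.vector(X %*% beta)
  -sum(sapply(which(status == 1), function(i)
    eta[i] - log(sum(exp(eta[time >= time[i]])))))
}
set.seed(1); n <- 100; x <- rnorm(n)
t <- rexp(n, rate = exp(0.5 * x))  # True beta = 0.5
d <- rbinom(n, 1, 0.8)             # Crude random censoring flags
optim(0, nlpl, time = t, status = d, X = matrix(x),
      method = "BFGS")$par
coef(coxph(Surv(t, d) ~ x))        # Should agree closely
\end{verbatim}
} % End size
\end{frame}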
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Partial Likelihood} \pause
%\framesubtitle{}
Using $h(t) = \frac{f(t)}{S(t)}$,
\begin{eqnarray*}
L(\theta) & = & \prod_{i=1}^n f(t_i|\theta)^{\delta_i} \, S(t_i|\theta)^{1-\delta_i} \\ \pause
& = & \prod_{i=1}^n \left( h(t_i|\theta) S(t_i|\theta) \right)^{\delta_i} \, S(t_i|\theta)^{1-\delta_i} \\ \pause
& = & \prod_{i=1}^n h(t_i|\theta)^{\delta_i} \, S(t_i|\theta)^{\delta_i+1-\delta_i} \\ \pause
& = & \prod_{i=1}^n h(t_i|\theta)^{\delta_i} \, S(t_i|\theta) \\ \pause
& = & \prod_{i=1}^D h(t_{(i)}|\theta) \prod_{i=1}^n S(t_i|\theta)
\end{eqnarray*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Continuing the likelihood calculation} \pause
%\framesubtitle{}
\begin{eqnarray*}
L(\theta) & = & \prod_{i=1}^D h(t_{(i)}|\theta) \prod_{i=1}^n S(t_i|\theta) \\ \pause
& = & \prod_{i=1}^D h_0(t_{(i)}) e^{\mathbf{x}_{(i)}^\top \boldsymbol{\beta}} \prod_{i=1}^n S(t_i|\boldsymbol{\beta},h_0) \\ \pause
& = & \frac{\displaystyle \prod_{i=1}^D h_0(t_{(i)}) e^{\mathbf{x}_{(i)}^\top \boldsymbol{\beta}}}
{\displaystyle \prod_{i=1}^D \sum_{j \in R_{(i)}} h_0(t_{(i)}) e^{\mathbf{x}_j^\top \boldsymbol{\beta}}}
\left(\prod_{i=1}^D \sum_{j \in R_{(i)}} h_0(t_{(i)}) e^{\mathbf{x}_j^\top \boldsymbol{\beta}} \right)
\prod_{i=1}^n S(t_i|\boldsymbol{\beta},h_0) \\ \pause
& = & \prod_{i=1}^D \frac{\displaystyle h_0(t_{(i)}) e^{\mathbf{x}_{(i)}^\top \boldsymbol{\beta}}}
{\displaystyle \sum_{j \in R_{(i)}} h_0(t_{(i)}) e^{\mathbf{x}_j^\top \boldsymbol{\beta}}}
\left(\prod_{i=1}^D \sum_{j \in R_{(i)}} h_0(t_{(i)}) e^{\mathbf{x}_j^\top \boldsymbol{\beta}} \right)
\prod_{i=1}^n S(t_i|\boldsymbol{\beta},h_0)
\end{eqnarray*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Partial Likelihood}
%\framesubtitle{}
\begin{displaymath}
L(\boldsymbol{\beta},h_0) = {\color{red}
\prod_{i=1}^D \left( \frac{\displaystyle e^{\mathbf{x}_{(i)}^\top \boldsymbol{\beta}}}
{\displaystyle \sum_{j \in R_{(i)}} e^{\mathbf{x}_j^\top \boldsymbol{\beta}}} \right)
} % End color
\left(\prod_{i=1}^D \sum_{j \in R_{(i)}} h_0(t_{(i)}) e^{\mathbf{x}_j^\top \boldsymbol{\beta}} \right)
\prod_{i=1}^n S(t_i|\boldsymbol{\beta},h_0)
\end{displaymath} \pause
\begin{itemize}
\item The red product is Cox's partial likelihood. \pause
\item Properties similar to those of ordinary likelihood were proved years later. \pause
\item There are fairly convincing arguments that for large samples, the information about $\boldsymbol{\beta}$ in the black terms is negligible. \pause
\item Lack of dependence on the baseline hazard is a good feature. \pause
\item This is the state of the art.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Hypothesis Tests} \pause
%\framesubtitle{}
As Cox hypothesized, all the usual likelihood theory applies to partial likelihood. \pause
\begin{itemize}
\item Consistency (i.e., large-sample accuracy) \pause
\item Asymptotic normality \pause
\item Fisher information \pause
\item $Z$-tests \pause
\item Wald tests \pause
\item Score tests \pause
\item Likelihood ratio tests \pause
\item Call them \emph{partial} likelihood ratio tests.
\end{itemize}
\end{frame}
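%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{A Sketch in R: A Partial Likelihood Ratio Test}
A sketch of a full-versus-reduced comparison, again assuming the \texttt{lung} example: \texttt{anova} on two nested \texttt{coxph} fits gives the partial likelihood ratio test.
{\footnotesize
\begin{verbatim}
library(survival)
fit0 <- coxph(Surv(time, status) ~ sex, data = lung)        # Reduced
fit1 <- coxph(Surv(time, status) ~ sex + age, data = lung)  # Full
anova(fit0, fit1)  # Partial likelihood ratio chi-squared test
summary(fit1)      # Also reports Wald, score and LR tests
\end{verbatim}
} % End size
\end{frame}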
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Estimating the Survival Function: Background}
\framesubtitle{Using $H(t) = \int_0^t h(x) \, dx$ and $S(t) = e^{-H(t)}$} \pause
\begin{itemize}
\item Proportional hazards says $h_i(t|\boldsymbol{\beta}) = h_0(t) \, e^{\beta_0 + \mathbf{x}_i^\top \boldsymbol{\beta}}$. \pause
\item This makes it clear that $h_0(t) \, e^{\beta_0}$ cancels in the numerator and denominator of the partial likelihood. \pause
\item $h_0(t)$ is the hazard function when all explanatory variable values are zero \emph{and} $\beta_0=0$. \pause
\item $H_0(t) = \int_0^t h_0(x) \, dx$ is the baseline cumulative hazard function. \pause
\item $S_0(t) = e^{-H_0(t)}$ is the baseline survival function. \pause
\item With a little work we can show $S(t) = S_0(t)^{\exp\{\beta_0 + \mathbf{x}_i^\top \boldsymbol{\beta} \}}$. \pause
\item This could be written $S(t|\mathbf{x}_i)$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The original version had something that was plain wrong. I had trouble understanding this.
\begin{frame}
\frametitle{Estimation (Cox and Oakes, 1984, p.~108)}
\framesubtitle{Using $S_0(t) = e^{-H_0(t)}$ and $S(t|\mathbf{x}_i) = S_0(t)^{\exp\{\beta_0 + \mathbf{x}_i^\top \boldsymbol{\beta} \}}$} \pause
Cox suggested $H_0(t) \approx \displaystyle \sum_{t_{(i)} < t} \frac{d_{(i)}}
{\displaystyle \sum_{j \in R_{(i)}} e^{\beta_0 + \mathbf{x}_j^\top \boldsymbol{\beta}}}$. \pause
Multiplying both sides by $e^{\beta_0}$\pause, which is invisible in Cox's argument\pause, we arrive at \pause
\begin{displaymath}
e^{\widehat{\beta}_0} \widehat{H}_0(t) = \sum_{t_{(i)} < t} \frac{d_{(i)}}
{\displaystyle \sum_{j \in R_{(i)}} e^{\mathbf{x}_j^\top \widehat{\boldsymbol{\beta}}}}
\end{displaymath} \pause
Then, $e^{-\widehat{H}_0(t) \, e^{\widehat{\beta}_0}} = \widehat{S}_0(t)^{e^{\widehat{\beta}_0}}$. \pause
Raise that to the power $e^{\mathbf{x}_i^\top \widehat{\boldsymbol{\beta}}}$, and get \pause
{\LARGE
\begin{displaymath}
\widehat{S}_0(t)^{e^{\widehat{\beta}_0 + \mathbf{x}_i^\top \widehat{\boldsymbol{\beta}}}} \pause
= \widehat{S}(t|\mathbf{x}_i)
\end{displaymath}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{It works} \pause
%\framesubtitle{}
\begin{itemize}
\item As usual, later work clarified matters and eliminated most of the guesswork. \pause
\item Cox's estimate of $S(t)$ is shown to arise from Breslow's method of approximating the partial likelihood when there are ties. \pause
\item There are several other estimates, all yielding results that are pretty close. \pause
\item In every case, $\beta_0$ is there, but usually it's invisible.
\end{itemize}
\end{frame}
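%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{A Sketch in R: Estimating $S(t|\mathbf{x}_i)$}
A final sketch under the same assumed \texttt{lung} example: \texttt{survfit} applied to a \texttt{coxph} fit gives $\widehat{S}(t|\mathbf{x}_i)$ for specified explanatory variable values, and \texttt{basehaz} gives the estimated cumulative hazard.
{\footnotesize
\begin{verbatim}
library(survival)
fit <- coxph(Surv(time, status) ~ age + sex, data = lung)
# Estimated survival curve for age 60, sex = 2 (female)
sf <- survfit(fit, newdata = data.frame(age = 60, sex = 2))
summary(sf, times = c(100, 365))  # S-hat(t|x) at 100 and 365 days
head(basehaz(fit))  # Cumulative hazard (mean covariates by default)
plot(sf, xlab = "Days", ylab = "Estimated survival probability")
\end{verbatim}
} % End size
\end{frame}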
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/312s19}{\footnotesize \texttt{http://www.utstat.toronto.edu/$^\sim$brunner/oldclass/312s19}}

\end{frame}

\end{document}