% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
\usetheme{AnnArbor} % CambridgeUS Blue and yellow, shows current section title
% \usetheme{Berlin} % Blue: Displays section titles on top
% \usetheme{Frankfurt} % Displays section titles on top: Fairly thin, but still swallows some material at the bottom of crowded slides
\usepackage[english]{babel}
\usepackage{comment}
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
% \mode<handout>{\setbeamercolor{background canvas}{bg=black!5}}

\title{The Kaplan-Meier (Product Limit) Estimate\footnote{See last slide for copyright information.}}
\subtitle{STA312 Fall 2023}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The Kaplan-Meier Estimate}
\framesubtitle{Reference: Chapter 3 in \emph{Applied Survival Analysis Using R}}
\begin{itemize}
\item Objective: To estimate the survival function without making any assumptions about the distribution of survival time. \pause
\item If there were no censoring, it would be easy.
\item Use the empirical distribution function: the proportion of observations less than or equal to $t$.
\begin{displaymath}
\widehat{F}_n(t) = \frac{1}{n} \sum_{i=1}^n I\{t_i \leq t \}
\end{displaymath} \pause
\item Then let $\widehat{S}_n(t) = 1-\widehat{F}_n(t)$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
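% A minimal computational sketch of the no-censoring case on the slide above:
% estimate S(t) by one minus the empirical CDF. The data vector tt below is
% made up purely for illustration.
\begin{comment}
tt = c(2, 5, 6, 6, 9, 12)        # Hypothetical failure times, no censoring
Fhat = ecdf(tt)                  # Empirical CDF: proportion of t_i <= t
Shat = function(t) 1 - Fhat(t)   # Estimated survival function
Shat(6)                          # Proportion surviving past t = 6: 1 - 4/6 = 1/3
\end{comment}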
\begin{frame}
\frametitle{Discrete Time}
\framesubtitle{Maybe time is always discrete in practice}
\pause
\begin{itemize}
\item Consider times $t_0=0, t_1, t_2, \dots$, maybe minutes or days. \pause
\item Let $q_j = $ the probability of failing at time $t_j$, given survival to time $t_{j-1}$.
\item This is the \emph{idea} behind the hazard function. \pause
\item $p_j = 1-q_j = $ the probability of surviving past time $t_j$, given survival past time $t_{j-1}$. \pause
\end{itemize}
\begin{eqnarray*}
p_j & = & P(T>t_j \,|\, T>t_{j-1}) \\ \pause
& = & \frac{P(T>t_j, \, T>t_{j-1})}{P(T>t_{j-1})} \\ \pause
& = & \frac{P(T>t_j)}{P(T>t_{j-1})} \\ \pause
& = & \frac{S(t_j)}{S(t_{j-1})}
\end{eqnarray*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$p_j = \frac{S(t_j)}{S(t_{j-1})}$}
\framesubtitle{Probability of surviving past time $t_j$, given survival past time $t_{j-1}$}
\pause
With $S(t_0) = S(0)=1$, \pause
\begin{itemize}
\item $p_1 = \frac{S(t_1)}{S(t_0)} = \frac{S(t_1)}{1} = S(t_1)$ \pause
\item $p_2 = \frac{S(t_2)}{S(t_1)}$ \pause
\item $p_3 = \frac{S(t_3)}{S(t_2)}$ \pause
\item Continuing \ldots
\item $p_k = \frac{S(t_k)}{S(t_{k-1})}$ \pause
\end{itemize}
Then,
\begin{eqnarray*}
& & ~p_1 ~~~~~ p_2 ~~~~~ p_3 ~~ \cdots ~~~p_k \\ \pause
& = & S(t_1) \frac{S(t_2)}{S(t_1)} \frac{S(t_3)}{S(t_2)} \cdots \frac{S(t_k)}{S(t_{k-1})} \\ \pause
& = & S(t_k)
\end{eqnarray*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$\displaystyle S(t_k) = \prod_{j=1}^k p_j$}
%\framesubtitle{}
Estimate $S(t_k)$ by estimating the $p_j$. \pause
\begin{itemize}
\item Let $d_j$ be the number of deaths at time $t_j$. \pause
\item Let $n_j$ be the number of individuals at risk just before time $t_j$. \pause
\item Anyone censored before time $t_j$ is no longer at risk. \pause
\item The estimated probability of failure at time $t_j$ is $\widehat{q}_j = \frac{d_j}{n_j}$. \pause
\vspace{2mm}
\end{itemize}
{\large
\begin{eqnarray*}
\widehat{p}_j & = & 1-\widehat{q}_j \pause = \frac{n_j-d_j}{n_j} \\ \pause
\widehat{S}(t_k) & = & \prod_{j=1}^k \widehat{p}_j \\ \pause
\widehat{S}(t) & = & \prod_{t_j \leq t} \widehat{p}_j
\end{eqnarray*}
} % End size
\end{frame}
% At this point I could give a computational illustration of n_j, d_j, p_j.
\begin{comment}
rm(list=ls()); options(scipen=999)
wdata = read.table("http://www.utstat.utoronto.ca/brunner/data/legal/Weibull.data2.txt")
head(wdata)
Time = wdata$Time; Uncensored = wdata$Uncensored; length(Time)
timz = sort(unique(Time)); length(timz)
timz[1:40]
tab = table(Time,Uncensored); tab
      Uncensored
Time     0 1
  0.01   3 0
  0.07   1 0
  0.08   1 0
  0.11   1 0
  0.14   1 0
  0.18   1 0
  0.2    1 0
  0.22   1 0
  0.26   1 0
  0.3    2 0
  0.34   0 1    n_j = 275 - (3+1+1+1+1+1+1+1+1+2) = 262
  0.35   2 0
  0.36   1 0
  0.38   1 0
  0.43   1 0
  0.45   1 0
  0.53   1 0
  0.6    2 0
  0.61   0 1    n_j = 275 - sum(tab[1:18,]) = 252
  0.65   1 0
  0.69   1 0
  0.72   1 0
  0.73   1 0
  0.75   1 0
  0.84   1 0
  0.89   1 0
  0.96   1 0
  0.97   1 0
  1.02   1 0
  1.06   1 0
  1.07   1 1    n_j = 275 - sum(tab[1:30,]) = 240
\end{comment}
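% Following up the note above: a rough sketch that computes n_j, d_j and the
% product-limit estimate "by hand," then checks it against survfit from the
% survival package. It assumes the same Weibull.data2.txt file and the coding
% Uncensored = 1 for an observed death, 0 for a censored time.
\begin{comment}
wdata = read.table("http://www.utstat.utoronto.ca/brunner/data/legal/Weibull.data2.txt")
Time = wdata$Time; Uncensored = wdata$Uncensored
tj = sort(unique(Time[Uncensored==1]))                     # Distinct death times
dj = sapply(tj, function(t) sum(Time==t & Uncensored==1))  # Deaths d_j at time t_j
nj = sapply(tj, function(t) sum(Time >= t))                # At risk just before t_j
Shat = cumprod((nj-dj)/nj)                                 # Product-limit estimate
library(survival)
fit = survfit(Surv(Time, Uncensored) ~ 1)
max(abs(Shat - summary(fit, times=tj)$surv))               # Should be essentially zero
\end{comment}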
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Working toward a standard error for $ \displaystyle \widehat{S}(t) = \prod_{t_j \leq t} \widehat{p}_j$}
\framesubtitle{Large-sample Distribution Theory}
\pause
\begin{itemize}
\item $\widehat{p}_j = 1-\frac{d_j}{n_j} = \frac{n_j-d_j}{n_j}$ \pause is a sample proportion -- a sample mean. \pause
\item It is the proportion of the individuals at risk for failure at time $t_j$ who did not fail. \pause
\item It is a mean of independent Bernoullis (conditionally on $n_j$). \pause
\item $E(\widehat{p}_j) = p_j$, $Var(\widehat{p}_j) = \frac{p_j(1-p_j)}{n_j}$ \pause
\item $\widehat{p}_j \stackrel{.}{\sim} N(p_j, \frac{p_j(1-p_j)}{n_j})$ by the Central Limit Theorem. \pause
\item This is for large $n_j$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Recall}
\framesubtitle{Theorem based on the delta method of Cram\'{e}r}
Let $\boldsymbol{\theta} \in \mathbb{R}^k$. Under the conditions for which $\widehat{\boldsymbol{\theta}}_n$ is asymptotically $N_k\left(\boldsymbol{\theta}, \mathbf{V}_n\right)$ with $\mathbf{V}_n = \frac{1}{n} \boldsymbol{\mathcal{I}}(\boldsymbol{\theta})^{-1}$, let the function $g: \mathbb{R}^k \rightarrow \mathbb{R}$ be such that the elements of \.{g}$(\boldsymbol{\theta}) = \left( \frac{\partial g}{\partial\theta_1}, \ldots , \frac{\partial g}{\partial\theta_k} \right)$ are continuous in a neighbourhood of the true parameter vector $\boldsymbol{\theta}$. Then
\begin{displaymath}
g(\widehat{\boldsymbol{\theta}}) \stackrel{.}{\sim} N\left( g(\boldsymbol{\theta}), \mbox{\.{g}}(\boldsymbol{\theta}) \mathbf{V}_n \, \mbox{\.{g}}(\boldsymbol{\theta})^\top \right).
\end{displaymath}
Note that the asymptotic variance $\mbox{\.{g}}(\boldsymbol{\theta}) \mathbf{V}_n \, \mbox{\.{g}}(\boldsymbol{\theta})^\top$ is a matrix product: $(1 \times k)$ times $(k \times k)$ times $(k \times 1)$.

\vspace{4mm}
The standard error of $g(\widehat{\boldsymbol{\theta}})$ is $\sqrt{\mbox{\.{g}}(\widehat{\boldsymbol{\theta}}) \widehat{\mathbf{V}}_n \, \mbox{\.{g}}(\widehat{\boldsymbol{\theta}})^\top}$.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Specializing the delta method to the case of a single parameter}
\framesubtitle{Yielding the univariate delta method}
\pause
Let $\theta \in \mathbb{R}$. Under the conditions for which $\widehat{\theta}_n$ is asymptotically $N\left(\theta, v_n\right)$ with $v_n = \frac{1}{n} \, I(\theta)^{-1}$, let the function $g(x)$ have a continuous derivative in a neighbourhood of the true parameter $\theta$. Then
\begin{displaymath}
g(\widehat{\theta}) \stackrel{.}{\sim} N\left( g(\theta), g^\prime(\theta)^2 \, v_n \right).
\end{displaymath}

\vspace{4mm}
The standard error of $g(\widehat{\theta})$ is $\sqrt{ g^\prime(\widehat{\theta})^2 \, \widehat{v}_n}$, or $\left|g^\prime(\widehat{\theta}) \right|\sqrt{\widehat{v}_n}$.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
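% A quick simulation sketch of the univariate delta method applied to
% log(phat): the Monte Carlo variance of log(phat) should be close to the
% delta method value (1-p)/(n p). The values of n, p and the seed are made up.
\begin{comment}
n = 200; p = 0.9; nsim = 10000
set.seed(9999)
phat = rbinom(nsim, size=n, prob=p)/n   # Simulated sample proportions
var(log(phat))                          # Monte Carlo variance of log(phat)
(1-p)/(n*p)                             # Delta method approximation
\end{comment}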
\begin{frame}
\frametitle{Large-sample Distribution Theory Continued}
\framesubtitle{$\displaystyle \widehat{S}(t) = \prod_{t_j \leq t} \widehat{p}_j $ with $\widehat{p}_j = \frac{n_j-d_j}{n_j} \stackrel{.}{\sim} N\left(p_j, \frac{p_j(1-p_j)}{n_j}\right)$}
\pause
\begin{itemize}
\item Sums are easier to work with than products. \pause
\item $\log \widehat{S}(t) = \sum_{t_j \leq t} \log \widehat{p}_j$ \pause
\item Using the one-variable delta method with $g(x) = \log x$, % Need a HW problem on this.
$\log \widehat{p}_j \stackrel{.}{\sim} N(\log p_j, \frac{1-p_j}{n_jp_j})$ \pause
\item A sum of normals is normal (asymptotically, too). \pause
\item $E(\sum_{t_j \leq t} \log \widehat{p}_j) \approx \sum_{t_j \leq t} \log p_j \pause = \log \prod_{t_j \leq t} \, p_j \pause = \log S(t)$ \pause
\end{itemize}
\begin{eqnarray*}
Var\left(\sum_{t_j \leq t} \log \widehat{p}_j \right) & \approx & \pause \sum_{t_j \leq t} Var(\log \widehat{p}_j)\\ \pause
& = & \sum_{t_j \leq t} \frac{1-p_j}{n_jp_j}
\end{eqnarray*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Asymptotic Distribution of $\displaystyle \log \widehat{S}(t) = \sum_{t_j \leq t} \log \widehat{p}_j$}
%\framesubtitle{}
\begin{displaymath}
\log \widehat{S}(t) \stackrel{.}{\sim} N\left( \log S(t) , \sum_{t_j \leq t} \frac{1-p_j}{n_jp_j} \right)
\end{displaymath} \pause
\begin{itemize}
\item This is a stepping stone to the distribution of $\widehat{S}(t)$.
\item Use the univariate delta method again. \pause
\item The univariate delta method says that if $T_n \stackrel{.}{\sim} N(\theta,v_n)$ then \pause
$g(T_n) \stackrel{.}{\sim} N\left(g(\theta),v_n [g^\prime(\theta)]^2\right)$. \pause
\item Here, $T_n = \log \widehat{S}(t)$, $\theta = \log S(t)$, and $g(x) = e^x$. \pause
\item $g^\prime(\theta) = e^\theta \pause = e^{\log S(t)} \pause = S(t)$. \pause So, \pause
\end{itemize}
\begin{displaymath}
\widehat{S}(t) \stackrel{.}{\sim} N\left( S(t) , S(t)^2\sum_{t_j \leq t} \frac{1-p_j}{n_jp_j} \right)
\end{displaymath}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Standard error of $\widehat{S}(t)$}
\framesubtitle{Used in the denominator of $Z$-tests and in $\widehat{S}(t) \pm 1.96 \, se$}
\begin{displaymath}
\widehat{S}(t) \stackrel{.}{\sim} N\left( S(t) , S(t)^2\sum_{t_j \leq t} \frac{1-p_j}{n_jp_j} \right)
\end{displaymath} \pause
\begin{itemize}
\item Of course we don't know $S(t)$ or $p_j$ in the variance.
\item So use estimates. \pause
\item Estimate $S(t)$ with $\widehat{S}(t)$, and estimate $p_j$ with $\widehat{p}_j = \frac{n_j-d_j}{n_j}$. \pause
\item The resulting estimated asymptotic variance is $\widehat{S}(t)^2 \sum_{t_j \leq t} \left(\frac{d_j}{n_j(n_j-d_j)}\right)$. \pause
\item This is expression (3.1.2) on p.~27 of the text. \pause
% \item[]
\item The standard error of $\widehat{S}(t)$ is $\widehat{S}(t)\sqrt{\sum_{t_j \leq t} \left(\frac{d_j}{n_j(n_j-d_j)}\right)}$. \pause
\item In R's \texttt{survival} package, the default confidence interval for the Kaplan-Meier estimate uses this standard error.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
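% A sketch of getting Shat(t), its standard error, and the plain
% Shat(t) +/- 1.96 se interval from the survival package. conf.type="plain"
% is specified here to match the formula on the slide; the data file is the
% same one assumed in the earlier comment blocks.
\begin{comment}
library(survival)
wdata = read.table("http://www.utstat.utoronto.ca/brunner/data/legal/Weibull.data2.txt")
fit = survfit(Surv(Time, Uncensored) ~ 1, data=wdata, conf.type="plain")
summary(fit)   # Columns: time, n.risk, n.event, survival, std.err, CI limits
# With conf.type="plain", the limits are survival +/- (approximately) 1.96 * std.err
\end{comment}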
\begin{frame}
\frametitle{Counting Processes}
\framesubtitle{The theoretical state of the art}
\pause
\begin{itemize}
\item Distribution theory for the Kaplan-Meier estimate (asymptotic normality, standard error, etc.) has been presented here the way it was originally developed.
\item The derivation is basically sound, but it has some holes. \pause
\item More recently, viewing the number of failures up to time $t$ as a counting process (stochastic processes, STA348 and beyond) has cleaned the whole thing up. \pause
\item The results are the same, but now the proofs are rigorous. \pause
\item There was some guesswork in the development of these ideas, but the main guesses were right.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistics, University of Toronto. It is licensed under a \href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/brunner/oldclass/312f23}{\footnotesize \texttt{http://www.utstat.toronto.edu/brunner/oldclass/312f23}}

\end{frame}

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% I like this but it's not implemented in the survival package.
\begin{frame}
\frametitle{Log Survival Function}
\framesubtitle{That is, the log \emph{estimated} survival function}
\pause
\begin{displaymath}
\log \widehat{S}(t) = \sum_{t_j \leq t} \log \widehat{p}_j \stackrel{.}{\sim} N\left( \log S(t) , \sum_{t_j \leq t} \frac{1-p_j}{n_jp_j} \right)
\end{displaymath} \pause
\begin{itemize}
\item For any fixed $t$, we can get a point estimate and a confidence interval for $\log S(t)$. \pause
\item Convert it to a confidence interval for $S(t)$ by applying the exponential function: \\ \pause
$0.95 \approx P\{A \leq \log S(t) \leq B \} \pause = P\{e^A \leq S(t) \leq e^B \}$. \pause
\item For the standard error of $\log \widehat{S}(t)$, estimate $p_j$ with $\widehat{p}_j = \frac{n_j-d_j}{n_j}$. \pause
\item The estimated variance of $\log \widehat{S}(t)$ is $\sum_{t_j \leq t} \left(\frac{d_j}{n_j(n_j-d_j)}\right)$. \pause
\item This is expression (3.1.1) on p.~27 of the text. \pause
\item The standard error of $\log \widehat{S}(t)$ is $\sqrt{\sum_{t_j \leq t} \left(\frac{d_j}{n_j(n_j-d_j)}\right)}$.
\end{itemize}
\end{frame}
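% A sketch of the log-scale interval from the frame above: build the interval
% for log S(t), then exponentiate the endpoints. The standard error of
% log Shat(t) is obtained from the reported std.err for Shat(t) by the delta
% method: se(Shat)/Shat. Same assumed Weibull data as in the earlier blocks.
\begin{comment}
library(survival)
wdata = read.table("http://www.utstat.utoronto.ca/brunner/data/legal/Weibull.data2.txt")
fit = survfit(Surv(Time, Uncensored) ~ 1, data=wdata)
s = summary(fit)
selog = s$std.err / s$surv                       # se of log Shat(t)
lower = exp(log(s$surv) - 1.96*selog)            # Exponentiated CI endpoints
upper = pmin(1, exp(log(s$surv) + 1.96*selog))   # Cap the upper limit at one
cbind(time=s$time, surv=s$surv, lower, upper)[1:5, ]
\end{comment}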