% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usetheme{AnnArbor} % CambridgeUS
% I'm using this one (yellow) just to be different from Dehan.
\usepackage{comment}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Principal Components\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2023}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Principal Components}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Principal Components Analysis is \emph{not} Factor Analysis}
%\framesubtitle{}
\begin{itemize}
\item Factor analysis is the measurement model: $\mathbf{d} = \boldsymbol{\Lambda}\mathbf{F} + \mathbf{e}$.
\vspace{2mm}

\begin{picture}(20,20)(0,0)
% \put(0,0){\circle{10}}
\put(50,0){\circle{30}}
\put(47,-3){$\mathbf{F}$}
\put(150,-10){\framebox(25,25){$\mathbf{d}$}}
\put(67,0){\vector(1,0){80}}
\put(250,-2){$\mathbf{e}$}
\put(245,0){\vector(-1,0){67}}
\end{picture}
\pause
\vspace{5mm}
\item Principal components are observable linear combinations: $\mathbf{y} = \mathbf{C}^\top \mathbf{d}$.
\vspace{4mm}

\begin{picture}(20,20)(0,-10)
\put(40,-10){\framebox(25,25){$\mathbf{d}$}}
\put(150,-10){\framebox(25,25){$\mathbf{y}$}}
\put(67,0){\vector(1,0){80}}
\end{picture}
\vspace{2mm}
\pause
\item Still, principal components and factor analysis have notable similarities and are frequently confused.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Data Reduction}
%\framesubtitle{}
\begin{itemize}
\item Suppose you have a large number of variables that are correlated with one another.
\item Principal components analysis allows you to find a smaller set of linear combinations of the variables.
\item These linear combinations may contain most of the variation in the original set.
\item Use a few linear combinations in place of the entire data set.
\end{itemize}
\end{frame}
% It may be that little is lost by using the linear combinations in place of the original variables, and there can be substantial advantages in terms of storage and processing.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Our Version}
\framesubtitle{Standardized}
\begin{itemize}
\item There are $k$ observable variables, standardized: $z_j = \frac{x_j-\mu_j}{\sigma_j}$. \pause
\item $E(\mathbf{z}) = \mathbf{0}$, and $cov(\mathbf{z}) = \boldsymbol{\Sigma}$, a correlation matrix. \pause
\item $\boldsymbol{\Sigma} = \mathbf{CDC}^\top$, the spectral decomposition: the columns of $\mathbf{C}$ are eigenvectors, and $\mathbf{D}$ is diagonal, with the eigenvalues $\lambda_1 \geq \lambda_2 \geq \cdots \geq \lambda_k \geq 0$ on the main diagonal. \pause
\item[]
\item $\mathbf{y} = \mathbf{C}^\top\mathbf{z}$ are the \emph{principal components} of $\mathbf{z}$.
\item A set of $k$ linear combinations.
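\item A small illustration (with a correlation $\rho$ introduced just for this example): suppose $k=2$ and $Corr(z_1,z_2) = \rho > 0$. Then
\begin{displaymath}
\lambda_1 = 1+\rho, \quad \lambda_2 = 1-\rho, \quad
y_1 = \frac{z_1+z_2}{\sqrt{2}}, \quad y_2 = \frac{z_1-z_2}{\sqrt{2}}.
\end{displaymath}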
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Rotation}
%\framesubtitle{}
\begin{itemize}
\item Because $\mathbf{CC}^\top = \mathbf{I}$, $\mathbf{C}$ and $\mathbf{C}^{\top}$ are \emph{orthogonal} matrices.
\item Geometrically, multiplying a point by an orthogonal matrix gives the location of the point in a new co-ordinate axis system, where the original axes have been \emph{rotated}. \pause
\item For the multivariate normal, contours of constant probability density are ellipsoids. \pause
\item In principal components, the axes of the new co-ordinate system line up with the principal axes of the ellipsoids.
\end{itemize}
% Normality nice for interpretation but not necessary, and we will return to rotations later.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Mean and Covariance Matrix}
\framesubtitle{Of principal components $\mathbf{y} = \mathbf{C}^\top\mathbf{z}$}

$E(\mathbf{y}) = \mathbf{0}$, and
\begin{eqnarray*}
cov(\mathbf{y}) & = & \pause cov(\mathbf{C}^\top\mathbf{z}) \\ \pause
& = & \mathbf{C}^\top cov(\mathbf{z})\mathbf{C} \\ \pause
& = & \mathbf{C}^\top \boldsymbol{\Sigma}\mathbf{C} \\ \pause
& = & \mathbf{C}^\top\mathbf{C \, D \, C}^\top\mathbf{C} \\ \pause
& = & \mathbf{D}
\end{eqnarray*} \pause
So covariances of the principal components are all zero, and their variances are the eigenvalues.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{$\mathbf{y} = \mathbf{C}^\top\mathbf{z} \iff \mathbf{z} = \mathbf{C}\mathbf{y}$} \pause
%\framesubtitle{}

In scalar form,
\begin{eqnarray*}
z_1 & = & c_{11}y_1 + c_{12}y_2 + \cdots + c_{1k}y_k \\
z_2 & = & c_{21}y_1 + c_{22}y_2 + \cdots + c_{2k}y_k \\
\vdots & & \hspace{20mm} \vdots \\
z_k & = & c_{k1}y_1 + c_{k2}y_2 + \cdots + c_{kk}y_k.
\end{eqnarray*} \pause
So because the elements of $\mathbf{y}$ are uncorrelated,
\begin{eqnarray*}
Var(z_j) & = & Var(c_{j1}y_1 + c_{j2}y_2 + \cdots + c_{jk}y_k) \\ \pause
& = & c_{j1}^2 Var(y_1) + c_{j2}^2 Var(y_2) + \cdots + c_{jk}^2 Var(y_k) \\ \pause
& = & c_{j1}^2 \lambda_1 + c_{j2}^2 \lambda_2 + \cdots + c_{jk}^2 \lambda_k \pause = 1.
\end{eqnarray*}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Components of Variance}
%\framesubtitle{}

From
\begin{eqnarray*}
Var(z_j) & = & Var(c_{j1}y_1 + c_{j2}y_2 + \cdots + c_{jk}y_k) \\
& = & c_{j1}^2 Var(y_1) + c_{j2}^2 Var(y_2) + \cdots + c_{jk}^2 Var(y_k) \\
& = & c_{j1}^2 \lambda_1 + c_{j2}^2 \lambda_2 + \cdots + c_{jk}^2 \lambda_k = 1,
\end{eqnarray*}
we see \pause
\begin{itemize}
\item The variance of $z_j$ is decomposed into the part explained by $y_1$, the part explained by $y_2$, and so on.
\item Specifically, $y_1$ explains $c_{j1}^2 \lambda_1$ of the variance, $y_2$ explains $c_{j2}^2 \lambda_2$ of the variance, and so on. \pause
\item Because $z_j$ is standardized, these are \emph{proportions} of variance.
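\item In the small $k=2$ illustration (with $Corr(z_1,z_2) = \rho$ as before),
\begin{displaymath}
Var(z_1) = c_{11}^2\lambda_1 + c_{12}^2\lambda_2
= \frac{1}{2}(1+\rho) + \frac{1}{2}(1-\rho) = 1,
\end{displaymath}
so $y_1$ explains the proportion $\frac{1+\rho}{2}$ of the variance of $z_1$, and $y_2$ explains the remaining $\frac{1-\rho}{2}$.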
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Squared Correlations}
%\framesubtitle{}

Using the fact that $cov(y_i,y_j)=0$ for $i \neq j$, \pause
\begin{eqnarray*}
Cov(z_i,y_j) & = & Cov(c_{i1}y_1 + c_{i2}y_2 + \cdots + c_{ij}{\color{blue} y_j } + \cdots + c_{ik}y_k {\color{red},} ~ {\color{blue} y_j)} \\ \pause
& = & c_{ij}Cov(y_j,y_j) \\ \pause
& = & c_{ij}\lambda_j. \pause
\end{eqnarray*}
Then,
\begin{eqnarray*}
Corr(z_i,y_j) & = & \frac{Cov(z_i,y_j)}{SD(z_i)SD(y_j)} \nonumber \\ \pause
& = & \frac{c_{ij}\lambda_j}{1 \cdot \sqrt{\lambda_j}} \pause = c_{ij}\sqrt{\lambda_j},
\end{eqnarray*} \pause
and the \emph{squared} correlation between $z_i$ and $y_j$ is $c_{ij}^2 \lambda_j$.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Squared correlation between $z_i$ and $y_j$ is $c_{ij}^2 \lambda_j$}
\framesubtitle{And using $Var(z_j) = c_{j1}^2 \lambda_1 + c_{j2}^2 \lambda_2 + \cdots + c_{jk}^2 \lambda_k$}
\begin{eqnarray*}
Var(z_1) & = & c_{11}^2\lambda_1 + c_{12}^2\lambda_2 + \cdots + c_{1k}^2\lambda_k \nonumber \\
Var(z_2) & = & c_{21}^2\lambda_1 + c_{22}^2\lambda_2 + \cdots + c_{2k}^2\lambda_k \\
\vdots & & \hspace{20mm} \vdots \nonumber \\
Var(z_k) & = & c_{k1}^2\lambda_1 + c_{k2}^2\lambda_2 + \cdots + c_{kk}^2\lambda_k. \nonumber
\end{eqnarray*} \pause

\vspace{5mm}
The pieces of variance being added up are the squared correlations between the original variables and the principal components.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{A Matrix of Squared Correlations}
\framesubtitle{Components of Variance}
{\small
Element $i,j$ is $Corr(z_i,y_j)^2$
\begin{center}
\begin{tabular}{c||c|c|c|c|}
 & $y_1$ & $y_2$ & $\cdots$ & $y_k$ \\ \hline \hline
$z_1$ & $c_{11}^2\lambda_1$ & $c_{12}^2\lambda_2$ & $\cdots$ & $c_{1k}^2\lambda_k$ \\ \hline
$z_2$ & $c_{21}^2\lambda_1$ & $c_{22}^2\lambda_2$ & $\cdots$ & $c_{2k}^2\lambda_k$ \\ \hline
$\vdots$ & $\vdots$ & $\vdots$ & $\ddots$ & $\vdots$ \\ \hline
$z_k$ & $c_{k1}^2\lambda_1$ & $c_{k2}^2\lambda_2$ & $\cdots$ & $c_{kk}^2\lambda_k$ \\ \hline
\end{tabular}
\end{center} \pause
\begin{itemize}
\item If you add the entries in any row, you get one.
\item Adding the entries in a column yields the total amount of variance in the original variables that is explained by that principal component.
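\item In the small $k=2$ illustration, the matrix of squared correlations is
$\left( \begin{array}{cc} \frac{1+\rho}{2} & \frac{1-\rho}{2} \\ \frac{1+\rho}{2} & \frac{1-\rho}{2} \end{array} \right)$:
each row adds to one, and the column totals are $1+\rho$ and $1-\rho$.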
\pause
\item The sum of entries in column $j$ is
\begin{eqnarray*} \label{colsumsqcorr}
\sum_{i=1}^k c_{ij}^2 \lambda_j & = & \lambda_j \sum_{i=1}^k c_{ij}^2 \nonumber \\
& = & \lambda_j \cdot 1 = \lambda_j
\end{eqnarray*}
because $\mathbf{C}^\top\mathbf{C} = \mathbf{I}$, so the squared entries in each column of $\mathbf{C}$ add to one.
\end{itemize}
} % End size
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Meaning of the Eigenvalues of $\boldsymbol{\Sigma}$}
%\framesubtitle{}
\begin{center}
\renewcommand{\arraystretch}{1.5}
\begin{tabular}{c||c|c|c|c|}
 & $y_1$ & $y_2$ & $\cdots$ & $y_k$ \\ \hline \hline
$z_1$ & $c_{11}^2\lambda_1$ & $c_{12}^2\lambda_2$ & $\cdots$ & $c_{1k}^2\lambda_k$ \\ \hline
$z_2$ & $c_{21}^2\lambda_1$ & $c_{22}^2\lambda_2$ & $\cdots$ & $c_{2k}^2\lambda_k$ \\ \hline
$\vdots$ & $\vdots$ & $\vdots$ & $\ddots$ & $\vdots$ \\ \hline
$z_k$ & $c_{k1}^2\lambda_1$ & $c_{k2}^2\lambda_2$ & $\cdots$ & $c_{kk}^2\lambda_k$ \\ \hline
\multicolumn{1}{c}{~} & \multicolumn{1}{c}{\color{red}$\lambda_1$} & \multicolumn{1}{c}{\color{red}$\lambda_2$} & \multicolumn{1}{c}{\color{red}$\cdots$} & \multicolumn{1}{c}{\color{red}$\lambda_k$}
\end{tabular}
\renewcommand{\arraystretch}{1.0}
\end{center} \pause

\vspace{4mm}
The eigenvalues are both the variances of the principal components and the amounts of variance in the original variables that are explained by the respective principal components.
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{It gets better}
%\framesubtitle{}

A theorem says
\begin{itemize}
\item $y_1$ has the greatest possible variance of any linear combination whose squared weights add up to one. \pause
\item $y_2$ is the linear combination that has the greatest variance, subject to the constraints that it's orthogonal to $y_1$ and its squared weights add to one. \pause
\item $y_3$ is the linear combination that has the greatest variance, subject to the constraints that it's orthogonal to $y_1$ and $y_2$, and its squared weights add to one. \pause
\item And so on.
\item It's a kind of optimality.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Data reduction}
%\framesubtitle{}
\begin{itemize}
\item If the correlations among the original variables are substantial, the first few eigenvalues will be relatively large.
\item The data reduction idea is to retain only the first several principal components, the ones that contain most of the variation in the original variables.
\item The expectation is that they will capture most of the \emph{meaningful} variation. \pause
\item The conventional choice is to retain components with eigenvalues greater than one.
% \item Zero or near-zero eigenvalues are a bonus.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Sample Principal Components}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Sample Principal Components}
%\framesubtitle{}
\begin{itemize}
\item Of course we don't know $\boldsymbol{\Sigma}$, and we don't know the means and standard deviations needed to standardize.
\item So use the sample versions. \pause
\item[]
\item $\mathbf{Z}$ is an $n \times k$ matrix of standardized variables.
\item The independent (or almost independent) random vectors are the \emph{rows} of $\mathbf{Z}$. \pause
\item Let $\mathbf{Y} = \mathbf{Z}\widehat{\mathbf{C}}$. Rows of $\mathbf{Y}$ contain the sample principal components, one row for each case.
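\item A sketch of why the population formulas carry over (assuming $\widehat{\boldsymbol{\Sigma}} = \frac{1}{n}\mathbf{Z}^\top\mathbf{Z} = \widehat{\mathbf{C}}\widehat{\mathbf{D}}\widehat{\mathbf{C}}^\top$, the sample correlation matrix with $n$ in the denominator, as in the next point):
\begin{displaymath}
\frac{1}{n}\mathbf{Y}^\top\mathbf{Y}
= \widehat{\mathbf{C}}^\top \left( \frac{1}{n}\mathbf{Z}^\top\mathbf{Z} \right) \widehat{\mathbf{C}}
= \widehat{\mathbf{C}}^\top \widehat{\boldsymbol{\Sigma}} \, \widehat{\mathbf{C}}
= \widehat{\mathbf{D}},
\end{displaymath}
so the sample principal components have zero sample covariances, and their sample variances are the eigenvalues $\widehat{\lambda}_j$.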
\pause
\item All formulas apply to sample principal components, provided we use $n$ in the denominators and not $n-1$. \pause
\item[]
\item Principal components regression: regress a response variable on the first few sample principal components instead of on all $k$ original explanatory variables.
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s23}
{\small\texttt{http://www.utstat.toronto.edu/brunner/oldclass/431s23}}

\end{frame}

\end{document}