% \documentclass[serif]{beamer} % Serif for Computer Modern math font.
\documentclass[serif, handout]{beamer} % Handout mode to ignore pause statements
\hypersetup{colorlinks,linkcolor=,urlcolor=red}
\usefonttheme{serif} % Looks like Computer Modern for non-math text -- nice!
\setbeamertemplate{navigation symbols}{} % Suppress navigation symbols
% \usetheme{Berlin} % Displays sections on top
% \usetheme{Frankfurt} % Displays section titles on top: Fairly thin but still swallows some material at bottom of crowded slides
%\usetheme{Berkeley}
\usetheme{AnnArbor} % CambridgeUS
% I'm using this one (yellow) just to be different from Dehan.
\usepackage{comment}
\usepackage[english]{babel}
\usepackage{amsmath} % for binom
\usepackage{tikz}
% \usepackage{graphicx} % To include pdf files!
% \definecolor{links}{HTML}{2A1B81}
% \definecolor{links}{red}
\setbeamertemplate{footline}[frame number]
\mode<presentation>

\title{Exploratory Factor Analysis\footnote{See last slide for copyright information.}}
\subtitle{STA431 Spring 2023}
\date{} % To suppress date

\begin{document}

\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Factor Analysis: The Measurement Model}
\vspace{-4mm}
{\Large
\begin{displaymath}
\mathbf{d}_i = \boldsymbol{\Lambda}\mathbf{F}_i + \mathbf{e}_i
\end{displaymath}
}
\begin{center}
\includegraphics[width=2.5in]{efa}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Example with 2 factors and 8 observed variables}
\begin{equation*}
\begin{array}{cccccc}
\mathbf{d}_i & = & \boldsymbol{\Lambda} & \mathbf{F}_i & + & \mathbf{e}_i \\
&&&&& \\
\left( \begin{array}{c}
d_{i,1} \\ d_{i,2} \\ d_{i,3} \\ d_{i,4} \\ d_{i,5} \\ d_{i,6} \\ d_{i,7} \\ d_{i,8}
\end{array} \right) & = &
\left( \begin{array}{c c}
\lambda_{11} & \lambda_{12} \\
\lambda_{21} & \lambda_{22} \\
\lambda_{31} & \lambda_{32} \\
\lambda_{41} & \lambda_{42} \\
\lambda_{51} & \lambda_{52} \\
\lambda_{61} & \lambda_{62} \\
\lambda_{71} & \lambda_{72} \\
\lambda_{81} & \lambda_{82} \\
\end{array} \right) &
\left(\begin{array}{c} F_{i,1} \\ F_{i,2} \end{array}\right) & + &
\left(\begin{array}{c}
e_{i,1} \\ e_{i,2} \\ e_{i,3} \\ e_{i,4} \\ e_{i,5} \\ e_{i,6} \\ e_{i,7} \\ e_{i,8}
\end{array}\right) .
\end{array}
\end{equation*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Terminology}
%\framesubtitle{}
%{\LARGE
\begin{eqnarray*}
d_{i,1} &=& \lambda_{11} F_{i,1} + \lambda_{12} F_{i,2} + e_{i,1} \\
d_{i,2} &=& \lambda_{21} F_{i,1} + \lambda_{22} F_{i,2} + e_{i,2} \mbox{ ~etc.}
\end{eqnarray*}
%} % End size
\begin{itemize}
\item The lambda values are called \emph{factor loadings}.
\item $F_1$ and $F_2$ are sometimes called \emph{common factors}, because they influence all the observed variables.
\item Error terms $e_1, \ldots, e_8$ are sometimes called \emph{unique factors}, because each one influences only a single observed variable.
\item The factors are latent variables.
\item The $d_{ij}$ are observable variables.
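\item For example, $\lambda_{31}$ (row 3, column 1 of $\boldsymbol{\Lambda}$) is the loading of $d_3$ on $F_1$: one row for each observed variable, one column for each factor.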
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Two kinds of factor analysis}
%\framesubtitle{}
\begin{itemize}
\item \textbf{Exploratory}: The goal is to describe and summarize the data by explaining a large number of observed variables in terms of a smaller number of latent variables (factors). The factors are the reason the observable variables have the correlations they do. Arrows from all factors to all observable variables.
\item \textbf{Confirmatory}: Estimation and hypothesis testing as usual.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Unconstrained \textbf{Exploratory} Factor Analysis}
\framesubtitle{Arrows from all factors to all observed variables, factors correlated}
\begin{center}
\includegraphics[width=2.5in]{efa}
\end{center}
\end{frame}

\begin{comment}
* Arrows from all factors to all observed variables
* Of course can have lots of factors: 16 pf
* Even when factors are uncorrelated, it’s massively unidentified.
* Infinitely many VERY different sets of parameter values can yield the same covariance matrix for the observed variables, same distribution of data if normal.
* Reasonable, been going on for around 100 years, and completely DOOMED TO FAILURE as a method of statistical estimation.
\end{comment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{The Model: $\mathbf{d} = \boldsymbol{\Lambda}\mathbf{F}+\mathbf{e}$}
%\framesubtitle{}
{\Large
\begin{eqnarray*}
cov(\mathbf{F}) &=& \boldsymbol{\Phi} \\
cov(\mathbf{e}) &=& \boldsymbol{\Omega} \mbox{ (usually diagonal)} \\
&& \mathbf{F} \mbox{ and } \mathbf{e} \mbox{ independent (multivariate normal)} \\
cov(\mathbf{d}) &=& \boldsymbol{\Sigma} = \boldsymbol{\Lambda\Phi\Lambda}^\top + \boldsymbol{\Omega}
\end{eqnarray*}
} % End size
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Identifiability}
%\framesubtitle{}
{\LARGE
\begin{eqnarray*}
\boldsymbol{\Lambda\Phi\Lambda}^\top + \boldsymbol{\Omega}
& = & \boldsymbol{\Lambda \, \Phi}^{1/2} \mathbf{I}\boldsymbol{\Phi}^{1/2} \, \boldsymbol{\Lambda}^\top + \boldsymbol{\Omega} \\ \pause
& = & (\boldsymbol{\Lambda \Phi}^{1/2}) \mathbf{I} (\boldsymbol{\Phi}^{1/2\top} \boldsymbol{\Lambda}^\top) + \boldsymbol{\Omega} \\ \pause
& = & (\boldsymbol{\Lambda \Phi}^{1/2}) \mathbf{I} (\boldsymbol{\Lambda \Phi}^{1/2})^\top + \boldsymbol{\Omega} \\ \pause
& = & \boldsymbol{\Lambda}_2 \mathbf{I} \boldsymbol{\Lambda}_2^\top + \boldsymbol{\Omega}
\end{eqnarray*} \pause
} % End size
$(\boldsymbol{\Phi}, \boldsymbol{\Lambda}, \boldsymbol{\Omega})$ and $(\mathbf{I}, \boldsymbol{\Lambda}_2, \boldsymbol{\Omega})$ yield the same $\boldsymbol{\Sigma}$.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{It's worse than that}
%\framesubtitle{}
Let $\mathbf{Q}$ be an arbitrary positive definite covariance matrix for $\mathbf{F}_i$.
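Since $\mathbf{Q}$ is positive definite, it has a symmetric positive definite square root $\mathbf{Q}^{\frac{1}{2}}$ with inverse $\mathbf{Q}^{-\frac{1}{2}}$, so that $\mathbf{Q}^{-\frac{1}{2}} \mathbf{Q} \, \mathbf{Q}^{-\frac{1}{2}} = \mathbf{I}$.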
\pause
{\LARGE
\begin{eqnarray*}
\boldsymbol{\Sigma} & = & \boldsymbol{\Lambda}_2 \mathbf{I} \boldsymbol{\Lambda}_2^\top + \boldsymbol{\Omega} \nonumber \\ \pause
&=& \boldsymbol{\Lambda}_2 \mathbf{Q}^{-\frac{1}{2}} \mathbf{Q} \mathbf{Q}^{-\frac{1}{2}} \boldsymbol{\Lambda}_2^\top + \boldsymbol{\Omega} \nonumber \\ \pause
&=& (\boldsymbol{\Lambda}_2 \mathbf{Q}^{-\frac{1}{2}}) \mathbf{Q} (\mathbf{Q}^{-\frac{1}{2}\top} \boldsymbol{\Lambda}_2^\top) + \boldsymbol{\Omega} \nonumber \\ \pause
&=& (\boldsymbol{\Lambda}_2 \mathbf{Q}^{-\frac{1}{2}}) \mathbf{Q} (\boldsymbol{\Lambda}_2 \mathbf{Q}^{-\frac{1}{2}})^\top + \boldsymbol{\Omega} \nonumber \\ \pause
&=& \boldsymbol{\Lambda}_3 \mathbf{Q} \boldsymbol{\Lambda}_3^\top + \boldsymbol{\Omega}
\end{eqnarray*} \pause
} % End size
So by adjusting the factor loadings, the covariance matrix of the factors could be \emph{anything}.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Parameters are not identifiable}
%\framesubtitle{}
\begin{itemize}
\item The parameters of the general measurement model are not identifiable without some restrictions on the possible values of the parameter matrices.
\item Notice that the general unrestricted model could be very close to the truth. But the parameters cannot be estimated successfully, period.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Solution: Restrict the model}
%\framesubtitle{}
{\LARGE
\begin{displaymath}
\boldsymbol{\Lambda\Phi\Lambda}^\top = \boldsymbol{\Lambda}_2 \mathbf{I} \boldsymbol{\Lambda}_2^\top
\end{displaymath}
}
\begin{itemize}
\item Fix $\boldsymbol{\Phi} = \mathbf{I}$. \pause
\item All the factors are standardized, as well as independent.
\item Justify this on the grounds of simplicity.
\item Say the factors are ``orthogonal'' (at right angles, uncorrelated).
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Standardize the observed variables too}
\framesubtitle{This is one version}
%\framesubtitle{}
\begin{itemize}
\item For $j = 1, \ldots, k$ and independently for $i=1, \ldots,n$,
\begin{displaymath}
z_{ij} = \frac{d_{ij}-\mu_j}{\sqrt{\sigma_{jj}}}
\end{displaymath}
\item Each observed variable has variance one as well as mean zero.
\item $\boldsymbol{\Sigma}$ is now a correlation matrix.
\item Base inference on the sample correlation matrix.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Standardized Exploratory Factor Analysis Model}
\framesubtitle{Implicitly for $i=1, \ldots,n$}
{\LARGE
\begin{equation*}
\mathbf{z} = \boldsymbol{\Lambda}\mathbf{F} + \mathbf{e}
\end{equation*}
} % End size
where
\begin{itemize}
\item $\mathbf{z}$ is a $k \times 1$ observable random vector. Each element of $\mathbf{z}$ has expected value zero and variance one.
\item $\boldsymbol{\Lambda}$ is a $k \times p$ matrix of constants.
\item $\mathbf{F}$ ($F$ for factor) is a $p \times 1$ latent random vector with expected value zero and covariance matrix $\mathbf{I}_p$.
\item The $k \times 1$ vector of error terms $\mathbf{e}$ has expected value zero and covariance matrix $\boldsymbol{\Omega}$, which is diagonal.
\item $\mathbf{F}$ and $\mathbf{e}$ are independent.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Factor Loadings are Correlations}
%\framesubtitle{}
\begin{eqnarray*}
corr(\mathbf{z},\mathbf{F}) & = & cov(\mathbf{z},\mathbf{F}) \pause \\
& = & cov(\boldsymbol{\Lambda}\mathbf{F} + \mathbf{e},\mathbf{F}) \pause \\
& = & \boldsymbol{\Lambda}cov(\mathbf{F},\mathbf{F}) + cov(\mathbf{e},\mathbf{F}) \pause \\
& = & \boldsymbol{\Lambda}cov(\mathbf{F}) + \mathbf{0} \pause \\
& = & \boldsymbol{\Lambda}\mathbf{I} \pause \\
& = & \boldsymbol{\Lambda}
\end{eqnarray*} \pause
\begin{itemize}
\item Correlations equal covariances here because $\mathbf{z}$ and $\mathbf{F}$ are both standardized.
\item The correlation between observed variable $i$ and factor $j$ is $\lambda_{ij}$.
\item The square of $\lambda_{ij}$ is the reliability of observed variable $i$ as a measure of factor $j$.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{$\mathbf{z} = \boldsymbol{\Lambda}\mathbf{F} + \mathbf{e}$}
%\framesubtitle{}
% In scalar form,
\begin{eqnarray*}
z_1 & = & \lambda_{11}F_1 + \lambda_{12}F_2 + \cdots + \lambda_{1p}F_p + e_1 \\
z_2 & = & \lambda_{21}F_1 + \lambda_{22}F_2 + \cdots + \lambda_{2p}F_p + e_2 \\
\vdots & & \hspace{20mm} \vdots \\
z_k & = & \lambda_{k1}F_1 + \lambda_{k2}F_2 + \cdots + \lambda_{kp}F_p + e_k
\end{eqnarray*} \pause
\begin{eqnarray*}
Var(z_1) & = & \lambda_{11}^2 + \lambda_{12}^2 + \cdots + \lambda_{1p}^2 + \omega_1 \\
Var(z_2) & = & \lambda_{21}^2 + \lambda_{22}^2 + \cdots + \lambda_{2p}^2 + \omega_2 \\
\vdots & & \hspace{22mm} \vdots \\
Var(z_k) & = & \lambda_{k1}^2 + \lambda_{k2}^2 + \cdots + \lambda_{kp}^2 + \omega_k
\end{eqnarray*} \pause
$Var(z_j)=1$, so $\omega_j = 1-\lambda_{j1}^2 - \lambda_{j2}^2 - \cdots - \lambda_{jp}^2$.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Communality and Uniqueness}
\framesubtitle{$Var(z_j) = \lambda_{j1}^2 + \lambda_{j2}^2 + \cdots + \lambda_{jp}^2 + \omega_j = 1$} \pause
\begin{itemize}
\item The explained variance in $z_j$ is $\lambda_{j1}^2 + \lambda_{j2}^2 + \cdots + \lambda_{jp}^2$. It is called the \emph{communality}.
\item To get the communality, add the squared factor loadings in row $j$ of $\boldsymbol{\Lambda}$. \pause
\item $\omega_j = 1-\lambda_{j1}^2 - \lambda_{j2}^2 - \cdots - \lambda_{jp}^2$ is called the \emph{uniqueness}. It's the proportion of variance that is \emph{not} explained by the factors.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{If we could estimate the factor loadings}
%\framesubtitle{}
\begin{itemize}
\item We could estimate the correlation of each observable variable with each factor.
\item We could easily estimate reliabilities.
\item We could assess how much of the variance in each observable variable comes from each factor.
\item This could reveal what the underlying factors are, and what they mean.
\end{itemize} \pause
Unfortunately, we still can't estimate the factor loadings.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Rotation Matrices}
%\framesubtitle{}
\begin{itemize}
\item Start with a co-ordinate system based on the orthonormal basis vectors $\vec{i}$ and $\vec{j}$.
\item Rotate the axes through an angle $\theta$.
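\item In the picture below, $\theta = 45^\circ$, so for example the rotated basis vector $\vec{i}^{\,\prime} = (\cos\theta, \, \sin\theta) \approx (0.707, \, 0.707)$ in the original co-ordinates.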
\end{itemize}
\begin{center}
\begin{tikzpicture}[>=stealth, scale=2]
% Draw original axes
\draw[dashed, color=blue!50] (-1.5,0) -- (1.5,0);
\draw[dashed, color=blue!50] (0,-1.5) -- (0,1.5);
% Draw original basis vectors
\draw[dashed, very thick, ->] (0,0) -- (1,0);
\draw (1,0) node[above] {$i$};
\draw[dashed, very thick, ->] (0,0) -- (0,1);
\draw (0,1) node[right] {$j$};
% Draw rotated axes
\draw[color=blue!50] (0,0) -- (45:1.5);
\draw[color=blue!50] (0,0) -- (225:1.5);
\draw[color=blue!50] (0,0) -- (-45:1.5);
\draw[color=blue!50] (0,0) -- (135:1.5);
% Draw rotated basis vectors
\draw[very thick, ->] (0,0) -- (0.707,0.707); % 1/sqrt(2) = 0.707 or so.
\draw (0.707,0.707) node[right] {$i^\prime$};
% The other basis vector. This is nicer
\draw[very thick, ->] (0,0) -- (135:1); % angle 135 degrees, length 1
\draw (-0.707,0.707) node[left] {$j^\prime$};
% Draw the arcs
\draw [thick, red, ->] (4mm,0mm) arc (0:45:4mm);
\draw (2mm,-0.4mm) node[above] {\small \color{red} $\theta$};
\draw [thick, red, ->] (0mm,4mm) arc (90:135:4mm);
\end{tikzpicture}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Equations of Rotation}
%\framesubtitle{}
\begin{columns}
\column{0.6\textwidth}
% Repeating the picture
\begin{tikzpicture}[>=stealth, scale=2]
% Draw original axes
\draw[dashed, color=blue!50] (-1.5,0) -- (1.5,0);
\draw[dashed, color=blue!50] (0,-1.5) -- (0,1.5);
% Draw original basis vectors
\draw[dashed, very thick, ->] (0,0) -- (1,0);
\draw (1,0) node[above] {$i$};
\draw[dashed, very thick, ->] (0,0) -- (0,1);
\draw (0,1) node[right] {$j$};
% Draw rotated axes
\draw[color=blue!50] (0,0) -- (45:1.5);
\draw[color=blue!50] (0,0) -- (225:1.5);
\draw[color=blue!50] (0,0) -- (-45:1.5);
\draw[color=blue!50] (0,0) -- (135:1.5);
% Draw rotated basis vectors
\draw[very thick, ->] (0,0) -- (0.707,0.707); % 1/sqrt(2) = 0.707 or so.
\draw (0.707,0.707) node[right] {$i^\prime$};
% The other basis vector. This is nicer
\draw[very thick, ->] (0,0) -- (135:1); % angle 135 degrees, length 1
\draw (-0.707,0.707) node[left] {$j^\prime$};
% Draw the arcs
\draw [thick, red, ->] (4mm,0mm) arc (0:45:4mm);
\draw (2mm,-0.4mm) node[above] {\small \color{red} $\theta$};
\draw [thick, red, ->] (0mm,4mm) arc (90:135:4mm);
\end{tikzpicture}
\column{0.4\textwidth}
If a point on the plane is denoted in terms of $\vec{i}$ and $\vec{j}$ by $(x,y)$, its position in terms of the rotated basis vectors is
\begin{eqnarray*}
x^\prime &=& ~~x\cos\theta + y\sin\theta \\
y^\prime &=& -x\sin\theta + y\cos\theta.
\end{eqnarray*}
\end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{In Matrix Form}
%\framesubtitle{}
The equations of rotation
\begin{eqnarray*}
x^\prime &=& ~~x\cos\theta + y\sin\theta \\
y^\prime &=& -x\sin\theta + y\cos\theta
\end{eqnarray*}
may be written
\begin{equation*}
\left( \begin{array}{c} x^\prime \\ y^\prime \end{array} \right) =
\left( \begin{array}{rr} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{array} \right)
\left( \begin{array}{c} x \\ y \end{array} \right) =
\mathbf{R}\left( \begin{array}{c} x \\ y \end{array} \right).
\end{equation*} \pause
Using the identities $\cos(-\theta) = \cos\theta$ and $\sin(-\theta) = -\sin\theta$, rotate back through an angle of $-\theta$.
\begin{equation*}
\left( \begin{array}{c} x \\ y \end{array} \right) =
\left( \begin{array}{rr} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{array} \right)
\left( \begin{array}{c} x^\prime \\ y^\prime \end{array} \right) =
\mathbf{R}^\top\left( \begin{array}{c} x^\prime \\ y^\prime \end{array} \right).
\end{equation*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Verifying that $\mathbf{R}^\top = \mathbf{R}^{-1}$}
%\framesubtitle{}
\begin{eqnarray*}
\mathbf{R}\mathbf{R}^\top &=&
\left( \begin{array}{rr} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{array} \right)
\left( \begin{array}{rr} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{array} \right) \\
&=& \left( \begin{array}{rr} \cos^2\theta+\sin^2\theta & -\cos\theta\sin\theta + \sin\theta\cos\theta \\
-\sin\theta\cos\theta + \cos\theta\sin\theta & \sin^2\theta+\cos^2\theta \end{array} \right) \\
&=& \left( \begin{array}{rr} 1 & 0 \\ 0 & 1 \end{array} \right) = \mathbf{I}.
\end{eqnarray*} \pause
In higher dimensions as well, pre-multiplication by an orthogonal matrix corresponds to a rotation or possibly a reflection.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Another source of non-identifiability}
\framesubtitle{Returning to the standardized factor model}
\begin{eqnarray*}
cov(\mathbf{z}) &=& \boldsymbol{\Sigma} \\
&=& \boldsymbol{\Lambda}\boldsymbol{\Lambda}^\top + \boldsymbol{\Omega} \\ \pause
&=& \boldsymbol{\Lambda} \mathbf{R}^\top\mathbf{R} \boldsymbol{\Lambda}^\top + \boldsymbol{\Omega} \\ \pause
&=& (\boldsymbol{\Lambda}\mathbf{R}^\top) (\boldsymbol{\Lambda}\mathbf{R}^\top)^\top + \boldsymbol{\Omega} \\ \pause
&=& \boldsymbol{\Lambda}_2\boldsymbol{\Lambda}_2^\top + \boldsymbol{\Omega}
\end{eqnarray*} \pause
Infinitely many rotation matrices produce the same $\boldsymbol{\Sigma}$, even though the factor loadings in $\boldsymbol{\Lambda}_2 = \boldsymbol{\Lambda}\mathbf{R}^\top$ can be very different for different $\mathbf{R}$ matrices.
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Rotating the Factors}
\framesubtitle{Recall $\boldsymbol{\Sigma} = \boldsymbol{\Lambda}\boldsymbol{\Lambda}^\top + \boldsymbol{\Omega} = \boldsymbol{\Lambda} \mathbf{R}^\top\mathbf{R} \boldsymbol{\Lambda}^\top + \boldsymbol{\Omega}$}
Post-multiplication of $\boldsymbol{\Lambda}$ by $\mathbf{R}^\top$ is often called ``rotation of the factors.'' \pause
\begin{eqnarray*}
\mathbf{z} & = & \boldsymbol{\Lambda}\mathbf{F} + \mathbf{e} \pause \\
& = & (\boldsymbol{\Lambda} \mathbf{R}^\top) (\mathbf{R}\mathbf{F}) + \mathbf{e} \pause \\
& = & \boldsymbol{\Lambda}_2 \mathbf{F}^\prime + \mathbf{e}.
\end{eqnarray*} \pause
\begin{itemize}
\item $\mathbf{F}^\prime = \mathbf{RF}$ is a set of \emph{rotated} factors.
\item All rotations of the factors produce the same covariance matrix of the observable data.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Same Explained Variance}
\framesubtitle{When factors are rotated}
\begin{itemize}
\item Communality of variable $i$ is $\sum_{j=1}^p \lambda_{ij}^2$.
\item Add up the squares of the factor loadings in row $i$ of $\boldsymbol{\Lambda}$. \pause
\item This equals the $i$th diagonal element of $\boldsymbol{\Lambda\Lambda}^\top$.
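\item For example, if row $i$ of $\boldsymbol{\Lambda}$ were $(0.8, ~0.3)$, the communality of variable $i$ would be $0.8^2 + 0.3^2 = 0.73$ and the uniqueness would be $0.27$.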
\end{itemize} \pause \begin{eqnarray*} \boldsymbol{\Lambda}_2\boldsymbol{\Lambda}_2^\top & = & (\boldsymbol{\Lambda}\mathbf{R}^\top) (\boldsymbol{\Lambda}\mathbf{R}^\top)^\top \pause \\ & = & \boldsymbol{\Lambda}\mathbf{R}^\top \mathbf{R}\boldsymbol{\Lambda}^\top \pause \\ & = & \boldsymbol{\Lambda}\boldsymbol{\Lambda}^\top. \end{eqnarray*} Ouch. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Strategy} %\framesubtitle{} \begin{enumerate} \item Place some restrictions on the factor loadings, so that the only rotation matrix that preserves the restrictions is the identity matrix. For example, $\lambda_{ij} = 0$ for $j>i$. \item Generally, the restrictions may not make sense in terms of the data. Don't worry about it. \item Estimate the loadings, perhaps by maximum likelihood. \item All (orthogonal) rotations result in the same maximum value of the likelihood function. That is, the maximum is not unique. Again, don't worry about it. \item Pick a rotation that results in a simple pattern in the factor loadings, one that is easy to interpret. \end{enumerate} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Simple Structure} \framesubtitle{Something like this would be nice} {\LARGE \begin{displaymath} \boldsymbol{\Lambda} = \left( \begin{array}{rr} 0.87 & 0.00 \\ -0.95 & 0.03 \\ 0.79 & 0.00 \\ 0.00 & 0.88 \\ 0.01 & 0.75 \\ 0.02 & -0.94 \\ 0.00 & -0.82 \end{array} \right) \end{displaymath} } % End size \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Rotation to Simple Structure} \framesubtitle{Rotation means post-multiply $\boldsymbol{\Lambda}$ by a rotation matrix} \begin{itemize} \item Used to be subjective, and done by hand! \item Now it's objective and done by computer. \pause \item There are various criteria. They are all iterative, taking a number of steps to approach some criterion. \item The most popular rotation method is varimax rotation. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Varimax Rotation} %\framesubtitle{} \begin{itemize} \item The original idea was to maximize the variability of the \emph{squared} loadings in each column. \begin{displaymath} \boldsymbol{\Lambda} = \left( \begin{array}{rr} 0.87 & 0.00 \\ -0.95 & 0.03 \\ 0.79 & 0.00 \\ 0.00 & 0.88 \\ 0.01 & 0.75 \\ 0.02 & -0.94 \\ 0.00 & -0.82 \end{array} \right) \end{displaymath} \pause \item The results weren't great, so they fixed it up, expressing each squared factor loading as a proportion of the communality. \item Note that the criterion depends on the factor loadings only through the $\lambda_{ij}^2$. \pause \item In practice, varimax rotation tends to maximize the squared loading of each observable variable with \emph{just one underlying factor}. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Everybody Loves Varimax} %\framesubtitle{} \begin{itemize} \item Estimate the factor loadings with some crazy restrictions. \item Apply a varimax rotation. \item Interpret the results. \pause \end{itemize} \vspace{5mm} Note that rotation does not affect communalities (explained variance). 
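\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{A Small Illustration of Rotation}
\framesubtitle{Made-up loadings, just to show what rotation does}

With $p=2$ factors, an illustrative $\boldsymbol{\Lambda}$ (not from any data set), and $\theta = 45^\circ$, so that $\cos\theta = \sin\theta \approx 0.707$,
\begin{displaymath}
\boldsymbol{\Lambda} = \left( \begin{array}{rr}
0.6 &  0.6 \\
0.6 &  0.6 \\
0.6 & -0.6 \\
0.6 & -0.6
\end{array} \right)
\hspace{8mm}
\boldsymbol{\Lambda}_2 = \boldsymbol{\Lambda}\mathbf{R}^\top \approx \left( \begin{array}{rr}
0.85 &  0.00 \\
0.85 &  0.00 \\
0.00 & -0.85 \\
0.00 & -0.85
\end{array} \right)
\end{displaymath}
\pause
\begin{itemize}
\item The rotated loadings $\boldsymbol{\Lambda}_2$ have simple structure: each variable loads on just one factor.
\item The communalities are unchanged: $0.6^2 + 0.6^2 = 0.72 \approx 0.85^2$ for every variable.
\end{itemize}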
\end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{The Missing Ingredient: Number of Common Factors} % \framesubtitle{} \begin{itemize} \item Number of common factors is generally not known in advance. This is \emph{exploratory} factor analysis. \pause \item There are \emph{lots} of ideas and suggestions. \begin{itemize} \item At least three variables per factor. \item At least five variables per factor. \item \ldots \end{itemize} \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Wisdom} \framesubtitle{From Kaiser (Mr. Varimax)} \begin{itemize} \item There are probably hundreds of common factors. \item Including them all in the model is out of the question. \item The objective should be to come up with a model that includes the most important ones, and captures the essence of what is going on. \item Simplicity is important. Other things being more or less equal, the fewer factors the better. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Estimating Number of Factors} \framesubtitle{The three most popular ideas?} \begin{itemize} \item Number of eigenvalues (of the sample correlation matrix) greater than one. \item Scree plots. \item Testing. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Scree Plots} %\framesubtitle{} \begin{itemize} \item In geology, ``scree" is the pile of rock and debris often found at the foot of a mountain cliff or volcano. \item Scree slopes tend to be concave up, steepest near the cliff and then tailing off. \pause \item In factor analysis, a scree plot shows the eigenvalues of the correlation matrix, sorted in order of magnitude. \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Scree Plot of the Body-Mind Data} \framesubtitle{See textbook} \begin{columns} \column{0.5\textwidth} \includegraphics[width=2.5in]{screeplot} % This picture is taken from the textbook. See textbook for the R code. \column{0.5\textwidth} \pause \begin{itemize} \item It is very common for the graph to decrease rapidly at first, and then straighten out with a small negative slope for the rest of the way. \item The point at which the linear trend begins is the estimated number of factors. \end{itemize} \end{columns} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Testing} %\framesubtitle{} \begin{itemize} \item If the model is fit by maximum likelihood, carry out the likelihood ratio test for goodness of fit. \item If we really insist that the error terms are independent of the factors and have a diagonal covariance matrix, the only way that the model can be incorrect is that it does not have enough factors. \item Thus, any test for goodness of fit is also a test for number of factors. \pause \item So if a model fails the goodness of fit test, increase the number of factors and try again. 
\item However \ldots
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Can you ever have too much statistical power?}
%\framesubtitle{}
\begin{itemize}
\item In reality, there are probably hundreds of factors.
\item The power of the likelihood ratio test increases with the sample size.
\item For large samples, significant lack of fit may be expected for any model with a modest number of factors.
\item Even if it's a good model. \pause
\item So while formal testing for lack of fit may be useful, one should not rely on it exclusively.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Consulting Advice}
%\framesubtitle{}
\begin{itemize}
\item When a non-statistician claims to have done a ``factor analysis,'' ask what kind.
\item Usually it was a principal components analysis.
\item Principal components are linear combinations of the observed variables. They come from the observed variables by direct calculation.
\item In true factor analysis, it's the observed variables that arise from the factors.
\item So principal components analysis is kind of like backwards factor analysis, though the spirit is similar.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Copyright Information}

This slide show was prepared by \href{http://www.utstat.toronto.edu/~brunner}{Jerry Brunner}, Department of Statistical Sciences, University of Toronto. It is licensed under a
\href{http://creativecommons.org/licenses/by-sa/3.0/deed.en_US}
{Creative Commons Attribution - ShareAlike 3.0 Unported License}. Use any part of it as you like and share the result freely. The \LaTeX~source code is available from the course website:
\href{http://www.utstat.toronto.edu/~brunner/oldclass/431s23}
{\small\texttt{http://www.utstat.toronto.edu/brunner/oldclass/431s23}}

\end{frame}

\end{document}