# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 431s23code.txt: R code from lecture. The only explanation is in the comment statements. # Last revised March 8, 2023 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 2. More Linear Algebra # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # is.matrix(3) # Is the number 3 a 1x1 matrix? treecorr = cor(trees); treecorr is.matrix(treecorr) # Bind rows of a matrix together A = rbind( c(3, 2, 6,8), c(2,10,-7,4), c(6, 6, 9,1) ); A # Transpose t(A) # U = A A' (3x3), V = A' A (4x4) U = A %*% t(A) V = t(A) %*% A; V # U = A A' (3x3), V = A' A (4x4) # So rank(V) cannot exceed 3 and det(V)=0 det(U); det(V) # Recall U = A A' (3x3), V = A' A (4x4) solve(U) solve(V) # Recall U = A A' (3x3), V = A' A (4x4) eigen(U) eigen(V) eigenV = eigen(V) C = eigenV$vectors; D = diag(eigenV$values); D # C is an orthoganal matrix C %*% t(C) V; C %*% D %*% t(C) sqrtV = C %*% sqrt(D) %*% t(C) # Multiply to get V sqrtV %*% sqrtV; V D; sqrt(D) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 3. Random Vectors and Multivariate Normal # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # source("https://www.utstat.toronto.edu/~brunner/openSEM/fun/rmvn.txt") A = rbind(c(1.0,0.5), c(0.5,1.0)) A datta = rmvn(10,mu=c(0,0),sigma=A); datta # rmvn: Simulate from multivariate normal rmvn <- function(nn,mu,sigma) # Returns an nn by kk matrix, rows are independent MVN(mu,sigma) { kk <- length(mu) dsig <- dim(sigma) if(dsig[1] != dsig[2]) stop("Sigma must be square.") if(dsig[1] != kk) stop("Sizes of sigma and mu are inconsistent.") ev <- eigen(sigma) if(min(eigen(sigma)$values) < 0) stop("Sigma must have non-negative eigenvalues.") sqrl <- diag(sqrt(ev$values)) PP <- ev$vectors ZZ <- rnorm(nn*kk) ; dim(ZZ) <- c(kk,nn) out <- t(PP%*%sqrl%*%ZZ+mu) return(out) }# End of function rmvn # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 4. Estimation # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # Gamma MLE rm(list=ls()) # Simulate data set.seed(3201); n = 50; alpha=2; beta=3 d = round(rgamma(50,shape=alpha, scale=beta),2); d mean(d); alpha*beta # Compare var(d); alpha*beta^2 gmll = function(theta,datta) { aa = theta[1]; bb = theta[2] nn = length(datta); sumd = sum(datta) sumlogd = sum(log(datta)) value = nn*aa*log(bb) + nn*lgamma(aa) + sumd/bb - (aa-1)*sumlogd return(value) } # End function gmll # help(optim) # MOM for starting values momalpha = mean(d)^2/var(d); momalpha mombeta = var(d)/mean(d); mombeta # Error message says: "Bounds can only be used with method # L-BFGS-B (or Brent)" gsearch = optim(par=c(momalpha,mombeta), fn = gmll, method = "L-BFGS-B", lower = c(0,0), hessian=TRUE, datta=d) gsearch thetahat = gsearch$par names(thetahat) = c("alpha-hat","beta-hat"); thetahat # Second derivative test eigen(gsearch$hessian)$values ############## Later ############## # Had thetahat = gsearch$par thetahat # Asymptotic variance-covariance matrix is the inverse of the # Fisher Information Vhat_n = solve(gsearch$hessian); Vhat_n # Confidence interval for alpha (true value is 2) thetahat se_alphahat = sqrt(Vhat_n[1,1]) lower95 = thetahat[1] - 1.96*se_alphahat upper95 = thetahat[1] + 1.96*se_alphahat c(lower95,upper95) ######################################################################### # Picture of convergence in probability, using R # Layer 1 rm(list=ls()) x = seq(from=-3,to=3,by=0.01) sigma=1/2; raise=0.25; z=1.3 Density = dnorm(x,mean=0,sd=sigma)+raise plot(x,Density,axes=F,xlab='',ylab='',pch=' ') lines(x,dnorm(x)+raise,lty=2) lines(c(-3,3),c(raise,raise),lty=1) text(x=0,y=raise-0.015,expression(theta)) text(x=-z+.20,y=raise-0.015,expression(paste("(",theta-epsilon))) text(x= z-.20,y=raise-0.015,expression(paste(theta+epsilon,")"))) lines(c(-z,-z),c(raise,raise+dnorm(-z)),lty=2) lines(c(z,z),c(raise,raise+dnorm(z)),lty=2) text(x=0,y=dnorm(0,sd=sigma)+raise,".") # Guide for cropping # Save this picture as convergence1.pdf, and proceed ######################################################################### # Layer 2 lines(x,Density,lty=1) lines(c(-z,-z),c(raise,raise+dnorm(-z,sd=sigma)),lty=1) lines(c(z,z),c(raise,raise+dnorm(z,sd=sigma)),lty=1) # Save as convergence2.pdf # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 5. Testing Hypotheses # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h # Tn is estimated theta, usually a vector. # Vn is the estimated asymptotic covariance matrix of Tn. # For Wald tests based on numerical MLEs, Tn = theta-hat, # and Vn is the inverse of the Hessian of the minus log # likelihood. { Wtest = numeric(3) names(Wtest) = c("W","df","p-value") r = dim(L)[1] W = t(L%*%Tn-h) %*% solve(L%*%Vn%*%t(L)) %*% (L%*%Tn-h) W = as.numeric(W) pval = 1-pchisq(W,r) Wtest[1] = W; Wtest[2] = r; Wtest[3] = pval Wtest } # End function Wtest # Plotting the parameter space pdf(file = "ParameterSpace.pdf") # Plot the parameter space x = seq(from=.01,to=5,length=20); y=x plot(x,y,type='l',xlab=expression(alpha), ylab=expression(beta)) title('Parameter Space') dev.off() ############ Saved in working directory ############ pdf(file = "MLEplot.pdf") # Plot the MLEs x = seq(from=.01,to=5,length=20); y=x plot(x,y,type='l',xlab=expression(alpha), ylab=expression(beta)) points(thetahat[1],thetahat[2], pch=19, col = "red1") # Unrestricted MLE points(alphahat0,alphahat0, pch=23, bg="black") # Restricted MLE title('Red point at unrestricted MLE, Black point at restricted MLE') dev.off() ############ Saved in working directory ############ # Gamma: testing H0: alpha = beta rm(list=ls()) # Simulate data set.seed(3201); n = 50; alpha=2; beta=3 d = round(rgamma(50,shape=alpha, scale=beta),2); d gmll = function(theta,datta) { aa = theta[1]; bb = theta[2] nn = length(datta); sumd = sum(datta) sumlogd = sum(log(datta)) value = nn*aa*log(bb) + nn*lgamma(aa) + sumd/bb - (aa-1)*sumlogd return(value) } # End function gmll # MOM for starting values momalpha = mean(d)^2/var(d); momalpha mombeta = var(d)/mean(d); mombeta gsearch = optim(par=c(momalpha,mombeta), fn = gmll, method = "L-BFGS-B", lower = c(0,0), hessian=TRUE, datta=d) gsearch thetahat = gsearch$par names(thetahat) = c("alpha-hat","beta-hat"); thetahat Vhat_n = solve(gsearch$hessian) # Asymptotic Covariance matrix of thetahat # gmll0 is minus LL gamma log likelihood with alpha=beta gmll0 = function(alpha,datta) gmll(c(alpha,alpha),datta) gsearch0 = optim(par=mean(thetahat), fn = gmll0, method = "L-BFGS-B", lower = 0, datta=d) gsearch0 alphahat0 = gsearch0$par; alphahat0 Gsq = 2*(gsearch0$value-gsearch$value); Gsq pval = 1-pchisq(Gsq,df=1); pval # 0.03861777 # Wald test for comparison source("https://www.utstat.toronto.edu/brunner/openSEM/fun/Wtest.txt") LL = rbind(c(1,-1)) Wtest(LL,Tn=thetahat,Vn=Vhat_n) # Z-test of H0: beta = 3 se = sqrt(Vhat_n[2,2]) # Assignning names because otherwise everything is labelled "betahat" z = (thetahat[2]-3)/se; names(z) = "Z statistic"; z pval = 2*(1-pnorm(abs(z))); names(pval) = "p-value"; pval # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 6. Regression with Random Explanatory Variables # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # ################## Simulate multivariate regression ################## rm(list=ls()) # Remove everything # Set parameter values # Regression coefficients beta10 = 1; beta11 = 1; beta12 = 0; beta13 = 3 beta20 = 2; beta21 = 0; beta22 = 4; beta23 = 0 # Expected values of x variables mux = c(10,20,10) # Variance-covariance matrix of x variables Phi = rbind(c(25, 25, 15), c(25, 100, 35), c(15, 35, 25)) # Variance-covariance matrix of error terms Psi = rbind(c(500, 750), c(750, 2000)) # Simulate data source("https://www.utstat.toronto.edu/~brunner/openSEM/fun/rmvn.txt") set.seed(9999) # First an experiment x = rmvn(nn=100000, mu=mux, sigma=Phi) dim(x) head(x) apply(x,MARGIN=2,FUN=mean) # Column sample means mux # Population means, for comparison var(x) # Sample variance-covariance matrix with n-1 Phi # Population variance-covariance matrix, for comparison # Now simulate data from the model n = 500 x = rmvn(nn=n, mu=mux, sigma=Phi) epsilon = rmvn(nn=n, mu=c(0,0), sigma=Psi) # Extract variables (for clarity) x1 = x[,1]; x2 = x[,2]; x3 = x[,3] epsilon1 = epsilon[,1]; epsilon2 = epsilon[,2] # Generate y y1 = beta10 + beta11*x1 + beta12*x2 + beta13*x3 + epsilon1 y2 = beta20 + beta21*x1 + beta22*x2 + beta23*x3 + epsilon2 length(y1) # Calculate MOM estimate of beta1 (the slopes) y = cbind(y1,y2) Sigmahat_x = var(x) * (n-1)/n Sigmahat_xy = var(x,y) * (n-1)/n beta1hat = t(Sigmahat_xy) %*% solve(Sigmahat_x) round(beta1hat,3) # Now true beta1 for comparison beta1 = rbind(c(beta11, beta12, beta13), c(beta21, beta22, beta23)) beta1 # MOM = Least squares # MOM estimate of slopes round(beta1hat,3) # Least squares estimate LSbetahat = lsfit(x,y)$coefficients #$ t(round( LSbetahat ,3)) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 7. Omitted Variables # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # The age by strength plot omitting human versus chimp. # Simpson's paradox rm(list=ls()) source("https://www.utstat.toronto.edu/~brunner/openSEM/fun/rmvn.txt") set.seed(9999) n = 100 # Each S0 = rbind(c(1.00,9.60), c(9.60,144.0)) chimps = rmvn(n,c(5.5,300),S0) humans = rmvn(n,c(17.5,100),S0) Age = c(chimps[,1],humans[,1]) Strength = c(chimps[,2],humans[,2]) plot(Age,Strength,pch=' ') title("Age and Strength") points(chimps,pch='*'); points(humans,pch='o') # summary(lm(Strength~Age)) x = c(2,20); y = 383.8795 - 16.1797 * x lines(x,y) text(9,300,"Chimps",font=2); text(14,100,"Humans",font=2) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 8. Credit Card Data with lavaan (Instrumental Variables) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # Income and credit card debt # For a sample of real estate agents in different towns and cities, record # HomePrice: Median price of resale home, in 100k units # Income: Income last year after taxes, in thousands # CardDebt: Credit card balance carried forward, in thousands rm(list=ls()) cards = read.table("https://www.utstat.toronto.edu/~brunner/openSEM/data/cards2.data.txt") # Explore the data head(cards); dim(cards) summary(cards) cor(cards) with(cards, hist(HomePrice) ) with(cards, hist(Income) ) with(cards, hist(CardDebt) ) pairs(cards) # Generate the graphics directly jpeg(file = 'hist1.jpg'); with(cards, hist(HomePrice) ); dev.off() jpeg(file = 'hist2.jpg'); with(cards, hist(Income) ); dev.off() jpeg(file = 'hist3.jpg'); with(cards, hist(CardDebt) ); dev.off() jpeg(file = 'pairs.jpg'); with(cards, pairs(cards) ); dev.off() # Ordinary regression summary(lm(CardDebt ~ Income, data=cards)) # "Control" for median house price summary(lm(CardDebt ~ Income + HomePrice, data=cards)) # Now lavaan # install.packages("lavaan", dependencies = TRUE) # Only need to do this once library(lavaan) # Ordinary regression with random X mod0 = 'CardDebt ~ beta0*1 + beta1*Income Income ~~ Phi*Income # Var(Income) = phi Income ~ mu*1 # E(Income) = mu CardDebt ~~ psi*CardDebt # Var(epsilon) = psi ' # End of model string fit0 = lavaan(mod0, data=cards) summary(fit0) summary(lm(CardDebt ~ Income, data=cards)) 1.883^2 # MSE from lm c( mean(cards$Income), var(cards$Income) ) # Instrumental variables model (Called Model Three in lecture) mod1 = 'CardDebt ~ beta0*1 + beta1*Income Income ~ mux*1 # E(Income) = mux HomePrice ~ muz*1 # E(HomePrice) = muz Income ~~ phix*Income # Var(Income) = phix HomePrice ~~ phiz*HomePrice # Var(HomePrice) = phiz Income ~~ kappa*HomePrice # Cov(Income,HomePrice) = kappa CardDebt ~~ psi*CardDebt # Var(epsilon) = psi Income ~~ c*CardDebt # Cov(Income,epsilon) = c ' # End of model string fit1 = lavaan(mod1, data=cards) summary(fit1) # Checking explicit formula for the MLE. # (x,y,z) = (Income, CardDebt, HomePrice) # LaTeX # \widehat{\boldsymbol{\beta}}_1 = \widehat{\boldsymbol{\Sigma}}_{23} # \widehat{\boldsymbol{\Sigma}}_{13}^{-1} with(cards, var(CardDebt,HomePrice) / var(Income,HomePrice) ) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 10. Simulation of Measurement Error in the Explanatory Variables # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # Simulate regression with measurement error in x variables source("https://www.utstat.toronto.edu/~brunner/openSEM/fun/rmvn.txt") n = 500 # Sample size # Regression coefficients beta0 = 1; beta1 = 1; beta2 = 0 # Expected values of x variables mux = c(10,20) # Variance-covariance matrix of x variables: Correlation = 0.75 Phi = rbind(c(20, 30), c(30, 80)) # Variance of epsilon psi = 50 # Variances of measurement error terms: Both reliabilities = 0.80 omega1 = 5; omega2 = 20 # Simulate data set.seed(9999) X = rmvn(n,mux,Phi); head(X) x1 = X[,1]; x2 = X[,2] epsilon = rnorm(n,0,sqrt(psi)) e1 = rnorm(n,0,sqrt(omega1)); e2 = rnorm(n,0,sqrt(omega2)) y = beta0 + beta1*x1 + beta2*x2 + epsilon w1 = x1 + e1 w2 = x2 + e2 d = cbind(w1,w2,y); cor(d) mod = lm(y ~ w1 + w2) summary(mod) summary(mod)$coefficients[3,4] onesim = function() { X = rmvn(n,mux,Phi); head(X) x1 = X[,1]; x2 = X[,2] epsilon = rnorm(n,0,sqrt(psi)) e1 = rnorm(n,0,sqrt(omega1)); e2 = rnorm(n,0,sqrt(omega2)) y = beta0 + beta1*x1 + beta2*x2 + epsilon w1 = x1 + e1 w2 = x2 + e2 mod = lm(y ~ w1 + w2) return(summary(mod)$coefficients[3,4]) } # End of onesim onesim() ######### mereg <- function(beta0=1, beta1=1, beta2=0, sigmasq = 0.5, mu1=0, mu2=0, phi11=1, phi22=1, phi12 = 0.80, rel1=0.80, rel2=0.80, n=200) #################################################################### # Model is Y = beta0 + beta1 X1 + beta2 X2 + epsilon # W1 = X1 + e1 # W2 = W2 + e2 # Fit naive model # Y = beta0 + beta1 W1 + beta2 W2 + epsilon # Inputs are # # beta0, beta1 beta2 True regression coefficients # sigmasq Var(epsilon) # mu1 E(X1) # mu2 E(X2) # phi11 Var(X1) # phi22 Var(X2) # phi12 Cov(X1,X2) = Corr(X1,X1), because # Var(X1) = Var(X2) = 1 # rel1 Reliability of W1 # rel2 Reliability of W2 # n Sample size # Note: This function uses rmvn, a multivariate normal random number # generator I wrote. The rmultnorm of the package MSBVAR does # the same thing but I am having trouble installing it. #################################################################### { # Calculate SD(e1) and SD(e2) sd1 <- sqrt((phi11-rel1)/rel1) sd2 <- sqrt((phi22-rel2)/rel2) # Random number generation epsilon <- rnorm(n,mean=0,sd=sqrt(sigmasq)) e1 <- rnorm(n,mean=0,sd=sd1) e2 <- rnorm(n,mean=0,sd=sd2) # X1 and X2 are bivariate normal. Need rmvn function. Phi <- rbind(c(phi11,phi12), c(phi12,phi22)) X <- rmvn(n, mu=c(mu1,mu2), sigma=Phi) # nx2 matrix X1 <- X[,1]; X2 <- X[,2] # Now generate Y, W1 and W2 Y = beta0 + beta1*X1 + beta2*X2 + epsilon W1 = X1 + e1 W2 = X2 + e2 # Fit the naive model mereg <- summary(lm(Y~W1+W2))$coefficients mereg # Returns table of beta-hats, SEs, t-statistics and p-values } # End function mereg # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 13. Little double measurement regression with lavaan # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # rm(list=ls()); options(scipen=999) # install.packages("lavaan", dependencies = TRUE) # Only need to do this once library(lavaan) babydouble = read.table("http://www.utstat.toronto.edu/~brunner/openSEM/data/Babydouble.data.txt") head(babydouble) dim(babydouble) summary(babydouble) cor(babydouble) dmodel1 = 'Y ~ beta1*X # Latent variable model (even though Y is observed) X =~ 1*W1 + 1*W2 # Measurement model # Variances (covariances would go here too) X~~phi*X # Var(X) = phi Y~~psi*Y # Var(epsilon) = psi W1~~omega1*W1 # Var(e1) = omega1 W2~~omega2*W2 # Var(e2) = omega2 ' dfit1 = lavaan(dmodel1, data=babydouble) summary(dfit1) parameterEstimates(dfit1) parTable(dfit1) coef(dfit1) # A vector of MLEs fitted(dfit1) # Sigma(thetahat) and mu(thetahat) vcov(dfit1) logLik(dfit1) # Fit a restricted model (restricted by H0) dfit1r = lavaan(dmodel1, data=babydouble, constraints = 'omega1==omega2') anova(dfit1r,dfit1) # Put multiple constraints on separate lines, like this. dfit1r2 = lavaan(dmodel1, data=babydouble, constraints = 'omega1==omega2 phi==1') anova(dfit1r2,dfit1) # For Wald tests: Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h source("http://www.utstat.utoronto.ca/~brunner/Rfunctions/Wtest.txt") LL = cbind(0,0,0,1,-1); LL Wtest(LL,coef(dfit1),vcov(dfit1)) # Non-linear functions of the parameters with := dmodel1b = 'Y ~ beta1*X # Latent variable model X =~ 1*W1 + 1*W2 # Measurement model # Variances (covariances would go here too) X~~phi*X # Var(X) = phi Y~~psi*Y # Var(epsilon) = psi W1~~omega1*W1 # Var(e1) = omega1 W2~~omega2*W2 # Var(e2) = omega2 diff := omega1-omega2 rel1 := phi/(omega1+phi) ' dfit1b = lavaan(dmodel1b, data=babydouble) parameterEstimates(dfit1b) sqrt(0.01918586) # Compare Z statistic for H0: omega1=omega2 # And one attempt to fit a non-identified model dmodel0 = 'Y ~ beta1*X # Latent variable model (even though Y is observed) X =~ 1*W1 # Measurement model # Variances (covariances would go here too) X~~phi*X # Var(X) = phi Y~~psi*Y # Var(epsilon) = psi W1~~omega1*W1 # Var(e1) = omega1 ' dfit0 = lavaan(dmodel0, data=babydouble) summary(dfit0) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # 14. Bootstrap # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # Testing differences between error variances, to see # if we have equivalent measurements. rm(list=ls()) # Parameter values and sample size phi = 7; omega1 = 3; omega2 = 3 rel1 = round(phi/(phi+omega1),3); rel2 = round(phi/(phi+omega2),3) c(rel1,rel2) # Reliabilities n = 1500 # Simulate from t distribution -- heavy tails # Var(t) = nu/(nu-2) set.seed(9999) x = sqrt(phi) * rt(n,3)/sqrt(3) e1 = sqrt(omega1) * rt(n,3)/sqrt(3); e2 = sqrt(omega2) * rt(n,3)/sqrt(3) w1 = x + e1; w2 = x + e2 ww = cbind(w1,w2) vcovW = var(ww) * (n-1)/n; vcovW # Divide by n to get MLEs # install.packages("lavaan", dependencies = TRUE) # Only need to do this once library(lavaan) # Normal theory with lavaan mod = "x =~ 1.0*w1 + 1.0*w2 x ~~ phi*x; w1 ~~ omega1*w1; w2 ~~ omega2*w2 vardiff := omega1-omega2 " fit = lavaan(mod, data=ww) # summary(fit) parameterEstimates(fit) thetahat = coef(fit); thetahat # Bootstrap the "hard" way # n = dim(ww)[1] is not needed jar = 1:n; B = 1000 tstar = matrix(NA,B,3) # Rows will hold theta-hat values colnames(tstar) = names(coef(fit)) for(j in 1:B) { rowz = sample(jar,size=n,replace=TRUE) xstar = ww[rowz,] fitstar = lavaan(mod, data=xstar) tstar[j,] = coef(fitstar) } # Next bootstrap sample head(tstar) vdiff = tstar[,2] - tstar[,3] # Vector of omega1hat - omega2hat values hist(vdiff) shapiro.test(vdiff) # Test of normality var(vdiff) bootse = sqrt(var(vdiff)) bootse # Compare normal theory estimate of 0.364 z = (thetahat[2]-thetahat[3])/bootse; z # Compare z = 2.124 # Now bootstrap with lavaan: The easy way fitB = lavaan(mod, data=ww, se = "bootstrap") parameterEstimates(fitB) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 15. The BMI Health Study with lavaan # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # rm(list=ls()); options(scipen=999) bmidata = read.table("http://www.utstat.toronto.edu/~brunner/openSEM/data/bmi.data.txt") head(bmidata) dim(bmidata) ################################### # Naive surface regression # ################################### bmi = within(bmidata,{ age = (age1+age2)/2; bmi = (bmi1+bmi2)/2; fat = (fat1+fat2)/2 cholest = (cholest1+cholest2)/2; diastol = (diastol1+diastol2)/2 }) dim(bmi) fullmod = lm( cbind(cholest,diastol) ~ age + fat + bmi, data=bmi) summary(fullmod) restrictedmod = update(fullmod, . ~ . - bmi) # Remove var(s) being tested anova(fullmod,restrictedmod) # Gives multivariate test. ################################### # Structural equation model # ################################### # install.packages("lavaan", dependencies = TRUE) # Only need to do this once library(lavaan) bmimodel1 = '######################################################## # Latent variable model # --------------------- Lcholest ~ beta11*Lage + beta12*Lbmi + beta13*Lfat Ldiastol ~ beta21*Lage + beta22*Lbmi + beta23*Lfat # # Measurement model # ----------------- Lage =~ 1*age1 + 1*age2 Lbmi =~ 1*bmi1 + 1*bmi2 Lfat =~ 1*fat1 +1*fat2 Lcholest =~ 1*cholest1 + 1*cholest2 Ldiastol =~ 1*diastol1 + 1*diastol2 # # Variances and covariances # ------------------------- # Of latent explanatory variables Lage ~~ phi11*Lage; Lage ~~ phi12*Lbmi; Lage ~~ phi13*Lfat Lbmi ~~ phi22*Lbmi; Lbmi ~~ phi23*Lfat Lfat ~~ phi33*Lfat # Of error terms in latent the regression (epsilon_ij) Lcholest ~~ psi11*Lcholest; Lcholest ~~ psi12*Ldiastol Ldiastol ~~ psi22*Ldiastol # Of measurement errors (e_ijk) for measurement set 1 age1 ~~ w111*age1; age1 ~~ w112*bmi1; age1 ~~ w113*fat1; age1 ~~ w114*cholest1; age1 ~~ w115*diastol1 bmi1 ~~ w122*bmi1; bmi1 ~~ w123*fat1; bmi1 ~~ w124*cholest1; bmi1 ~~ w125*diastol1 fat1 ~~ w133*fat1; fat1 ~~ w134*cholest1; fat1 ~~ w135*diastol1 cholest1 ~~ w144*cholest1; cholest1 ~~ w145*diastol1 diastol1 ~~ w155*diastol1 # Of measurement errors (e_ijk) for measurement set 2 age2 ~~ w211*age2; age2 ~~ w212*bmi2; age2 ~~ w213*fat2; age2 ~~ w214*cholest2; age2 ~~ w215*diastol2 bmi2 ~~ w222*bmi2; bmi2 ~~ w223*fat2; bmi2 ~~ w224*cholest2; bmi2 ~~ w225*diastol2 fat2 ~~ w233*fat2; fat2 ~~ w234*cholest2; fat2 ~~ w235*diastol2 cholest2 ~~ w244*cholest2; cholest2 ~~ w245*diastol2 diastol2 ~~ w255*diastol2 ' ################# End of bmimodel1 ################# fit1 = lavaan(bmimodel1, data=bmidata) # Lots of warnings, 4241 iterations summary(fit1) p1 = parTable(fit1); p1 fit1B = lavaan(bmimodel1, data=bmidata, start = 'Mplus') # Same warnings, 4241 iterations again # Start at Method of Moment estimates. head(bmidata) W1 = as.matrix(bmidata[,1:3]) # age1 bmi1 fat1 V1 = as.matrix(bmidata[,4:5]) # cholest1 diastol1 W2 = as.matrix(bmidata[,6:8]) # age2 bmi2 fat2 V2 = as.matrix(bmidata[,9:10]) # cholest2 diastol2 var(W1,W2) # Matrix of sample covariances # Using S as short for Sigmahat, and not worrying about n vs. n-1, S11 = var(W1); S12 = var(W1,V1); S13 = var(W1,W2); S14 = var(W1,V2) S22 = var(V1); S23 = var(V1,W2); S24 = var(V1,V2) S33 = var(W2); S34 = var(W2,V2) S44 = var(V2) # The matrices below should all have "hat" in the name, because they are estimates Phi = (S13+t(S13))/2 rownames(Phi) = colnames(Phi) = c('Lage','Lbmi','Lfat'); Phi # To my surprise, these are quite close to the MLEs from the first run. Beta = 0.5*(t(S14)+S23) %*% solve(Phi) rownames(Beta) = c('Lcholest','Ldiastol') colnames(Beta) = c('Lage','Lbmi','Lfat'); Beta # These are miles away from the supposed MLEs # Can just say some of the rest are close and others are not. Psi = S24 - Beta %*% Phi %*% t(Beta) rownames(Psi) = colnames(Psi) = c('Lcholest','Ldiastol') # epsilon1, epsilon2 Psi # Oops, it should be symmetric. Psi = ( Psi+t(Psi) )/2; Psi # Again, far away. Omega11 = S11 - Phi; Omega11 # Supposed MLEs are pretty close here. Omega12 = S12 - ( S14+t(S23) )/2; Omega12 # Not too bad Omega22 = S22-S24 # A little rough but consistent Omega22 = (Omega22 + t(Omega22) )/2 Omega22 # Variances okay, covariance off. Omega33 = S33 - Phi; Omega33 # Not too bad Omega34 = S34 - ( S14+t(S23) )/2; Omega34 # Not too bad Omega44 = S44 - S24 ; Omega44 = ( Omega44 + t(Omega44) )/2 Omega44 # Not terrible # Carefully assemble the MOM estimates into a vector, same order as p1 = parTable(fit1) mom = c(Beta[1,1], Beta[1,2], Beta[1,3], Beta[2,1], Beta[2,2], Beta[2,3], 1,1,1,1,1,1,1,1,1,1, Phi[1,1], Phi[1,2], Phi[1,3], Phi[2,2], Phi[2,3], Phi[3,3], Psi[1,1], Psi[1,2], Psi[2,2], Omega11[1,1], Omega11[1,2], Omega11[1,3], Omega12[1,1], Omega12[1,2], Omega11[2,2], Omega11[2,3], Omega12[2,1], Omega12[2,2], Omega11[3,3], Omega12[3,1], Omega12[3,2], Omega22[1,1], Omega22[1,2], Omega22[2,2], Omega33[1,1], Omega33[1,2], Omega33[1,3], Omega34[1,1], Omega34[1,2], Omega33[2,2], Omega33[2,3], Omega34[2,1], Omega34[2,2], Omega33[3,3], Omega34[3,1], Omega34[3,2], Omega44[1,1], Omega44[1,2], Omega44[2,2] ) length(mom) p1mom = p1; p1mom[,14] = mom # Replace column 14 (est) with mom fit2 = lavaan(bmimodel1, data=bmidata, start = p1mom); fit2 summary(fit2) parTable(fit2) # Same results, a LOT less mess and effort. This is a notable success. # Exactly on the money; Chi-squared = 4.6537. It took 242 iterations with these very good starting values. SAS took 5 with its automatic starting values. # Now a LR test of BMI, H0: beta12 = beta22 = 0 # Will the constraint conflict with the starting values? nobmi = lavaan(bmimodel1, data=bmidata, start = p1mom, constraints = 'beta12 == 0 beta22 == 0') anova(nobmi,fit2) # Two follow-up issues # Shorter syntax # Bootstrap cat(bmimodel1) bmimodel2 = '######################################################## # Latent variable model # --------------------- Lcholest ~ beta11*Lage + beta12*Lbmi + beta13*Lfat Ldiastol ~ beta21*Lage + beta22*Lbmi + beta23*Lfat # # Measurement model # ----------------- Lage =~ 1*age1 + 1*age2; Lbmi =~ 1*bmi1 + 1*bmi2; Lfat =~ 1*fat1 +1*fat2 Lcholest =~ 1*cholest1 + 1*cholest2; Ldiastol =~ 1*diastol1 + 1*diastol2 # # Variances and covariances DO NOT NEED NAMES! # ------------------------- # Of latent explanatory variables Lage ~~ Lage + Lbmi + Lfat Lbmi ~~ Lbmi + Lfat Lfat ~~ Lfat # Of error terms in the latent regression Lcholest ~~ Lcholest + Ldiastol Ldiastol ~~ Ldiastol # Of measurement errors for measurement set 1 age1 ~~ age1 + bmi1 + fat1 + cholest1 + diastol1 bmi1 ~~ bmi1 + fat1 + cholest1 + diastol1 fat1 ~~ fat1 + cholest1 + diastol1 cholest1 ~~ cholest1 + diastol1 diastol1 ~~ diastol1 # Of measurement errors for measurement set 2 age2 ~~ age2 + bmi2 + fat2 + cholest2 + diastol2 bmi2 ~~ bmi2 + fat2 + cholest2 + diastol2 fat2 ~~ fat2 + cholest2 + diastol2 cholest2 ~~ cholest2 + diastol2 diastol2 ~~ diastol2 ' ################# End of bmimodel2 ################# fit3 = lavaan(bmimodel2, data=bmidata, start = fit2); fit3 summary(fit3) summary(fit2) # for comparison head(parTable(fit2)) head(parTable(fit3)) # Even shorter syntax: Do we really need the beta symbols? bmimodel3 = '#### Regressions #### Lcholest ~ Lage + Lbmi + Lfat Ldiastol ~ Lage + Lbmi + Lfat #### Measurement model #### Lage =~ 1*age1 + 1*age2; Lbmi =~ 1*bmi1 + 1*bmi2; Lfat =~ 1*fat1 +1*fat2 Lcholest =~ 1*cholest1 + 1*cholest2; Ldiastol =~ 1*diastol1 + 1*diastol2 #### Variances and covariances #### Lage ~~ Lage + Lbmi + Lfat Lbmi ~~ Lbmi + Lfat Lfat ~~ Lfat Lcholest ~~ Lcholest + Ldiastol Ldiastol ~~ Ldiastol age1 ~~ age1 + bmi1 + fat1 + cholest1 + diastol1 bmi1 ~~ bmi1 + fat1 + cholest1 + diastol1 fat1 ~~ fat1 + cholest1 + diastol1 cholest1 ~~ cholest1 + diastol1 diastol1 ~~ diastol1 age2 ~~ age2 + bmi2 + fat2 + cholest2 + diastol2 bmi2 ~~ bmi2 + fat2 + cholest2 + diastol2 fat2 ~~ fat2 + cholest2 + diastol2 cholest2 ~~ cholest2 + diastol2 diastol2 ~~ diastol2 ' ################# End of bmimodel3 ################# fit4 = lavaan(bmimodel3, data=bmidata, start = fit2); fit4 summary(fit4) # Bootstrap fit3B = lavaan(bmimodel2, data=bmidata, start = fit2, se = "bootstrap") # Waited between 2 and 3 minutes summary(fit3B) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 18. Brand Awareness with lavaan # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # rm(list=ls()); options(scipen=999) # install.packages("lavaan", dependencies = TRUE) # Only need to do this once library(lavaan) coffee = read.table("http://www.utstat.toronto.edu/brunner/openSEM/data/timmy1.data.txt") head(coffee) # Observed variables # w1 = Brand Awareness 1 # w2 = Brand Awareness 2 # w3 = Ad Awareness 1 # w4 = Ad Awareness 2 # w5 = Interest 1 # w6 = Interest 2 # v1 = Purchase Intention 1 # v2 = Purchase Intention 2 # v3 = Purchase Behaviour 1 # v4 = Purchase Behaviour 2 # Latent variables # L_BrAw = True brand awareness # L_AdAw = True advertising awareness # L_Inter = True interest in the product category # L_PI = True purchase intention # L_PBeh = True purchase behaviour torus1 = ' # Latent variable model L_PI ~ gamma1*L_BrAw + gamma2*L_AdAw + gamma3*L_Inter L_PBeh ~ gamma4*L_Inter + beta*L_PI # Measurement model (simple double measurement) L_BrAw =~ 1*w1 + 1*w2 L_AdAw =~ 1*w3 + 1*w4 L_Inter =~ 1*w5 + 1*w6 L_PI =~ 1*v1 + 1*v2 L_PBeh =~ 1*v3 + 1*v4 # Variances and covariances # Exogenous latent variables L_BrAw ~~ phi11*L_BrAw # Var(L_BrAw) = phi11 L_BrAw ~~ phi12*L_AdAw # Cov(L_BrAw,L_AdAw) = phi12 L_BrAw ~~ phi13*L_Inter # Cov(L_BrAw,L_Inter) = phi13 L_AdAw ~~ phi22*L_AdAw # Var(L_AdAw) = phi22 L_AdAw ~~ phi23*L_Inter # Cov(L_AdAw,L_Inter) = phi23 L_Inter ~~ phi33*L_Inter # Var(L_Inter) = phi33 # Errors in the latent model (epsilons) L_PI ~~ psi1*L_PI # Var(epsilon1) = psi1 L_PBeh ~~ psi2*L_PBeh # Var(epsilon2) = psi2 # Measurement errors w1 ~~ omega1*w1 # Var(e1) = omega1 w2 ~~ omega2*w2 # Var(e2) = omega2 w3 ~~ omega3*w3 # Var(e3) = omega3 w4 ~~ omega4*w4 # Var(e4) = omega4 w5 ~~ omega5*w5 # Var(e5) = omega5 w6 ~~ omega6*w6 # Var(e6) = omega6 v1 ~~ omega7*v1 # Var(e7) = omega7 v2 ~~ omega8*v2 # Var(e8) = omega8 v3 ~~ omega9*v3 # Var(e9) = omega9 v4 ~~ omega10*v4 # Var(e10) = omega10 # Bounds (Variances are positive) phi11 > 0; phi22 > 0; phi33 > 0 psi1 > 0; psi2 > 0 omega1 > 0; omega2 > 0; omega3 > 0; omega4 > 0; omega5 > 0 omega6 > 0; omega7 > 0; omega8 > 0; omega9 > 0; omega10 > 0 ' # End of model torus1 fit1 = lavaan(torus1, data=coffee) show(fit1) # It did not fit, and matched SAS. # Split the problem up into parts. Look first at the measurement model. torus2 = ' # Measurement model (still simple double measurement) L_BrAw =~ 1*w1 + 1*w2 L_AdAw =~ 1*w3 + 1*w4 L_Inter =~ 1*w5 + 1*w6 L_PI =~ 1*v1 + 1*v2 L_PBeh =~ 1*v3 + 1*v4 # Variances and covariances # Latent variables L_BrAw ~~ phi11*L_BrAw # Var(L_BrAw) = phi11 L_BrAw ~~ phi12*L_AdAw # Cov(L_BrAw, L_AdAw) = phi12 L_BrAw ~~ phi13*L_Inter # Cov(L_BrAw, L_Inter) = phi13 L_BrAw ~~ phi14*L_PI # Cov(L_BrAw, L_PI) = phi14 L_BrAw ~~ phi15*L_PBeh # Cov(L_BrAw, L_PBeh) = phi15 L_AdAw ~~ phi22*L_AdAw # Var(L_AdAw) = phi22 L_AdAw ~~ phi23*L_Inter # Cov(L_AdAw, L_Inter) = phi23 L_AdAw ~~ phi24*L_PI # Cov(L_AdAw, L_PI) = phi24 L_AdAw ~~ phi25*L_PBeh # Cov(L_AdAw, L_PBeh) = phi25 L_Inter ~~ phi33*L_Inter # Var(L_Inter) = phi33 L_Inter ~~ phi34*L_PI # Cov(L_Inter, L_PI) = phi34 L_Inter ~~ phi35*L_PBeh # Cov(L_Inter, L_PBeh) = phi35 L_PI ~~ phi44*L_PI # Var(L_PI) = phi44 L_PI ~~ phi45*L_PBeh # Cov(L_PI, L_PBeh) = phi45 L_PBeh ~~ phi55*L_PBeh # Var(L_PBeh) = phi55 # Measurement errors w1 ~~ omega1*w1 # Var(e1) = omega1 w2 ~~ omega2*w2 # Var(e2) = omega2 w3 ~~ omega3*w3 # Var(e3) = omega3 w4 ~~ omega4*w4 # Var(e4) = omega4 w5 ~~ omega5*w5 # Var(e5) = omega5 w6 ~~ omega6*w6 # Var(e6) = omega6 v1 ~~ omega7*v1 # Var(e7) = omega7 v2 ~~ omega8*v2 # Var(e8) = omega8 v3 ~~ omega9*v3 # Var(e9) = omega9 v4 ~~ omega10*v4 # Var(e10) = omega10 # Bounds (Variances are positive) phi11 > 0; phi22 > 0; phi33 > 0; phi44 > 0; phi55 > 0 omega1 > 0; omega2 > 0; omega3 > 0; omega4 > 0; omega5 > 0 omega6 > 0; omega7 > 0; omega8 > 0; omega9 > 0; omega10 > 0 ' # End of model torus2 fit2 = lavaan(torus2, data=coffee) show(fit2) # Now try to specify the model more briefly, without naming all the variances and covariances of latent variables. By default, the cfa function makes the loading of the first indicator = 1, and all latent variables are correlated. "If the argument std.lv=TRUE is used, the factor loadings of the first indicator of each latent variable will no longer be fixed to 1." torus2b = ' # Measurement model (still simple double measurement) L_BrAw =~ 1*w1 + 1*w2 L_AdAw =~ 1*w3 + 1*w4 L_Inter =~ 1*w5 + 1*w6 L_PI =~ 1*v1 + 1*v2 L_PBeh =~ 1*v3 + 1*v4 # Leave off everything else and see what happens. ' # End of model torus2b fit2b = cfa(torus2b, data=coffee) show(fit2) show(fit2b) # The measurement model does not fit. Try a true double measurement model, allowing covariances within sets. torus3 = ' # Measurement model (still simple double measurement) L_BrAw =~ 1*w1 + 1*w2 L_AdAw =~ 1*w3 + 1*w4 L_Inter =~ 1*w5 + 1*w6 L_PI =~ 1*v1 + 1*v2 L_PBeh =~ 1*v3 + 1*v4 # Add covariances between measurement error terms, without naming them w1 ~~ w3; w1 ~~ w5; w1 ~~ v1; w1 ~~ v3 w3 ~~ w5; w3 ~~ v1; w3 ~~ v3 w5 ~~ v1; w5 ~~ v3 v1 ~~ v3 w2 ~~ w4; w2 ~~ w6; w2 ~~ v2; w2 ~~ v4 w4 ~~ w6; w4 ~~ v2; w4 ~~ v4 w6 ~~ v2; w6 ~~ v4 v2 ~~ v4 ' # End of model torus3 fit3 = cfa(torus3, data=coffee) lavInspect(fit3, "theta") # eigen(lavInspect(fit3, "theta"))$values # Checking why torus3 left the parameter space. # Obtain MOM estimates for use as starting values. d1 = as.matrix(coffee[,c(1,3,5,7,9)]) # Measurement set one d2 = as.matrix(coffee[,c(2,4,6,8,10)]) # Measurement set two Phi_hat = cov(d1,d2); Phi_hat # Make it symmetric Phi_hat = (Phi_hat + t(Phi_hat) )/2; Phi_hat eigen(Phi_hat)$values # Is it positive definite? Omega1_hat = cov(d1) - Phi_hat Omega2_hat = cov(d2) - Phi_hat eigen(Omega1_hat)$values # Is Omega1_hat positive definite? eigen(Omega2_hat)$values # Is Omega2_hat positive definite? anova(fit2,fit3) torus4 = ' # Measurement model L_BrAw =~ 1*w1 + lambda2*w2 L_AdAw =~ 1*w3 + lambda4*w4 L_Inter =~ 1*w5 + lambda6*w6 L_PI =~ 1*v1 + lambda8*v2 L_PBeh =~ 1*v3 + lambda10*v4 ' # End of model torus4 fit4 = cfa(torus4, data=coffee) show(fit4) # Just edit the measurement model part of model 1 torus5 = ' # Latent variable model L_PI ~ gamma1*L_BrAw + gamma2*L_AdAw + gamma3*L_Inter L_PBeh ~ gamma4*L_Inter + beta*L_PI # Measurement model L_BrAw =~ 1*w1 + lambda2*w2 L_AdAw =~ 1*w3 + lambda4*w4 L_Inter =~ 1*w5 + lambda6*w6 L_PI =~ 1*v1 + lambda8*v2 L_PBeh =~ 1*v3 + lambda10*v4 # Variances and covariances # Exogenous latent variables L_BrAw ~~ phi11*L_BrAw # Var(L_BrAw) = phi11 L_BrAw ~~ phi12*L_AdAw # Cov(L_BrAw,L_AdAw) = phi12 L_BrAw ~~ phi13*L_Inter # Cov(L_BrAw,L_Inter) = phi13 L_AdAw ~~ phi22*L_AdAw # Var(L_AdAw) = phi22 L_AdAw ~~ phi23*L_Inter # Cov(L_AdAw,L_Inter) = phi23 L_Inter ~~ phi33*L_Inter # Var(L_Inter) = phi33 # Errors in the latent model (epsilons) L_PI ~~ psi1*L_PI # Var(epsilon1) = psi1 L_PBeh ~~ psi2*L_PBeh # Var(epsilon2) = psi2 # Measurement errors w1 ~~ omega1*w1 # Var(e1) = omega1 w2 ~~ omega2*w2 # Var(e2) = omega2 w3 ~~ omega3*w3 # Var(e3) = omega3 w4 ~~ omega4*w4 # Var(e4) = omega4 w5 ~~ omega5*w5 # Var(e5) = omega5 w6 ~~ omega6*w6 # Var(e6) = omega6 v1 ~~ omega7*v1 # Var(e7) = omega7 v2 ~~ omega8*v2 # Var(e8) = omega8 v3 ~~ omega9*v3 # Var(e9) = omega9 v4 ~~ omega10*v4 # Var(e10) = omega10 # Bounds (Variances are positive) phi11 > 0; phi22 > 0; phi33 > 0 psi1 > 0; psi2 > 0 omega1 > 0; omega2 > 0; omega3 > 0; omega4 > 0; omega5 > 0 omega6 > 0; omega7 > 0; omega8 > 0; omega9 > 0; omega10 > 0 ' # End of model torus5 fit5 = lavaan(torus5, data=coffee) # Ouch, seems to have left the parameter space. Suspect problems with the latent variable model, or with the starting values. summary(fit5) coef(fit4) lavInspect(fit5, "cov.lv") lavInspect(fit4, "cov.lv") # True MLE # Careful examination makes me suspect the problem is with the first stage of the latent model. Try for good starting values. # Based on the sub-model y1 = gamma^t x + epsilon1 (details omitted) # Get estimates of gamma and psi1 = Var(epsilon1) # The names of all these quantities should include "hat." Phi = lavInspect(fit4, "cov.lv") Phix = Phi[1:3,1:3]; Phix Phixy = as.matrix(Phi[1:3,4]); Phixy gamma = t(Phixy) %*% solve(Phix); gamma psi1 = Phi[4,4] - as.numeric(gamma %*% Phix %*% t(gamma)); psi1 # These numbers are much more reasonable. See if I can get away with specifying just 10 starting values. Drop the inequality constraints too, since lavaan will issue a warning if any variance estimate is negative. torus6 = ' # Latent variable model L_PI ~ gamma1*L_BrAw + start(0.1996458)*L_BrAw + gamma2*L_AdAw + start(0.3932861)*L_AdAw + gamma3*L_Inter + start(0.5622287)*L_Inter L_PBeh ~ gamma4*L_Inter + beta*L_PI # Measurement model L_BrAw =~ 1*w1 + lambda2*w2 L_AdAw =~ 1*w3 + lambda4*w4 L_Inter =~ 1*w5 + lambda6*w6 L_PI =~ 1*v1 + lambda8*v2 L_PBeh =~ 1*v3 + lambda10*v4 # Variances and covariances # Exogenous latent variables L_BrAw ~~ phi11*L_BrAw + start(19.13510)*L_BrAw # Var(L_BrAw) = phi11 L_BrAw ~~ phi12*L_AdAw + start(12.29660)*L_AdAw # Cov(L_BrAw,L_AdAw) = phi12 L_BrAw ~~ phi13*L_Inter + start(13.50213)*L_Inter # Cov(L_BrAw,L_Inter) = phi13 L_AdAw ~~ phi22*L_AdAw + start(15.91372)*L_AdAw # Var(L_AdAw) = phi22 L_AdAw ~~ phi23*L_Inter + start(11.30579)*L_Inter # Cov(L_AdAw,L_Inter) = phi23 L_Inter ~~ phi33*L_Inter + start(14.98033)*L_Inter # Var(L_Inter) = phi33 # Errors in the latent model (epsilons) L_PI ~~ psi1*L_PI + start(3.206661)*L_PI # Var(epsilon1) = psi1 L_PBeh ~~ psi2*L_PBeh # Var(epsilon2) = psi2 # Measurement errors w1 ~~ omega1*w1 # Var(e1) = omega1 w2 ~~ omega2*w2 # Var(e2) = omega2 w3 ~~ omega3*w3 # Var(e3) = omega3 w4 ~~ omega4*w4 # Var(e4) = omega4 w5 ~~ omega5*w5 # Var(e5) = omega5 w6 ~~ omega6*w6 # Var(e6) = omega6 v1 ~~ omega7*v1 # Var(e7) = omega7 v2 ~~ omega8*v2 # Var(e8) = omega8 v3 ~~ omega9*v3 # Var(e9) = omega9 v4 ~~ omega10*v4 # Var(e10) = omega10 ' # End of model torus6 fit6 = lavaan(torus6, data=coffee) fit6 summary(fit6) parTable(fit6) # Likelihood ratio test of # H0: lambda2 = lambda4 = lambda6 = lambda8 = lambda10 = 1 anova(fit1,fit6) # For Wald tests: Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h source("http://www.utstat.utoronto.ca/~brunner/Rfunctions/Wtest.txt") thetahat = coef(fit6); thetahat LL = rbind(c(0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), c(0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), c(0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), c(0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), c(0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)) hh = c(1,1,1,1,1) Wtest(LL,thetahat,vcov(fit6),hh) options(scipen=999) Wtest(LL,thetahat,vcov(fit6),hh) # Which ones are different from one? pt6 = parTable(fit6); dim(pt6) z = as.numeric( (pt6[,14]-1)/pt6[,15] ) # Extract only meaningful z statistics (lambda_j) z = z[c(7,9,11,13,15)] names(z) = c('lambda2', 'lambda4', 'lambda6', 'lambda8', 'lambda10') z pt6[c(7,9,11,13,15),14] # Corresponding theta-hats # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 20. Principal Components on the Diversity Data # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # https://www.utstat.toronto.edu/brunner/data/legal/DiversityExplore.xlsx # The replication data are in DiversityReplic.xlsx ddata = read_excel("DiversityExplore.xlsx") # Read local copy ddata = as.data.frame(ddata) # Instead of a "tibble" dim(ddata); head(ddata) quest = as.matrix(ddata[,2:41]); head(quest) Sigma_hat = cor(quest); round(Sigma_hat,3) eigenSigma = eigen(Sigma_hat); ls(eigenSigma) eigenSigma$values lambda_hat = eigenSigma$values lambda_hat/40 # Proportions of explained variance cumsum(lambda_hat/40) # Cumulative sum # Principal Components Z = scale(quest) # Standardize columns (This is a 500 x 40 matrix) C_hat = eigenSigma$vectors Y_hat = Z %*% C_hat # Sample principal components # Looking at the variance-covariance matrix of the principal components, round(var(Y_hat), 4) # Should equal D # I would say either keep just one, or keep 7 y = Y_hat[,1:7] # The first 7 components explain about 70% of the variance. zy = cor(Z,y); round(zy,3) # This is hard to look at. Sign of an eigenvector is arbitrary. y = -Y_hat[,1:7] zy = cor(Z,y); round(zy,3) # Now principal components the easy way # help(prcomp) # Better than princomp pc = prcomp(quest, scale = T) ls(pc) # What's in pc? # center has the sample means before standardization # rotation is C-hat # scale has the standard deviations before standardization # sdev has estimated standard deviations of the components # x is a matrix of the principal components: Y-hat pc$sdev^2 # Eigenvalues lambda_hat # For comparison dim(pc$x) # x is Y-hat pc7 = prcomp(quest, scale = T, rank = 7) # Retain seven principal components round(pc7$rotation, 4) # Just the first 7 columns of C-hat head(pc7$x) # There should be 7 columns round(cor(quest,pc7$x),4) # Correlations of variables with components # Look at the a priori scale Commitment to the Organization pcOrg = prcomp(quest[,1:10], scale = T) # First 10 questions pcOrg$sdev^2 # Eigenvalues round(cor(quest[,1:10],pcOrg$x[,1:3]),4) # Correlations of variables with components # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 22. Exploratory Factor Analysis with R # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # Diversity data # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # Exploratory Factor Analysis of the Diversity data rm(list=ls()) # install.packages("readxl", dependencies = TRUE) # Only need to do this once library(readxl) # Download the data and put it in your working directory. # https://www.utstat.toronto.edu/brunner/data/legal/DiversityExplore.xlsx # The replication data are in DiversityReplic.xlsx ddata = read_excel("DiversityExplore.xlsx") # Read local copy ddata = as.data.frame(ddata) # Instead of a "tibble" dim(ddata); # head(ddata) quest = as.matrix(ddata[,2:41]); head(quest) # Check eigenvalues and scree plot pc = prcomp(quest, scale = T) # ls(pc) # What's in pc? # center has the sample means before standardization # rotation is C-hat # scale has the standard deviations before standardization # sdev has estimated standard deviations of the components # x is a matrix of the principal components: Y-hat Eigenvalue = pc$sdev^2; Eigenvalue # Seven eigenvalues greater than one # Scree plot plot(1:40,Eigenvalue, xlab = "Principal Component", type = "l", main = "Scree Plot of the Diversity Data") # Zoom in on the first ten plot(1:10,Eigenvalue[1:10], type = "l", xaxp = c(1,10,9), # Tick marks on x axis xlab = "Principal Component", ylab = "Eigenvalue", main = "Scree Plot of the Diversity Data") # Scree plot suggests 6 factors. # help(factanal) fa1 = factanal(quest,factors=6) # rotation="varimax" is the default fa1 # Loadings below 0.1 are blanked out by default L1 = fa1$loadings; print(L1,cutoff=0.3, sort="True") # Check SS loadings = 7.853 for Factor One sum(L1[,1]^2) # This is such a pretty picture. What if no rotation? fa2 = factanal(quest,factors=6, rotation="none"); L2 = fa2$loadings print(L2,cutoff=0.3, sort="True") # Varimax is much nicer. ls(fa1) # It's common to specify a rotation (or accept the default) in the initial fit, but one can also fit a model without rotation, and then rotate the factors as a separate step. varimax(L2) fa1$rotmat varimaxL2 = varimax(L2); print(varimaxL2$loadings, cutoff=0.3, sort="True") # Scales # I am thinking that the a priori scales are pretty good. In fact, they are amazing. # Are they really uncorrelated? # What factor(s) underlie them? # head(quest) Com = apply(quest[,1:10],MARGIN=1, FUN=sum) # Commitment to Organization RelC = apply(quest[,11:15],MARGIN=1, FUN=sum) # Relations with Colleagues RelM = apply(quest[,16:27],MARGIN=1, FUN=sum) # Relations with Management Fair = apply(quest[,28:33],MARGIN=1, FUN=sum) # Fair opportunities for advancement Sat = apply(quest[,34:37],MARGIN=1, FUN=sum) # Job satisfaction SMdiv = apply(quest[,38:40],MARGIN=1, FUN=sum) # Sr. Man commitment to diversity # Checking # head(cbind(quest[,1:10],Com)) # head(cbind(quest[,11:15],RelC)) # head(cbind(quest[,16:27],RelM)) # head(cbind(quest[,28:33],Fair)) # head(cbind(quest[,34:37],Sat)) # head(cbind(quest[,38:40],SM)) scaledat = cbind(Com, RelC, RelM, Fair, Sat, SMdiv) head(scaledat); dim(scaledat) corrmat = cor(scaledat); corrmat # Under H0: rho = 0, t = r * sqrt(n-2) / sqrt(1-r^2) ~ t(n-2) critval = qt(0.975,498); critval # Calculate all the t statistics n = 500; tmat = corrmat * sqrt(n-2) / sqrt(1-corrmat^2) round(tmat,3) # Factor analysis: How many factors? pc2 = prcomp(scaledat, scale = T) Eigenvalue = pc2$sdev^2; Eigenvalue # Scree plot plot(1:6,Eigenvalue, xlab = "Principal Component", type = "l", main = "Scree Plot of the Diversity Scales") # cor(scaledat,pc2$x) # Correlations of variables with principal components # Either one factor or two one = factanal(scaledat,factors=1); L.1 = one$loadings print(one,cutoff=0.3, sort="True") two = factanal(scaledat,factors=2) print(two,cutoff=0.3, sort="True") # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # Simulated data # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # rm(list=ls()) # Factor analysis of simulated data # Two orthogonal factors, normal distribution rm(list=ls()) n = 50000 # Huge sample size # True factor loadings have a simple structure like varimax (All communalities = 0.49) # Factor loadings L11 = 0.7; L12 = 0.0 L21 = 0.7; L22 = 0.0 L31 = 0.7; L32 = 0.0 L41 = 0.7; L42 = 0.0 L51 = 0.0; L52 = 0.7 L61 = 0.0; L62 = 0.7 L71 = 0.0; L72 = 0.7 L81 = 0.0; L82 = 0.7 # Error Variances v1 = 1 - L11**2 - L12**2 v2 = 1 - L21**2 - L22**2 v3 = 1 - L31**2 - L32**2 v4 = 1 - L41**2 - L42**2 v5 = 1 - L51**2 - L52**2 v6 = 1 - L61**2 - L62**2 v7 = 1 - L71**2 - L72**2 v8 = 1 - L81**2 - L82**2 # Generate data set.seed(9999) F1 = rnorm(n,0,1); F2 = rnorm(n,0,1) d1 = L11*F1 + L12*F2 + rnorm(n,0,sqrt(v1)) d2 = L21*F1 + L22*F2 + rnorm(n,0,sqrt(v2)) d3 = L31*F1 + L32*F2 + rnorm(n,0,sqrt(v3)) d4 = L41*F1 + L42*F2 + rnorm(n,0,sqrt(v4)) d5 = L51*F1 + L52*F2 + rnorm(n,0,sqrt(v5)) d6 = L61*F1 + L62*F2 + rnorm(n,0,sqrt(v6)) d7 = L71*F1 + L72*F2 + rnorm(n,0,sqrt(v7)) d8 = L81*F1 + L82*F2 + rnorm(n,0,sqrt(v8)) dmat = cbind(d1,d2,d3,d4,d5,d6,d7,d8) factanal(dmat,factors=2,rotation='varimax') # The truth again for comparison Lambda = rbind(c(L11,L12), c(L21,L22), c(L31,L32), c(L41,L42), c(L51,L52), c(L61,L62), c(L71,L72), c(L81,L82) ) Lambda # Example 2: Truth is not like varimax (All communalities = 0.50) # Factor loadings L11 = 0.5; L12 = -0.5 L21 = 0.5; L22 = -0.5 L31 = 0.5; L32 = -0.5 L41 = 0.5; L42 = -0.5 L51 = 0.5; L52 = 0.5 L61 = 0.5; L62 = 0.5 L71 = 0.5; L72 = 0.5 L81 = 0.5; L82 = 0.5 # Error Variances v1 = 1 - L11**2 - L12**2 v2 = 1 - L21**2 - L22**2 v3 = 1 - L31**2 - L32**2 v4 = 1 - L41**2 - L42**2 v5 = 1 - L51**2 - L52**2 v6 = 1 - L61**2 - L62**2 v7 = 1 - L71**2 - L72**2 v8 = 1 - L81**2 - L82**2 # Generate data set.seed(8888) F1 = rnorm(n,0,1); F2 = rnorm(n,0,1) d1 = L11*F1 + L12*F2 + rnorm(n,0,sqrt(v1)) d2 = L21*F1 + L22*F2 + rnorm(n,0,sqrt(v2)) d3 = L31*F1 + L32*F2 + rnorm(n,0,sqrt(v3)) d4 = L41*F1 + L42*F2 + rnorm(n,0,sqrt(v4)) d5 = L51*F1 + L52*F2 + rnorm(n,0,sqrt(v5)) d6 = L61*F1 + L62*F2 + rnorm(n,0,sqrt(v6)) d7 = L71*F1 + L72*F2 + rnorm(n,0,sqrt(v7)) d8 = L81*F1 + L82*F2 + rnorm(n,0,sqrt(v8)) dmat = cbind(d1,d2,d3,d4,d5,d6,d7,d8) notsimple = factanal(dmat,factors=2,rotation='varimax'); notsimple # Is there a possible rotation that would get us close to the truth? # Procrustes rotation # install.packages("MCMCpack", dependencies=TRUE) # Only need to do this once library(MCMCpack) # help(procrustes) # Estimated factor loadings for the second example L = notsimple$loadings; print(L,cutoff=0) Lambda = rbind(c(L11,L12), # True factor loadings c(L21,L22), c(L31,L32), c(L41,L42), c(L51,L52), c(L61,L62), c(L71,L72), c(L81,L82) ) Lambda # True Lambda -- How close can we get to this? # Rotate X to approximate Xstar. It's orthogonal: X R ~ Xstar pro = procrustes(X = L, Xstar = Lambda) round(pro$X.new,2) round( L %*% pro$R , 2) # Rotating "manually" pro$R %*% t(pro$R) # R is an orthogonal matrix # Now try a Procrustes rotation of something unrelated to the truth M = rbind(c(0.30,0.64), c(0.30,0.64), c(0.30,0.64), c(0.30,0.64), c(0.30,0.64), c(0.30,0.64), c(0.30,0.64), c(0.30,0.64) ); M procrustes(X = L, Xstar = M)$X.new round(pro$X.new,2) # Repeating for comparison # The truth is very close to a member of the set of factor matrices that can be reached by an orthogonal rotation. The problem is that we don't know which one. # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 24. Rotating Principal Components with R: Diversity data # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # rm(list=ls()) # install.packages("readxl", dependencies = TRUE) # Only need to do this once library(readxl) # Download the data and put it in your working directory. # https://www.utstat.toronto.edu/brunner/data/legal/DiversityExplore.xlsx # The replication data are in DiversityReplic.xlsx ddata = read_excel("DiversityExplore.xlsx") # Read local copy ddata = as.data.frame(ddata) # Instead of a "tibble" dim(ddata); head(ddata) # Saved DiversityWorkspace file quest = as.matrix(ddata[,2:41]) pc7 = prcomp(quest, scale = T, rank = 7) # Retain seven principal components ls(pc7) # The seven principal components are in pc7$x L = cor(quest,pc7$x) # It really should be Lhat # round(L,4) vm7 = varimax(L); L2 = vm7$loadings print(L2,cutoff=0.3) sum(L2[,3]^2) # Amount of variance explained by component three. sum(L2[,3]^2)/40 # Proportion of variance explained by component three. # If we had the questionnaire, we might be able to understand PC7 # It explains only 3% of the variance. Try dropping it. pc6 = prcomp(quest, scale = T, rank = 6) L = cor(quest,pc6$x) vm6 = varimax(L); L2 = vm6$loadings print(L2,cutoff=0.3) # It's always okay to reverse signs in a column. # It just reverses the meaning of the component. # Reverse columns 2, 4 and 5, and then name the components. L2[,c(2,4,5)] = -L2[,c(2,4,5)] colnames(L2) = c("RelMan", "CommitOrg", "RelCol", "SMCD", "JobSat", "FairOp") print(L2,cutoff=0.3) # Produce rotated components for future use # Display equations here. LateX is # \begin{eqnarray*} # \mathbf{z} & = & \mathbf{Lf} + \mathbf{e} \\ # & = & \mathbf{L} \, \mathbf{R}^\top\mathbf{R} \, \mathbf{f} + \mathbf{e} \\ # & = & (\mathbf{L} \, \mathbf{R}^\top) (\mathbf{R} \, \mathbf{f}) + \mathbf{e} \\ # & = & \mathbf{L}_2 \mathbf{f}^\prime + \mathbf{e} # \end{eqnarray*} % 24 pt n = dim(quest)[1] f = scale(pc6$x) * sqrt(n/(n-1)) # Divide by n, not n-1 fprime = f %*% vm6$rotmat fprime[,c(2,4,5)] = -fprime[,c(2,4,5)] colnames(fprime) = colnames(L2) # Verify L2 = cor(data, fprime) round(cor(quest,fprime),3) # Now use the components data.frame(1:47,colnames(ddata)) reduced_data = cbind(ddata[,42:47], fprime) head(reduced_data) summary(reduced_data) # The demographic variables are character. Fix. # Mostly make variables numeric indicators reduced_data = within(reduced_data, { Gender = as.numeric(Gender) VisMinority = as.numeric(VisMinority) EDUCLevel = as.numeric(EDUCLevel) # Make MaritalStatus a factor, married = 2 is reference category MaritalStatus = factor(MaritalStatus) contrasts(MaritalStatus) = contr.treatment(4, base = 2) Age = as.numeric(Age) CAN_Foreign_Born = as.numeric(CAN_Foreign_Born) }) table(reduced_data$MaritalStatus, useNA = "ifany") # Codes are 1 = Never married, 2 = Married, 3 = Divorced or separated, # 4 = Widowed head(reduced_data) # Quick look at rotated PCs by binary demographics democor = cor(reduced_data[,c(1:3,5,6)], reduced_data[,7:12], use = "pairwise.complete.obs") round(democor,3) # Eliminate all data with any missing values. noNAs = na.omit(reduced_data); dim(noNAs) r = cor( noNAs[,c(1:3,5,6)], noNAs[,7:12] ) round(r , 3) # Test: Are those correlations non-zero? n = dim(noNAs)[1] ttable = r * sqrt(n-2) / sqrt(1-r^2) round(ttable,3) critval = qt(0.975,n-2); critval # Bonferroni correcting for 30 tests: Divide alpha by 30 a = 0.05/30 Ccritval = qt(1-a/2,n-2); Ccritval # Visible minority and job satisfaction: What is the actual mean difference? # Had r = -0.168, t = -3.553 t.test(JobSat ~ VisMinority, var.equal = TRUE, data = reduced_data) # What does a difference of 0.36 SDs look like? z = seq(-3,3,length = 100) Density = dnorm(z, mean=0.108); d2 = dnorm(z, mean = -0.251) plot(z,Density, type = "l") # That's an L lines(z,d2,lty = 1); title("Difference of 0.36 SD Units") # One regression satisf = lm(JobSat ~ Gender + VisMinority + EDUCLevel + MaritalStatus + Age + CAN_Foreign_Born, data = reduced_data) summary(satisf) # Well, one more sat2 = update(satisf, . ~ . - CAN_Foreign_Born) # Eliminate the Americans summary(sat2) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 27. Confirmatory Factor Analysis with R: Body-Mind data # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # CFA of Body-mind data # install.packages("lavaan", dependencies = TRUE) # Only need to do this once library(lavaan) rm(list=ls()) bodymind = read.table("http://www.utstat.toronto.edu/~brunner/openSEM/data/bodymind.data.txt") bodymind = read.table("bodymind.data.txt") # Local copy head(bodymind) # Need a numeric dummy variable for sex/gender n = dim(bodymind)[1]; n bodymind = within(bodymind,{ Sex = numeric(n) Sex[sex == "F"] = 1 # Makes 0=M, 1=F }) # End within bodymind head(bodymind) # Observe that the parameters of the sub-model with Sex and size are identifiable. # In mod1, name all the parameters and use the lavaan function. mod1 = "######################################################################## # Measurement model # ----------------- size =~ 1.0*height + lambda2*weight smart =~ 1.0*verbal + lambda4*progmat + lambda5*reason # Variances and covariances # ------------------------- Sex ~~ phi11*Sex + phi12*size + phi13*smart size ~~ phi22*size + phi23*smart smart ~~ phi33*smart height ~~ omega1*height # Var(e1) = omega1 weight ~~ omega2*weight # Var(e2) = omega2 verbal ~~ omega3*verbal # Var(e3) = omega3 progmat ~~ omega4*progmat # Var(e4) = omega4 reason ~~ omega5*reason # Var(e5) = omega5 ######################################################################## " # End of mod1 fit1 = lavaan(mod1, data=bodymind) varTable(fit1) # It may be okay, but re-express height in metres bodymind = within(bodymind,{height = height/10}) fit1 = lavaan(mod1, data=bodymind) summary(fit1) # Variance of Sex is really p(1-p) -- Bernoulli. # Estimates are well behaved. I think it's okay. # By the way, the model does not fit. This is an unpleasant surprise, but these are real data. # Fit the model more easily with cfa. Sex is included in the last line. mod2 = "size =~ height + weight smart =~ verbal + progmat + reason Sex ~~ size + smart" # Last line is unusual. fit2 = cfa(mod2, data=bodymind) summary(fit2) # Parameter estimates match perfectly. # "Standardized parameter estimates" (Adds two columns ...) summary(fit2, standardized=TRUE) # Note: Calculation of estimates for the standardized models is described in the text. # The only problem with Std.lv and Std.all is that you don't get standard errors. BMdat = bodymind[,c(2:4, 9:11)] kor = cor(BMdat); round(kor,3) fit3 = cfa(mod2, sample.cov=kor, sample.nobs=80, std.lv=TRUE, sample.cov.rescale=FALSE) summary(fit3, standardized=TRUE) parameterEstimates(fit3) # Perfect match, no complaints # Note how std.lv=TRUE wisely un-did the setting of factor loadings to one. # Why is the model not fitting? # Look at the correlation matrix again. round(kor,3) # Residuals are model-implied covariances (correlations), # minus observed covariances (correlations). lavResiduals(fit2, se=TRUE) # cor by default, se is optional fitted(fit2) # Sigma(thetahat) # How about an additional link between Sex and progmat? # Parameters are identifiable ... mod3 = "size =~ height + weight smart =~ verbal + progmat + reason Sex ~~ size + smart + progmat" fit4 = cfa(mod3, data=bodymind) summary(fit4) # That was ugly and strange. Try fuller model specification mod4 = "######################################################################## # Measurement model # ----------------- size =~ 1.0*height + lambda2*weight smart =~ 1.0*verbal + lambda4*progmat + lambda5*reason # Variances and covariances # ------------------------- Sex ~~ phi11*Sex + phi12*size + phi13*smart size ~~ phi22*size + phi23*smart smart ~~ phi33*smart Sex ~~ c*progmat # Covariance with error term height ~~ omega1*height # Var(e1) = omega1 weight ~~ omega2*weight # Var(e2) = omega2 verbal ~~ omega3*verbal # Var(e3) = omega3 progmat ~~ omega4*progmat # Var(e4) = omega4 reason ~~ omega5*reason # Var(e5) = omega5 ######################################################################## " # End of mod4 fit4 = lavaan(mod4, data=bodymind) # Tried other things ... # Post to the group and display that # Make the link from smart to progmat a "regression." mod5 = "######################################################################## # Regressions # ------------- progmat ~ lambda4 * smart # Measurement model # ----------------- size =~ 1.0*height + lambda2*weight smart =~ 1.0*verbal + lambda5*reason # Variances and covariances # ------------------------- Sex ~~ phi11*Sex + phi12*size + phi13*smart size ~~ phi22*size + phi23*smart smart ~~ phi33*smart Sex ~~ c*progmat # Covariance with error term -- an epsilon this time. height ~~ omega1*height # Var(e1) = omega1 weight ~~ omega2*weight # Var(e2) = omega2 verbal ~~ omega3*verbal # Var(e3) = omega3 progmat ~~ omega4*progmat # Var(e4) = omega4 reason ~~ omega5*reason # Var(e5) = omega5 ######################################################################## " # End of mod5 fit5 = lavaan(mod5, data=bodymind) summary(fit5) # Add functions for the CORRELATIONS between factors mod6 = "######################################################################## # Regressions # ------------- progmat ~ lambda4 * smart # Measurement model # ----------------- size =~ 1.0*height + lambda2*weight smart =~ 1.0*verbal + lambda5*reason # Variances and covariances # ------------------------- Sex ~~ phi11*Sex + phi12*size + phi13*smart size ~~ phi22*size + phi23*smart smart ~~ phi33*smart Sex ~~ c*progmat # Covariance with error term -- an epsilon this time. height ~~ omega1*height # Var(e1) = omega1 weight ~~ omega2*weight # Var(e2) = omega2 verbal ~~ omega3*verbal # Var(e3) = omega3 progmat ~~ omega4*progmat # Var(e4) = omega4 reason ~~ omega5*reason # Var(e5) = omega5 # Correlations between factors # ---------------------------- corr12 := phi12/sqrt(phi11*phi22) corr13 := phi13/sqrt(phi11*phi33) corr23 := phi23/sqrt(phi22*phi33) ######################################################################## " # End of mod6 fit6 = lavaan(mod6, data=bodymind) summary(fit6, standardized=TRUE) # parameterEstimates(fit6) # Bring in the head size variables. No need for symbols. mod7 = "######################################################################## # Regressions # ------------- progmat ~ lambda4 * smart headlng ~ Sex + size + smart headbrd ~ Sex + size + smart headcir ~ Sex + size + smart bizyg ~ Sex + size + smart # Measurement model # ----------------- size =~ 1.0*height + lambda2*weight smart =~ 1.0*verbal + lambda5*reason # Variances and covariances # ------------------------- Sex ~~ phi11*Sex + phi12*size + phi13*smart size ~~ phi22*size + phi23*smart smart ~~ phi33*smart Sex ~~ c*progmat # Covariance with error term -- an epsilon this time. height ~~ omega1*height # Var(e1) = omega1 weight ~~ omega2*weight # Var(e2) = omega2 verbal ~~ omega3*verbal # Var(e3) = omega3 progmat ~~ omega4*progmat # Var(e4) = omega4 reason ~~ omega5*reason # Var(e5) = omega5 # Variances and covariances between error terms of head variables # --------------------------------------------------------------- headlng ~~ headlng + headbrd + headcir + bizyg headbrd ~~ headbrd + headcir + bizyg headcir ~~ headcir + bizyg bizyg ~~ bizyg # Correlations between factors # ---------------------------- corr12 := phi12/sqrt(phi11*phi22) corr13 := phi13/sqrt(phi11*phi33) corr23 := phi23/sqrt(phi22*phi33) ######################################################################## " # End of mod7 fit8 = lavaan(mod7, data=bodymind); summary(fit8) parameterEstimates(fit8) # fit8b = lavaan(mod7, se="bootstrap", data=bodymind) # parameterEstimates(fit8b) # parameterEstimates(fit8) # For comparison -- no bootstrap # I am not really comfortable with arrows coming from Sex. mod8 = "######################################################################## # Regressions # ------------- progmat ~ lambda4 * smart headlng ~ size + smart headbrd ~ size + smart headcir ~ size + smart bizyg ~ size + smart # Measurement model # ----------------- size =~ 1.0*height + lambda2*weight smart =~ 1.0*verbal + lambda5*reason # Variances and covariances # ------------------------- Sex ~~ phi11*Sex + phi12*size + phi13*smart size ~~ phi22*size + phi23*smart smart ~~ phi33*smart Sex ~~ c*progmat # Covariance with error term -- an epsilon this time. height ~~ omega1*height # Var(e1) = omega1 weight ~~ omega2*weight # Var(e2) = omega2 verbal ~~ omega3*verbal # Var(e3) = omega3 progmat ~~ omega4*progmat # Var(e4) = omega4 reason ~~ omega5*reason # Var(e5) = omega5 # Variances and covariances between error terms of head variables # --------------------------------------------------------------- headlng ~~ headlng + headbrd + headcir + bizyg headbrd ~~ headbrd + headcir + bizyg headcir ~~ headcir + bizyg bizyg ~~ bizyg # Correlations between factors # ---------------------------- corr12 := phi12/sqrt(phi11*phi22) corr13 := phi13/sqrt(phi11*phi33) corr23 := phi23/sqrt(phi22*phi33) ######################################################################## " # End of mod8 fit9 = lavaan(mod8, data=bodymind); summary(fit9) anova(fit8,fit9) # Should be 4 df # Sex is not normally distributed. Try bootstrap. fit10 = lavaan(mod7, data=bodymind, se="bootstrap"); # summary(fit10) parameterEstimates(fit10) parameterEstimates(fit8) # For comparison # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 30. Multi-Group SEM with R # Groups # install.packages("lavaan", dependencies = TRUE) # Only need to do this once library(lavaan) rm(list=ls()) bodymind = read.table("http://www.utstat.toronto.edu/~brunner/openSEM/data/bodymind.data.txt") # bodymind = read.table("bodymind.data.txt") # Local copy head(bodymind) # Basic two-factor model mod1 = "Size =~ height + weight Smart =~ verbal + progmat + reason" fit0 = cfa(mod1, data=bodymind); show(fit0) # Include sex as a grouping factor. # By default, the same model is fitted in all groups. # Different parameter estimates for M and F fit1 = cfa(mod1, data=bodymind, group = "sex") summary(fit1) # Notice that intercepts are included. Model fit should not be affected. # Also notice Size ~~ Smart is significant for F but not M. # Test equal fit. Disregard intercepts and just set the following equal across groups: # lambda2, lambda4, lambda5, phi11, phi12, phi22, omega1, ..., omega5 # There are eleven parameters, should be 11 df. # Parameters can be set equal by giving them the same names, but this is more convenient. fit2 = cfa(mod1, data=bodymind, group = "sex", group.equal = c("loadings", "residuals", "lv.variances", "lv.covariances")) summary(fit2) anova(fit1,fit2) # Data are more likely when the model is less restricted, so fit1-fit2 2*(logLik(fit1) - logLik(fit2) ) # You might think the G^2 test of fit for fit0 and fit2 should be the same, but it can't be. Fit0 does not know about the separate sample covariance matrices for M and F. # Test difference in just Size ~~ Smart mod2 = "Size =~ height + weight Smart =~ verbal + progmat + reason Size ~~ c(phi12,phi12)*Smart" # Vector of parameter names fit3 = cfa(mod2, data=bodymind, group = "sex") summary(fit3) anova(fit1,fit3) # There are also tricks for setting a lot of parameters equal across groups, and then freeing some of them, allowing them to be different. A google search for lavaan tutorial takes you to the right document. # Try radically different models for M and F, just as an experiment. Twomods = " group: 1 progmat ~ verbal + theta*reason group: 2 Size =~ height + weight Smart =~ verbal + progmat + theta*reason " fit4 = cfa(Twomods, data=bodymind, group = "sex") summary(fit4) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 31. Analysis of Unstructured Covariance Matrices with R rm(list=ls()) # install.packages("lavaan", dependencies = TRUE) # Only need to do this once library(lavaan) longIQ = read.table("https://www.utstat.toronto.edu/brunner/data/illegal/origIQ.data.txt") colnames(longIQ) = c("amEduc", "bmIQ", "IQ2", "IQ4", "IQ8", "IQ13") head(longIQ) dim(longIQ) # summary(longIQ) r = round(cor(longIQ), 3); r # Under H0: rho = 0, t = r * sqrt(n-2) / sqrt(1-r^2) ~ t(n-2) n = dim(longIQ)[1]; n critval = qt(0.975,n-2); critval tstat = r * sqrt(n-2) / sqrt(1-r^2); round(tstat,3) # In this model, the only parameters are variances and covariances # of the observable variables. covmod0 = "amEduc ~~ amEduc + bmIQ + IQ2 + IQ4 + IQ8 + IQ13 bmIQ ~~ bmIQ + IQ2 + IQ4 + IQ8 + IQ13 IQ2 ~~ IQ2 + IQ4 + IQ8 + IQ13 IQ4 ~~ IQ4 + IQ8 + IQ13 IQ8 ~~ IQ8 + IQ13 IQ13 ~~ IQ13" fit0 = lavaan(covmod0,data=longIQ) summary(fit0) # Okay, that's kind of reasonable. Try to make them correlations. fit0.1 = lavaan(covmod0, data=longIQ, std.ov=TRUE) # summary(fit0.1) fitted(fit0.1) # Sigma(thetahat) r # For comparison fitted(fit0.1)$cov * n/(n-1) # So asymptotically it's fine, but I want the MLEs to be sample correlations. # Standardize the variables myself. standIQ = scale(longIQ) * sqrt(n/(n-1)) fit0.2 = lavaan(covmod0, data=standIQ) fitted(fit0.2); r # Sigma(thetahat) should be correlation matrix # Now want names for parameters so I can test differences, etc. covmod1 = "amEduc ~~ amEduc + rho12*bmIQ + rho13*IQ2 + rho14*IQ4 + rho15*IQ8 + rho16*IQ13 bmIQ ~~ bmIQ + rho23*IQ2 + rho24*IQ4 + rho25*IQ8 + rho26*IQ13 IQ2 ~~ IQ2 + rho34*IQ4 + rho35*IQ8 + rho36*IQ13 IQ4 ~~ IQ4 + rho45*IQ8 + rho46*IQ13 IQ8 ~~ IQ8 + rho56*IQ13 IQ13 ~~ IQ13" fit1 = lavaan(covmod1, data=standIQ) # This will be the full model. parameterEstimates(fit1) # fitted(fit1); r # Checks # Likelihood ratio test of relationship between adoptive mom's education # and kid's IQ at any age noAM = lavaan(covmod1, data=standIQ, constraints = "rho13==0 rho14==0 rho15==0 rho16==0") anova(noAM,fit1) # Likelihood ratio test of relationship between birth mom's IQ # and kid's IQ at any age noBM = lavaan(covmod1, data=standIQ, constraints = "rho23==0 rho24==0 rho25==0 rho26==0") anova(noBM,fit1) # Multivariate regression yields p = 0.022 # ntheory = lm(cbind(IQ2, IQ4, IQ8, IQ13) ~ bmIQ, data = longIQ) # summary(ntheory) # t statistics agree with matrix tstat # anova(ntheory) # Likelihood ratio test: Is birth mother's IQ correlated with child's IQ # differently at different ages? BMsame = lavaan(covmod1, data=standIQ, constraints = "rho23==rho24 rho24==rho25 rho25==rho26") anova(BMsame,fit1) # This is surprising. Try a Wald test # For Wald tests: Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h source("https://www.utstat.toronto.edu/brunner/openSEM/fun/Wtest.txt") Vhat = vcov(fit1) # Asymptotic covariance matrix of the estimates rhohat = coef(fit1); rhohat length(rhohat) rbind(names(rhohat)) L0 = matrix(0,3,21) L0[1,8] = 1; L0[1,9] = -1 # rho23==rho24 L0[2,9] = 1; L0[2,10] = -1 # rho24==rho25 L0[3,10] = 1; L0[3,11] = -1 # rho25==rho26 Wtest(L0,rhohat,Vhat) # Just test 2 years old versus 13 Two_vs_13 = matrix(0,1,21); Two_vs_13[1,8] = 1; Two_vs_13[1,11] = -1 Wtest(Two_vs_13,rhohat,Vhat) # However, Bonferroni correcting for 6 pairwise tests, need p < ... 0.05/6 # One could bootstrap, but IQ scores are well known to be normally distributed. # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #