# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 431s23code.txt: R code from lecture. The only explanation is in the comment statements. 
#                 Last revised March 8, 2023
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 2. More Linear Algebra
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

is.matrix(3)  # Is the number 3 a 1x1 matrix?
treecorr = cor(trees); treecorr
is.matrix(treecorr)
# Bind rows of a matrix together
A = rbind(  c(3, 2, 6,8),
            c(2,10,-7,4),
            c(6, 6, 9,1)   ); A
# Transpose
t(A)
# U = A A' (3x3), V = A' A (4x4)
U = A %*% t(A)
V = t(A) %*% A; V
# U = A A' (3x3), V = A' A (4x4)
# So rank(V) cannot exceed 3 and det(V)=0
det(U); det(V)
# Recall U = A A' (3x3), V = A' A (4x4)
solve(U)
solve(V)
# Recall  U = A A' (3x3), V = A' A (4x4)
eigen(U)
eigen(V)
eigenV = eigen(V)
C = eigenV$vectors; D = diag(eigenV$values); D
# C is an orthoganal matrix
C %*% t(C)
V;   C %*% D %*% t(C)
sqrtV = C %*% sqrt(D) %*% t(C)
# Multiply to get V
sqrtV %*% sqrtV;  V
D; sqrt(D)

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 3. Random Vectors and Multivariate Normal
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

source("https://www.utstat.toronto.edu/~brunner/openSEM/fun/rmvn.txt")
A = rbind(c(1.0,0.5),
          c(0.5,1.0))
A 
datta = rmvn(10,mu=c(0,0),sigma=A); datta 

#              rmvn: Simulate from multivariate normal
rmvn <- function(nn,mu,sigma)
# Returns an nn by kk matrix, rows are independent MVN(mu,sigma)
    {
    kk <- length(mu)
    dsig <- dim(sigma)
    if(dsig[1] != dsig[2]) stop("Sigma must be square.")
    if(dsig[1] != kk) stop("Sizes of sigma and mu are inconsistent.")
    ev <- eigen(sigma)
    if(min(eigen(sigma)$values) < 0)
      stop("Sigma must have non-negative eigenvalues.")
    sqrl <- diag(sqrt(ev$values))
    PP <- ev$vectors
    ZZ <- rnorm(nn*kk) ; dim(ZZ) <- c(kk,nn)
    out <- t(PP%*%sqrl%*%ZZ+mu)
    return(out)
    }# End of function rmvn

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 4. Estimation
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# Gamma MLE
rm(list=ls())

# Simulate data
set.seed(3201); n = 50; alpha=2; beta=3
d = round(rgamma(50,shape=alpha, scale=beta),2); d
mean(d); alpha*beta # Compare
var(d); alpha*beta^2

gmll = function(theta,datta) 
     {
       aa =  theta[1]; bb =  theta[2]
       nn = length(datta); sumd = sum(datta)
       sumlogd = sum(log(datta))
       value = nn*aa*log(bb) + nn*lgamma(aa) + sumd/bb - (aa-1)*sumlogd
       return(value)
     } # End function gmll

# help(optim)

# MOM for starting values
momalpha = mean(d)^2/var(d); momalpha
mombeta = var(d)/mean(d); mombeta

# Error message says: "Bounds can only be used with method 
#                      L-BFGS-B (or Brent)"
gsearch = optim(par=c(momalpha,mombeta), fn = gmll, 
                method = "L-BFGS-B", lower = c(0,0), hessian=TRUE, datta=d)
gsearch
thetahat = gsearch$par
names(thetahat) = c("alpha-hat","beta-hat"); thetahat

# Second derivative test
eigen(gsearch$hessian)$values 

############## Later ##############

# Had thetahat = gsearch$par
thetahat
# Asymptotic variance-covariance matrix is the inverse of the 
# Fisher Information
Vhat_n = solve(gsearch$hessian); Vhat_n 

# Confidence interval for alpha (true value is 2)
thetahat
se_alphahat = sqrt(Vhat_n[1,1])
lower95 = thetahat[1] - 1.96*se_alphahat
upper95 = thetahat[1] + 1.96*se_alphahat
c(lower95,upper95)

#########################################################################

# Picture of convergence in probability, using R
# Layer 1
rm(list=ls())
x = seq(from=-3,to=3,by=0.01)
sigma=1/2; raise=0.25; z=1.3
Density = dnorm(x,mean=0,sd=sigma)+raise
plot(x,Density,axes=F,xlab='',ylab='',pch=' ')
lines(x,dnorm(x)+raise,lty=2)
lines(c(-3,3),c(raise,raise),lty=1)
text(x=0,y=raise-0.015,expression(theta))
text(x=-z+.20,y=raise-0.015,expression(paste("(",theta-epsilon)))
text(x= z-.20,y=raise-0.015,expression(paste(theta+epsilon,")")))
lines(c(-z,-z),c(raise,raise+dnorm(-z)),lty=2)
lines(c(z,z),c(raise,raise+dnorm(z)),lty=2)
text(x=0,y=dnorm(0,sd=sigma)+raise,".") # Guide for cropping
# Save this picture as convergence1.pdf, and proceed
#########################################################################
# Layer 2
lines(x,Density,lty=1)
lines(c(-z,-z),c(raise,raise+dnorm(-z,sd=sigma)),lty=1)
lines(c(z,z),c(raise,raise+dnorm(z,sd=sigma)),lty=1)
# Save as convergence2.pdf


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 5. Testing Hypotheses
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h
# Tn is estimated theta, usually a vector.
# Vn is the estimated asymptotic covariance matrix of Tn.  
# For Wald tests based on numerical MLEs, Tn = theta-hat,
# and Vn is the inverse of the Hessian of the minus log 
# likelihood. 
     {
     Wtest = numeric(3)
     names(Wtest) = c("W","df","p-value")
     r = dim(L)[1]
     W = t(L%*%Tn-h) %*% solve(L%*%Vn%*%t(L)) %*%
          (L%*%Tn-h)
     W = as.numeric(W)
     pval = 1-pchisq(W,r)
     Wtest[1] = W; Wtest[2] = r; Wtest[3] = pval
     Wtest
     } # End function Wtest

# Plotting the parameter space

pdf(file = "ParameterSpace.pdf")
# Plot the parameter space
x = seq(from=.01,to=5,length=20); y=x
plot(x,y,type='l',xlab=expression(alpha), ylab=expression(beta))
title('Parameter Space')
dev.off() ############ Saved in working directory ############

pdf(file = "MLEplot.pdf")
# Plot the MLEs
x = seq(from=.01,to=5,length=20); y=x
plot(x,y,type='l',xlab=expression(alpha), ylab=expression(beta))
points(thetahat[1],thetahat[2], pch=19, col = "red1")  # Unrestricted MLE
points(alphahat0,alphahat0, pch=23, bg="black") # Restricted MLE
title('Red point at unrestricted MLE, Black point at restricted MLE')
dev.off() ############ Saved in working directory ############

# Gamma: testing H0: alpha = beta

rm(list=ls())

# Simulate data
set.seed(3201); n = 50; alpha=2; beta=3
d = round(rgamma(50,shape=alpha, scale=beta),2); d

gmll = function(theta,datta) 
     {
       aa =  theta[1]; bb =  theta[2]
       nn = length(datta); sumd = sum(datta)
       sumlogd = sum(log(datta))
       value = nn*aa*log(bb) + nn*lgamma(aa) + sumd/bb - (aa-1)*sumlogd
       return(value)
     } # End function gmll

# MOM for starting values
momalpha = mean(d)^2/var(d); momalpha
mombeta = var(d)/mean(d); mombeta

gsearch = optim(par=c(momalpha,mombeta), fn = gmll, 
                method = "L-BFGS-B", lower = c(0,0), hessian=TRUE, datta=d)
gsearch
thetahat = gsearch$par
names(thetahat) = c("alpha-hat","beta-hat"); thetahat
Vhat_n = solve(gsearch$hessian) # Asymptotic Covariance matrix of thetahat

# gmll0 is minus LL gamma log likelihood with alpha=beta
gmll0 = function(alpha,datta) gmll(c(alpha,alpha),datta)

gsearch0 = optim(par=mean(thetahat), fn = gmll0, 
                method = "L-BFGS-B", lower = 0,  datta=d)
gsearch0
alphahat0 = gsearch0$par; alphahat0

Gsq = 2*(gsearch0$value-gsearch$value); Gsq
pval = 1-pchisq(Gsq,df=1); pval
# 0.03861777

# Wald test for comparison
source("https://www.utstat.toronto.edu/brunner/openSEM/fun/Wtest.txt")
LL = rbind(c(1,-1))
Wtest(LL,Tn=thetahat,Vn=Vhat_n)

# Z-test of H0: beta = 3
se = sqrt(Vhat_n[2,2])
# Assignning names because otherwise everything is labelled "betahat"
z = (thetahat[2]-3)/se; names(z) = "Z statistic"; z
pval = 2*(1-pnorm(abs(z))); names(pval) = "p-value"; pval

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 6. Regression with Random Explanatory Variables
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #


################## Simulate multivariate regression ##################
rm(list=ls()) # Remove everything

# Set parameter values
    # Regression coefficients
        beta10 = 1; beta11 = 1; beta12 = 0; beta13 = 3
        beta20 = 2; beta21 = 0; beta22 = 4; beta23 = 0
    # Expected values of x variables
        mux = c(10,20,10)
    # Variance-covariance matrix of x variables
        Phi = rbind(c(25,  25, 15),
                    c(25, 100, 35),
                    c(15,  35, 25))
    # Variance-covariance matrix of error terms
        Psi = rbind(c(500,  750),
                    c(750, 2000))

# Simulate data
source("https://www.utstat.toronto.edu/~brunner/openSEM/fun/rmvn.txt")
set.seed(9999)
# First an experiment
x = rmvn(nn=100000, mu=mux, sigma=Phi)
dim(x)
head(x)
apply(x,MARGIN=2,FUN=mean) # Column sample means
mux # Population means, for comparison
var(x) # Sample variance-covariance matrix with n-1
Phi # Population variance-covariance matrix, for comparison

# Now simulate data from the model
n = 500
x = rmvn(nn=n, mu=mux, sigma=Phi)
epsilon = rmvn(nn=n, mu=c(0,0), sigma=Psi)
# Extract variables (for clarity)
x1 = x[,1]; x2 = x[,2]; x3 = x[,3]
epsilon1 = epsilon[,1]; epsilon2 = epsilon[,2]
# Generate y
y1 = beta10 + beta11*x1 + beta12*x2 + beta13*x3 + epsilon1
y2 = beta20 + beta21*x1 + beta22*x2 + beta23*x3 + epsilon2
length(y1)

# Calculate MOM estimate of beta1 (the slopes)
y = cbind(y1,y2)
Sigmahat_x  = var(x) * (n-1)/n
Sigmahat_xy = var(x,y) * (n-1)/n
beta1hat = t(Sigmahat_xy) %*% solve(Sigmahat_x)
round(beta1hat,3)
# Now true beta1 for comparison
beta1 = rbind(c(beta11, beta12, beta13),
              c(beta21, beta22, beta23))
beta1

# MOM = Least squares
# MOM estimate of slopes
round(beta1hat,3)

# Least squares estimate
LSbetahat = lsfit(x,y)$coefficients                                     #$
t(round( LSbetahat ,3))

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 7. Omitted Variables
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# The age by strength plot omitting human versus chimp.
# Simpson's paradox

rm(list=ls())
source("https://www.utstat.toronto.edu/~brunner/openSEM/fun/rmvn.txt")
set.seed(9999)
n = 100 # Each
S0 = rbind(c(1.00,9.60),
           c(9.60,144.0))
chimps = rmvn(n,c(5.5,300),S0)
humans = rmvn(n,c(17.5,100),S0)
Age = c(chimps[,1],humans[,1])
Strength = c(chimps[,2],humans[,2])
plot(Age,Strength,pch=' ')
title("Age and Strength")
points(chimps,pch='*'); points(humans,pch='o')
# summary(lm(Strength~Age))
x = c(2,20); y = 383.8795 - 16.1797 * x
lines(x,y)
text(9,300,"Chimps",font=2); text(14,100,"Humans",font=2)


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 8. Credit Card Data with lavaan (Instrumental Variables)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# Income and credit card debt
# For a sample of real estate agents in different towns and cities, record
    # HomePrice:    Median price of resale home, in 100k units
    # Income:       Income last year after taxes, in thousands
    # CardDebt:     Credit card balance carried forward, in thousands

rm(list=ls())
cards = read.table("https://www.utstat.toronto.edu/~brunner/openSEM/data/cards2.data.txt")

# Explore the data
head(cards); dim(cards)
summary(cards)
cor(cards)
with(cards, hist(HomePrice) )
with(cards, hist(Income) )
with(cards, hist(CardDebt) )
pairs(cards)

# Generate the graphics directly
jpeg(file = 'hist1.jpg'); with(cards, hist(HomePrice) ); dev.off()
jpeg(file = 'hist2.jpg'); with(cards, hist(Income) ); dev.off()
jpeg(file = 'hist3.jpg'); with(cards, hist(CardDebt) ); dev.off()
jpeg(file = 'pairs.jpg'); with(cards, pairs(cards) ); dev.off()

# Ordinary regression
summary(lm(CardDebt ~ Income, data=cards))

# "Control" for median house price
summary(lm(CardDebt ~ Income + HomePrice, data=cards))

# Now lavaan
# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)

# Ordinary regression with random X 
mod0 = 'CardDebt ~ beta0*1 + beta1*Income
          Income ~~ Phi*Income     # Var(Income) = phi
          Income ~ mu*1            # E(Income) = mu
          CardDebt ~~ psi*CardDebt # Var(epsilon) = psi
         ' # End of model string

fit0 = lavaan(mod0, data=cards)
summary(fit0)
summary(lm(CardDebt ~ Income, data=cards))

1.883^2 # MSE from lm
c( mean(cards$Income), var(cards$Income) )

# Instrumental variables model (Called Model Three in lecture)
mod1 = 'CardDebt ~ beta0*1 + beta1*Income
          Income ~ mux*1                # E(Income)      = mux
          HomePrice ~ muz*1             # E(HomePrice)   = muz
          Income ~~ phix*Income         # Var(Income)    = phix
          HomePrice ~~ phiz*HomePrice   # Var(HomePrice) = phiz
          Income ~~ kappa*HomePrice     # Cov(Income,HomePrice) = kappa
          CardDebt ~~ psi*CardDebt      # Var(epsilon)   = psi
          Income ~~ c*CardDebt          # Cov(Income,epsilon)   = c
         ' # End of model string
fit1 = lavaan(mod1, data=cards)
summary(fit1)

# Checking explicit formula for the MLE.
# (x,y,z) = (Income, CardDebt, HomePrice)

# LaTeX
# \widehat{\boldsymbol{\beta}}_1  =  \widehat{\boldsymbol{\Sigma}}_{23} 
# \widehat{\boldsymbol{\Sigma}}_{13}^{-1}

with(cards, var(CardDebt,HomePrice) / var(Income,HomePrice) )

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 10.  Simulation of Measurement Error in the Explanatory Variables
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# Simulate regression with measurement error in x variables

source("https://www.utstat.toronto.edu/~brunner/openSEM/fun/rmvn.txt")

n = 500 # Sample size
# Regression coefficients
beta0 = 1; beta1 = 1; beta2 = 0
# Expected values of x variables
mux = c(10,20)
# Variance-covariance matrix of x variables: Correlation = 0.75
Phi = rbind(c(20,  30),
            c(30,  80))
# Variance of epsilon
psi = 50
# Variances of measurement error terms: Both reliabilities = 0.80
omega1 = 5; omega2 = 20

# Simulate data
set.seed(9999)

X = rmvn(n,mux,Phi); head(X)
x1 = X[,1]; x2 = X[,2]
epsilon = rnorm(n,0,sqrt(psi))
e1 = rnorm(n,0,sqrt(omega1)); e2 = rnorm(n,0,sqrt(omega2))

y = beta0 + beta1*x1 + beta2*x2 + epsilon
w1 = x1 + e1
w2 = x2 + e2

d = cbind(w1,w2,y); cor(d)

mod = lm(y ~ w1 + w2)
summary(mod)
summary(mod)$coefficients[3,4]

onesim = function()
    {
    X = rmvn(n,mux,Phi); head(X)
    x1 = X[,1]; x2 = X[,2]
    epsilon = rnorm(n,0,sqrt(psi))
    e1 = rnorm(n,0,sqrt(omega1)); e2 = rnorm(n,0,sqrt(omega2))
    y = beta0 + beta1*x1 + beta2*x2 + epsilon
    w1 = x1 + e1
    w2 = x2 + e2
    mod = lm(y ~ w1 + w2)
    return(summary(mod)$coefficients[3,4])
    } # End of onesim

onesim()

#########

mereg <- function(beta0=1, beta1=1, beta2=0, sigmasq = 0.5, 
          mu1=0, mu2=0, phi11=1, phi22=1, phi12 = 0.80, 
          rel1=0.80, rel2=0.80, n=200)
####################################################################
# Model is   Y  = beta0 + beta1 X1 + beta2 X2 + epsilon
#            W1 = X1 + e1
#            W2 = W2 + e2
# Fit naive model 
#            Y  = beta0 + beta1 W1 + beta2 W2 + epsilon
# Inputs are
#
#   beta0, beta1 beta2      True regression coefficients
#   sigmasq                 Var(epsilon)
#   mu1                     E(X1)
#   mu2                     E(X2)
#   phi11                   Var(X1)
#   phi22                   Var(X2)
#   phi12                   Cov(X1,X2) = Corr(X1,X1), because 
#                           Var(X1) = Var(X2) = 1
#   rel1                    Reliability of W1
#   rel2                    Reliability of W2
#   n                       Sample size
# Note: This function uses rmvn, a multivariate normal random number
#       generator I wrote. The rmultnorm of the package MSBVAR does 
#       the same thing but I am having trouble installing it.
####################################################################

    {
    # Calculate SD(e1) and SD(e2)
    sd1 <- sqrt((phi11-rel1)/rel1) 
    sd2 <- sqrt((phi22-rel2)/rel2) 
    # Random number generation
    epsilon <- rnorm(n,mean=0,sd=sqrt(sigmasq))
    e1 <- rnorm(n,mean=0,sd=sd1)
    e2 <- rnorm(n,mean=0,sd=sd2)
    # X1 and X2 are bivariate normal. Need rmvn function.
    Phi <- rbind(c(phi11,phi12),  
                 c(phi12,phi22))
    X <- rmvn(n, mu=c(mu1,mu2), sigma=Phi) # nx2 matrix
    X1 <- X[,1]; X2 <- X[,2]
    # Now generate Y, W1 and W2
    
    Y = beta0 + beta1*X1 + beta2*X2 + epsilon
    W1 = X1 + e1
    W2 = X2 + e2
    
    # Fit the naive model
    mereg <- summary(lm(Y~W1+W2))$coefficients
    mereg # Returns table of beta-hats, SEs, t-statistics and p-values
    } # End function mereg


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 13. Little double measurement regression with lavaan
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

rm(list=ls()); options(scipen=999)
# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)

babydouble = read.table("http://www.utstat.toronto.edu/~brunner/openSEM/data/Babydouble.data.txt")
head(babydouble)
dim(babydouble)
summary(babydouble)
cor(babydouble)

dmodel1 =  'Y ~ beta1*X         # Latent variable model (even though Y is observed)
            X =~ 1*W1 + 1*W2    # Measurement model
            # Variances (covariances would go here too)
            X~~phi*X      # Var(X) = phi
            Y~~psi*Y      # Var(epsilon) = psi
            W1~~omega1*W1 # Var(e1) = omega1
            W2~~omega2*W2 # Var(e2) = omega2
            ' 
dfit1 = lavaan(dmodel1, data=babydouble)
summary(dfit1) 

parameterEstimates(dfit1)
parTable(dfit1)
coef(dfit1) # A vector of MLEs
fitted(dfit1) # Sigma(thetahat) and mu(thetahat)

vcov(dfit1)
logLik(dfit1)

# Fit a restricted model (restricted by H0)
dfit1r = lavaan(dmodel1, data=babydouble, constraints = 'omega1==omega2')
anova(dfit1r,dfit1)
# Put multiple constraints on separate lines, like this. 
dfit1r2 = lavaan(dmodel1, data=babydouble, constraints = 'omega1==omega2
                                                          phi==1')
anova(dfit1r2,dfit1)


# For Wald tests: Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h
source("http://www.utstat.utoronto.ca/~brunner/Rfunctions/Wtest.txt")
LL = cbind(0,0,0,1,-1); LL
Wtest(LL,coef(dfit1),vcov(dfit1))

# Non-linear functions of the parameters with :=
dmodel1b = 'Y ~ beta1*X         # Latent variable model
            X =~ 1*W1 + 1*W2    # Measurement model
            # Variances (covariances would go here too)
            X~~phi*X      # Var(X) = phi
            Y~~psi*Y      # Var(epsilon) = psi
            W1~~omega1*W1 # Var(e1) = omega1
            W2~~omega2*W2 # Var(e2) = omega2
            diff := omega1-omega2
            rel1 := phi/(omega1+phi)
            ' 
dfit1b = lavaan(dmodel1b, data=babydouble)
parameterEstimates(dfit1b)
sqrt(0.01918586) # Compare Z statistic for H0: omega1=omega2


# And one attempt to fit a non-identified model

dmodel0 =  'Y ~ beta1*X    # Latent variable model (even though Y is observed)
            X =~ 1*W1      # Measurement model
            # Variances (covariances would go here too)
            X~~phi*X      # Var(X) = phi
            Y~~psi*Y      # Var(epsilon) = psi
            W1~~omega1*W1 # Var(e1) = omega1
            ' 
dfit0 = lavaan(dmodel0, data=babydouble)
summary(dfit0)


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
14. Bootstrap
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# Testing differences between error variances, to see
# if we have equivalent measurements.

rm(list=ls())
# Parameter values and sample size
phi = 7; omega1 = 3; omega2 = 3
rel1 = round(phi/(phi+omega1),3); rel2 = round(phi/(phi+omega2),3)
c(rel1,rel2) # Reliabilities
n = 1500
# Simulate from t distribution -- heavy tails
# Var(t) = nu/(nu-2)  
set.seed(9999)
x = sqrt(phi) * rt(n,3)/sqrt(3)
e1 = sqrt(omega1) * rt(n,3)/sqrt(3); e2 = sqrt(omega2) * rt(n,3)/sqrt(3) 
w1 = x + e1; w2 = x + e2
ww = cbind(w1,w2)
vcovW = var(ww) * (n-1)/n; vcovW # Divide by n to get MLEs

# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)

# Normal theory with lavaan
mod =   "x =~ 1.0*w1 + 1.0*w2
         x ~~ phi*x; w1 ~~ omega1*w1; w2 ~~ omega2*w2 
         vardiff := omega1-omega2
         "
fit = lavaan(mod, data=ww)
# summary(fit)
parameterEstimates(fit)
thetahat = coef(fit); thetahat

# Bootstrap the "hard" way
# n = dim(ww)[1] is not needed
jar = 1:n; B = 1000
tstar = matrix(NA,B,3) # Rows will hold theta-hat values
colnames(tstar) = names(coef(fit))
for(j in 1:B)
    {
    rowz = sample(jar,size=n,replace=TRUE)
    xstar = ww[rowz,]
    fitstar = lavaan(mod, data=xstar)
    tstar[j,] = coef(fitstar)
    } # Next bootstrap sample
head(tstar)
vdiff = tstar[,2] - tstar[,3] # Vector of omega1hat - omega2hat values

hist(vdiff)
shapiro.test(vdiff) # Test of normality
var(vdiff) 
bootse = sqrt(var(vdiff))
bootse # Compare normal theory estimate of 0.364

z = (thetahat[2]-thetahat[3])/bootse; z # Compare z = 2.124

# Now bootstrap with lavaan: The easy way
fitB = lavaan(mod, data=ww, se = "bootstrap")
parameterEstimates(fitB)

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 15. The BMI Health Study with lavaan
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

rm(list=ls()); options(scipen=999)
bmidata = read.table("http://www.utstat.toronto.edu/~brunner/openSEM/data/bmi.data.txt")

head(bmidata)
dim(bmidata)

###################################
# Naive surface regression        #
###################################

bmi = within(bmidata,{
age = (age1+age2)/2; bmi = (bmi1+bmi2)/2; fat = (fat1+fat2)/2
cholest = (cholest1+cholest2)/2; diastol = (diastol1+diastol2)/2
})
dim(bmi)

fullmod = lm( cbind(cholest,diastol) ~ age + fat + bmi, data=bmi)
summary(fullmod)

restrictedmod = update(fullmod, . ~ . - bmi) # Remove var(s) being tested
anova(fullmod,restrictedmod) # Gives multivariate test.

###################################
# Structural equation model       #
###################################
# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)

bmimodel1 =  
   '########################################################
    # Latent variable model
    # ---------------------
    Lcholest ~ beta11*Lage + beta12*Lbmi + beta13*Lfat 
    Ldiastol ~ beta21*Lage + beta22*Lbmi + beta23*Lfat
    #
    # Measurement model
    # -----------------
    Lage =~ 1*age1 + 1*age2                            
    Lbmi =~ 1*bmi1 + 1*bmi2
    Lfat =~ 1*fat1 +1*fat2
    Lcholest =~ 1*cholest1 + 1*cholest2
    Ldiastol =~ 1*diastol1 + 1*diastol2
    #
    # Variances and covariances
    # -------------------------
    # Of latent explanatory variables
    Lage ~~ phi11*Lage; Lage ~~ phi12*Lbmi; Lage ~~ phi13*Lfat
                        Lbmi ~~ phi22*Lbmi; Lbmi ~~ phi23*Lfat
                                            Lfat ~~ phi33*Lfat
    # Of error terms in latent the regression (epsilon_ij)
    Lcholest ~~ psi11*Lcholest; Lcholest ~~ psi12*Ldiastol 
                                Ldiastol ~~ psi22*Ldiastol
    # Of measurement errors (e_ijk) for measurement set 1
    age1 ~~ w111*age1; age1 ~~ w112*bmi1; age1 ~~ w113*fat1; 
    age1 ~~ w114*cholest1; age1 ~~ w115*diastol1
            bmi1 ~~ w122*bmi1; bmi1 ~~ w123*fat1; bmi1 ~~ w124*cholest1; bmi1 ~~ w125*diastol1
                    fat1 ~~ w133*fat1; fat1 ~~ w134*cholest1; fat1 ~~ w135*diastol1
                            cholest1 ~~ w144*cholest1; cholest1 ~~ w145*diastol1
                                                       diastol1 ~~ w155*diastol1
    # Of measurement errors (e_ijk) for measurement set 2
    age2 ~~ w211*age2; age2 ~~ w212*bmi2; age2 ~~ w213*fat2; 
    age2 ~~ w214*cholest2; age2 ~~ w215*diastol2
            bmi2 ~~ w222*bmi2; bmi2 ~~ w223*fat2; bmi2 ~~ w224*cholest2; bmi2 ~~ w225*diastol2
                    fat2 ~~ w233*fat2; fat2 ~~ w234*cholest2; fat2 ~~ w235*diastol2
                            cholest2 ~~ w244*cholest2; cholest2 ~~ w245*diastol2
                                                       diastol2 ~~ w255*diastol2
   ' ################# End of bmimodel1 #################

fit1 = lavaan(bmimodel1, data=bmidata)
# Lots of warnings, 4241 iterations
summary(fit1)
p1 = parTable(fit1); p1

fit1B = lavaan(bmimodel1, data=bmidata, start = 'Mplus')
# Same warnings, 4241 iterations again

# Start at Method of Moment estimates.

head(bmidata)
W1 = as.matrix(bmidata[,1:3])  # age1 bmi1 fat1
V1 = as.matrix(bmidata[,4:5])  # cholest1 diastol1
W2 = as.matrix(bmidata[,6:8])  # age2 bmi2 fat2
V2 = as.matrix(bmidata[,9:10]) # cholest2 diastol2
var(W1,W2) # Matrix of sample covariances
# Using S as short for Sigmahat, and not worrying about n vs. n-1,
S11 = var(W1); S12 = var(W1,V1); S13 = var(W1,W2); S14 = var(W1,V2)
               S22 = var(V1);    S23 = var(V1,W2); S24 = var(V1,V2)
                                 S33 = var(W2);    S34 = var(W2,V2)
                                                   S44 = var(V2)
# The matrices below should all have "hat" in the name, because they are estimates
Phi = (S13+t(S13))/2
rownames(Phi) = colnames(Phi) = c('Lage','Lbmi','Lfat'); Phi
# To my surprise, these are quite close to the MLEs from the first run.
Beta = 0.5*(t(S14)+S23) %*% solve(Phi)
rownames(Beta) = c('Lcholest','Ldiastol')
colnames(Beta) = c('Lage','Lbmi','Lfat'); Beta
# These are miles away from the supposed MLEs
# Can just say some of the rest are close and others are not. 

Psi = S24 - Beta %*% Phi %*% t(Beta)
rownames(Psi) = colnames(Psi) = c('Lcholest','Ldiastol') # epsilon1, epsilon2
Psi
# Oops, it should be symmetric.
Psi = ( Psi+t(Psi) )/2; Psi
# Again, far away. 
Omega11 = S11 - Phi; Omega11
# Supposed MLEs are pretty close here.
Omega12 = S12 - ( S14+t(S23) )/2; Omega12 # Not too bad
Omega22 = S22-S24 # A little rough but consistent
Omega22 = (Omega22 + t(Omega22)  )/2
Omega22 # Variances okay, covariance off.
Omega33 = S33 - Phi; Omega33 # Not too bad
Omega34 = S34 - ( S14+t(S23) )/2; Omega34 # Not too bad
Omega44 = S44 - S24 ; Omega44 = ( Omega44 + t(Omega44) )/2
Omega44 # Not terrible

# Carefully assemble the MOM estimates into a vector, same order as p1 = parTable(fit1)

mom = c(Beta[1,1], Beta[1,2], Beta[1,3], Beta[2,1], Beta[2,2], Beta[2,3],
        1,1,1,1,1,1,1,1,1,1,
        Phi[1,1], Phi[1,2], Phi[1,3], Phi[2,2], Phi[2,3], Phi[3,3],
        Psi[1,1], Psi[1,2], Psi[2,2], 
        Omega11[1,1], Omega11[1,2], Omega11[1,3], Omega12[1,1], Omega12[1,2], 
        Omega11[2,2], Omega11[2,3], Omega12[2,1], Omega12[2,2],
        Omega11[3,3], Omega12[3,1], Omega12[3,2],
        Omega22[1,1], Omega22[1,2], Omega22[2,2], 
        Omega33[1,1], Omega33[1,2], Omega33[1,3], Omega34[1,1], Omega34[1,2], 
        Omega33[2,2], Omega33[2,3], Omega34[2,1], Omega34[2,2],
        Omega33[3,3], Omega34[3,1], Omega34[3,2],
        Omega44[1,1], Omega44[1,2], Omega44[2,2]      )
length(mom)

p1mom = p1; p1mom[,14] = mom # Replace column 14 (est) with mom

fit2 = lavaan(bmimodel1, data=bmidata, start = p1mom); fit2
summary(fit2)

parTable(fit2)

# Same results, a LOT less mess and effort. This is a notable success.

# Exactly on the money; Chi-squared = 4.6537. It took 242 iterations with these very good starting values. SAS took 5 with its automatic starting values. 

# Now a LR test of BMI, H0: beta12 = beta22 = 0 
# Will the constraint conflict with the starting values?

nobmi = lavaan(bmimodel1, data=bmidata, start = p1mom,
            constraints = 'beta12 == 0
                           beta22 == 0')
anova(nobmi,fit2)

# Two follow-up issues
    # Shorter syntax
    # Bootstrap

cat(bmimodel1)

bmimodel2 =  
   '########################################################
    # Latent variable model
    # ---------------------
    Lcholest ~ beta11*Lage + beta12*Lbmi + beta13*Lfat 
    Ldiastol ~ beta21*Lage + beta22*Lbmi + beta23*Lfat
    #
    # Measurement model
    # -----------------
    Lage =~ 1*age1 + 1*age2; Lbmi =~ 1*bmi1 + 1*bmi2;  Lfat =~ 1*fat1 +1*fat2                     
    Lcholest =~ 1*cholest1 + 1*cholest2; Ldiastol =~ 1*diastol1 + 1*diastol2
    #
    # Variances and covariances DO NOT NEED NAMES!
    # -------------------------
    # Of latent explanatory variables
    Lage ~~ Lage +  Lbmi +  Lfat
            Lbmi ~~ Lbmi +  Lfat
                    Lfat ~~ Lfat
    # Of error terms in the latent regression
    Lcholest ~~ Lcholest +  Ldiastol 
                Ldiastol ~~ Ldiastol
    # Of measurement errors for measurement set 1
    age1 ~~ age1 +  bmi1 +  fat1 + cholest1 + diastol1
            bmi1 ~~ bmi1 +  fat1 + cholest1 + diastol1
                    fat1 ~~ fat1 + cholest1 + diastol1
                            cholest1 ~~ cholest1 + diastol1
                                        diastol1 ~~ diastol1
    # Of measurement errors for measurement set 2
    age2 ~~ age2 +  bmi2 +  fat2 + cholest2 + diastol2
            bmi2 ~~ bmi2 +  fat2 + cholest2 + diastol2
                    fat2 ~~ fat2 + cholest2 + diastol2
                            cholest2 ~~ cholest2 + diastol2
                                        diastol2 ~~ diastol2
   ' ################# End of bmimodel2 #################

fit3 = lavaan(bmimodel2, data=bmidata, start = fit2); fit3
summary(fit3)
summary(fit2) # for comparison

head(parTable(fit2))
head(parTable(fit3))

# Even shorter syntax: Do we really need the beta symbols?
bmimodel3 =  
   '#### Regressions ####
    Lcholest ~ Lage + Lbmi + Lfat 
    Ldiastol ~ Lage + Lbmi + Lfat
    #### Measurement model ####
    Lage =~ 1*age1 + 1*age2; Lbmi =~ 1*bmi1 + 1*bmi2;  Lfat =~ 1*fat1 +1*fat2                     
    Lcholest =~ 1*cholest1 + 1*cholest2; Ldiastol =~ 1*diastol1 + 1*diastol2
    #### Variances and covariances ####
    Lage ~~ Lage +  Lbmi +  Lfat
            Lbmi ~~ Lbmi +  Lfat
                    Lfat ~~ Lfat
    Lcholest ~~ Lcholest +  Ldiastol 
                Ldiastol ~~ Ldiastol
    age1 ~~ age1 +  bmi1 +  fat1 + cholest1 + diastol1
            bmi1 ~~ bmi1 +  fat1 + cholest1 + diastol1
                    fat1 ~~ fat1 + cholest1 + diastol1
                            cholest1 ~~ cholest1 + diastol1
                                        diastol1 ~~ diastol1
    age2 ~~ age2 +  bmi2 +  fat2 + cholest2 + diastol2
            bmi2 ~~ bmi2 +  fat2 + cholest2 + diastol2
                    fat2 ~~ fat2 + cholest2 + diastol2
                            cholest2 ~~ cholest2 + diastol2
                                        diastol2 ~~ diastol2
   ' ################# End of bmimodel3 #################

fit4 = lavaan(bmimodel3, data=bmidata, start = fit2); fit4
summary(fit4)

# Bootstrap
fit3B = lavaan(bmimodel2, data=bmidata, start = fit2, se = "bootstrap")
# Waited between 2 and 3 minutes
summary(fit3B)


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 18. Brand Awareness with lavaan
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

rm(list=ls()); options(scipen=999)
# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)

coffee = read.table("http://www.utstat.toronto.edu/brunner/openSEM/data/timmy1.data.txt")
head(coffee)

# Observed variables
#   w1 = Brand Awareness 1
#   w2 = Brand Awareness 2
#   w3 = Ad Awareness 1
#   w4 = Ad Awareness 2
#   w5 = Interest 1
#   w6 = Interest 2
#   v1 = Purchase Intention 1
#   v2 = Purchase Intention 2
#   v3 = Purchase Behaviour 1
#   v4 = Purchase Behaviour 2
# Latent variables
#   L_BrAw  = True brand awareness
#   L_AdAw  = True advertising awareness
#   L_Inter = True interest in the product category
#   L_PI    = True purchase intention
#   L_PBeh  = True purchase behaviour

torus1 = 
'
# Latent variable model
    L_PI ~ gamma1*L_BrAw + gamma2*L_AdAw + gamma3*L_Inter
    L_PBeh    ~ gamma4*L_Inter + beta*L_PI
# Measurement model (simple double measurement)                  
    L_BrAw  =~ 1*w1 + 1*w2
    L_AdAw  =~ 1*w3 + 1*w4
    L_Inter =~ 1*w5 + 1*w6
    L_PI    =~ 1*v1 + 1*v2
    L_PBeh  =~ 1*v3 + 1*v4
# Variances  and covariances 
    # Exogenous latent variables
        L_BrAw ~~ phi11*L_BrAw   # Var(L_BrAw)         = phi11
        L_BrAw ~~ phi12*L_AdAw   # Cov(L_BrAw,L_AdAw)  = phi12 
        L_BrAw ~~ phi13*L_Inter  # Cov(L_BrAw,L_Inter) = phi13
        L_AdAw ~~ phi22*L_AdAw   # Var(L_AdAw)         = phi22
        L_AdAw ~~ phi23*L_Inter  # Cov(L_AdAw,L_Inter) = phi23
        L_Inter ~~ phi33*L_Inter # Var(L_Inter)        = phi33
    # Errors in the latent model (epsilons)
        L_PI ~~ psi1*L_PI        # Var(epsilon1) = psi1
        L_PBeh ~~ psi2*L_PBeh    # Var(epsilon2) = psi2
    # Measurement errors
        w1 ~~ omega1*w1          # Var(e1)  = omega1
        w2 ~~ omega2*w2          # Var(e2)  = omega2
        w3 ~~ omega3*w3          # Var(e3)  = omega3
        w4 ~~ omega4*w4          # Var(e4)  = omega4
        w5 ~~ omega5*w5          # Var(e5)  = omega5
        w6 ~~ omega6*w6          # Var(e6)  = omega6
        v1 ~~ omega7*v1          # Var(e7)  = omega7
        v2 ~~ omega8*v2          # Var(e8)  = omega8
        v3 ~~ omega9*v3          # Var(e9)  = omega9
        v4 ~~ omega10*v4         # Var(e10) = omega10
# Bounds (Variances are positive)
    phi11 > 0; phi22 > 0; phi33 > 0
    psi1 > 0; psi2 > 0
    omega1 > 0; omega2 > 0; omega3 > 0; omega4 > 0; omega5 > 0 
    omega6 > 0; omega7 > 0; omega8 > 0; omega9 > 0; omega10 > 0 
' # End of model torus1

fit1 = lavaan(torus1, data=coffee)
show(fit1)

# It did not fit, and matched SAS.  
# Split the problem up into parts. Look first at the measurement model.

torus2 = 
'
# Measurement model (still simple double measurement)                  
    L_BrAw  =~ 1*w1 + 1*w2
    L_AdAw  =~ 1*w3 + 1*w4
    L_Inter =~ 1*w5 + 1*w6
    L_PI    =~ 1*v1 + 1*v2
    L_PBeh  =~ 1*v3 + 1*v4
# Variances and covariances 
    # Latent variables
        L_BrAw ~~ phi11*L_BrAw      # Var(L_BrAw)          = phi11
        L_BrAw ~~ phi12*L_AdAw      # Cov(L_BrAw, L_AdAw)  = phi12
        L_BrAw ~~ phi13*L_Inter     # Cov(L_BrAw, L_Inter) = phi13
        L_BrAw ~~ phi14*L_PI        # Cov(L_BrAw, L_PI)    = phi14
        L_BrAw ~~ phi15*L_PBeh      # Cov(L_BrAw, L_PBeh)  = phi15

        L_AdAw ~~ phi22*L_AdAw      # Var(L_AdAw)          = phi22
        L_AdAw ~~ phi23*L_Inter     # Cov(L_AdAw, L_Inter) = phi23
        L_AdAw ~~ phi24*L_PI        # Cov(L_AdAw, L_PI)    = phi24
        L_AdAw ~~ phi25*L_PBeh      # Cov(L_AdAw, L_PBeh)  = phi25

        L_Inter ~~ phi33*L_Inter    # Var(L_Inter)         = phi33
        L_Inter ~~ phi34*L_PI       # Cov(L_Inter, L_PI)   = phi34
        L_Inter ~~ phi35*L_PBeh     # Cov(L_Inter, L_PBeh) = phi35

        L_PI ~~ phi44*L_PI          # Var(L_PI)            = phi44
        L_PI ~~ phi45*L_PBeh        # Cov(L_PI, L_PBeh)    = phi45

        L_PBeh ~~ phi55*L_PBeh      # Var(L_PBeh)          = phi55
    # Measurement errors
        w1 ~~ omega1*w1          # Var(e1)  = omega1
        w2 ~~ omega2*w2          # Var(e2)  = omega2
        w3 ~~ omega3*w3          # Var(e3)  = omega3
        w4 ~~ omega4*w4          # Var(e4)  = omega4
        w5 ~~ omega5*w5          # Var(e5)  = omega5
        w6 ~~ omega6*w6          # Var(e6)  = omega6
        v1 ~~ omega7*v1          # Var(e7)  = omega7
        v2 ~~ omega8*v2          # Var(e8)  = omega8
        v3 ~~ omega9*v3          # Var(e9)  = omega9
        v4 ~~ omega10*v4         # Var(e10) = omega10
# Bounds (Variances are positive)
    phi11 > 0; phi22 > 0; phi33 > 0; phi44 > 0; phi55 > 0
    omega1 > 0; omega2 > 0; omega3 > 0; omega4 > 0; omega5 > 0 
    omega6 > 0; omega7 > 0; omega8 > 0; omega9 > 0; omega10 > 0 
' # End of model torus2

fit2 = lavaan(torus2, data=coffee)
show(fit2)

# Now try to specify the model more briefly, without naming all the variances and covariances of latent variables. By default, the cfa function makes the loading of the first indicator = 1, and all latent variables are correlated. "If the argument std.lv=TRUE is used, the factor loadings of the first indicator of each latent variable will no longer be fixed to 1."

torus2b = 
'
# Measurement model (still simple double measurement)                  
    L_BrAw  =~ 1*w1 + 1*w2
    L_AdAw  =~ 1*w3 + 1*w4
    L_Inter =~ 1*w5 + 1*w6
    L_PI    =~ 1*v1 + 1*v2
    L_PBeh  =~ 1*v3 + 1*v4
# Leave off everything else and see what happens.
' # End of model torus2b
fit2b = cfa(torus2b, data=coffee)

show(fit2)
show(fit2b)

# The measurement model does not fit. Try a true double measurement model, allowing covariances within sets.

torus3 = 
'
# Measurement model (still simple double measurement)                  
    L_BrAw  =~ 1*w1 + 1*w2
    L_AdAw  =~ 1*w3 + 1*w4
    L_Inter =~ 1*w5 + 1*w6
    L_PI    =~ 1*v1 + 1*v2
    L_PBeh  =~ 1*v3 + 1*v4
# Add covariances between measurement error terms, without naming them
    w1 ~~ w3; w1 ~~ w5; w1 ~~ v1; w1 ~~ v3
              w3 ~~ w5; w3 ~~ v1; w3 ~~ v3
                        w5 ~~ v1; w5 ~~ v3
                                  v1 ~~ v3
    w2 ~~ w4; w2 ~~ w6; w2 ~~ v2; w2 ~~ v4
              w4 ~~ w6; w4 ~~ v2; w4 ~~ v4
                        w6 ~~ v2; w6 ~~ v4
                                  v2 ~~ v4
' # End of model torus3
fit3 = cfa(torus3, data=coffee)
lavInspect(fit3, "theta")
# eigen(lavInspect(fit3, "theta"))$values

# Checking why torus3 left the parameter space.
# Obtain MOM estimates for use as starting values.

d1 = as.matrix(coffee[,c(1,3,5,7,9)]) # Measurement set one
d2 = as.matrix(coffee[,c(2,4,6,8,10)]) # Measurement set two
Phi_hat = cov(d1,d2); Phi_hat
# Make it symmetric
Phi_hat = (Phi_hat + t(Phi_hat) )/2; Phi_hat
eigen(Phi_hat)$values # Is it positive definite?
Omega1_hat = cov(d1) - Phi_hat
Omega2_hat = cov(d2) - Phi_hat
eigen(Omega1_hat)$values # Is Omega1_hat positive definite?
eigen(Omega2_hat)$values # Is Omega2_hat positive definite?

anova(fit2,fit3)

torus4 = 
'
# Measurement model              
    L_BrAw  =~ 1*w1 + lambda2*w2
    L_AdAw  =~ 1*w3 + lambda4*w4
    L_Inter =~ 1*w5 + lambda6*w6
    L_PI    =~ 1*v1 + lambda8*v2
    L_PBeh  =~ 1*v3 + lambda10*v4
' # End of model torus4
fit4 = cfa(torus4, data=coffee)
show(fit4)

# Just edit the measurement model part of model 1
torus5 = 
'
# Latent variable model
    L_PI ~ gamma1*L_BrAw + gamma2*L_AdAw + gamma3*L_Inter
    L_PBeh    ~ gamma4*L_Inter + beta*L_PI
# Measurement model              
    L_BrAw  =~ 1*w1 + lambda2*w2
    L_AdAw  =~ 1*w3 + lambda4*w4
    L_Inter =~ 1*w5 + lambda6*w6
    L_PI    =~ 1*v1 + lambda8*v2
    L_PBeh  =~ 1*v3 + lambda10*v4
# Variances  and covariances 
    # Exogenous latent variables
        L_BrAw ~~ phi11*L_BrAw   # Var(L_BrAw)         = phi11
        L_BrAw ~~ phi12*L_AdAw   # Cov(L_BrAw,L_AdAw)  = phi12 
        L_BrAw ~~ phi13*L_Inter  # Cov(L_BrAw,L_Inter) = phi13
        L_AdAw ~~ phi22*L_AdAw   # Var(L_AdAw)         = phi22
        L_AdAw ~~ phi23*L_Inter  # Cov(L_AdAw,L_Inter) = phi23
        L_Inter ~~ phi33*L_Inter # Var(L_Inter)        = phi33
    # Errors in the latent model (epsilons)
        L_PI ~~ psi1*L_PI        # Var(epsilon1) = psi1
        L_PBeh ~~ psi2*L_PBeh    # Var(epsilon2) = psi2
    # Measurement errors
        w1 ~~ omega1*w1          # Var(e1)  = omega1
        w2 ~~ omega2*w2          # Var(e2)  = omega2
        w3 ~~ omega3*w3          # Var(e3)  = omega3
        w4 ~~ omega4*w4          # Var(e4)  = omega4
        w5 ~~ omega5*w5          # Var(e5)  = omega5
        w6 ~~ omega6*w6          # Var(e6)  = omega6
        v1 ~~ omega7*v1          # Var(e7)  = omega7
        v2 ~~ omega8*v2          # Var(e8)  = omega8
        v3 ~~ omega9*v3          # Var(e9)  = omega9
        v4 ~~ omega10*v4         # Var(e10) = omega10
# Bounds (Variances are positive)
    phi11 > 0; phi22 > 0; phi33 > 0
    psi1 > 0; psi2 > 0
    omega1 > 0; omega2 > 0; omega3 > 0; omega4 > 0; omega5 > 0 
    omega6 > 0; omega7 > 0; omega8 > 0; omega9 > 0; omega10 > 0 
' # End of model torus5
fit5 = lavaan(torus5, data=coffee)

# Ouch, seems to have left the parameter space. Suspect problems with the latent variable model, or with the starting values.
summary(fit5)
coef(fit4)

lavInspect(fit5, "cov.lv")
lavInspect(fit4, "cov.lv") # True MLE 


# Careful examination makes me suspect the problem is with the first stage of the latent model. Try for good starting values.

# Based on the sub-model y1 = gamma^t x + epsilon1 (details omitted) 
# Get estimates of gamma and psi1 = Var(epsilon1)


# The names of all these quantities should include "hat."
Phi = lavInspect(fit4, "cov.lv")
Phix = Phi[1:3,1:3]; Phix
Phixy = as.matrix(Phi[1:3,4]); Phixy
gamma = t(Phixy) %*% solve(Phix); gamma
psi1 = Phi[4,4] - as.numeric(gamma %*% Phix %*% t(gamma)); psi1

# These numbers are much more reasonable. See if I can get away with specifying just 10 starting values. Drop the inequality constraints too, since lavaan will issue a warning if any variance estimate is negative.

torus6 = 
'
# Latent variable model
    L_PI ~ gamma1*L_BrAw + start(0.1996458)*L_BrAw +
    gamma2*L_AdAw        + start(0.3932861)*L_AdAw +
    gamma3*L_Inter       + start(0.5622287)*L_Inter
    L_PBeh    ~ gamma4*L_Inter + beta*L_PI
# Measurement model              
    L_BrAw  =~ 1*w1 + lambda2*w2
    L_AdAw  =~ 1*w3 + lambda4*w4
    L_Inter =~ 1*w5 + lambda6*w6
    L_PI    =~ 1*v1 + lambda8*v2
    L_PBeh  =~ 1*v3 + lambda10*v4
# Variances  and covariances 
    # Exogenous latent variables
        L_BrAw ~~ phi11*L_BrAw   + start(19.13510)*L_BrAw  # Var(L_BrAw)         = phi11
        L_BrAw ~~ phi12*L_AdAw   + start(12.29660)*L_AdAw  # Cov(L_BrAw,L_AdAw)  = phi12 
        L_BrAw ~~ phi13*L_Inter  + start(13.50213)*L_Inter # Cov(L_BrAw,L_Inter) = phi13
        L_AdAw ~~ phi22*L_AdAw   + start(15.91372)*L_AdAw  # Var(L_AdAw)         = phi22
        L_AdAw ~~ phi23*L_Inter  + start(11.30579)*L_Inter # Cov(L_AdAw,L_Inter) = phi23
        L_Inter ~~ phi33*L_Inter + start(14.98033)*L_Inter # Var(L_Inter)        = phi33
    # Errors in the latent model (epsilons)
        L_PI ~~ psi1*L_PI + start(3.206661)*L_PI   # Var(epsilon1) = psi1
        L_PBeh ~~ psi2*L_PBeh                      # Var(epsilon2) = psi2
    # Measurement errors
        w1 ~~ omega1*w1          # Var(e1)  = omega1
        w2 ~~ omega2*w2          # Var(e2)  = omega2
        w3 ~~ omega3*w3          # Var(e3)  = omega3
        w4 ~~ omega4*w4          # Var(e4)  = omega4
        w5 ~~ omega5*w5          # Var(e5)  = omega5
        w6 ~~ omega6*w6          # Var(e6)  = omega6
        v1 ~~ omega7*v1          # Var(e7)  = omega7
        v2 ~~ omega8*v2          # Var(e8)  = omega8
        v3 ~~ omega9*v3          # Var(e9)  = omega9
        v4 ~~ omega10*v4         # Var(e10) = omega10
' # End of model torus6
fit6 = lavaan(torus6, data=coffee)
fit6
summary(fit6)
parTable(fit6)

# Likelihood ratio test of 
# H0: lambda2 = lambda4 = lambda6 = lambda8 = lambda10 = 1
anova(fit1,fit6)

# For Wald tests: Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h
source("http://www.utstat.utoronto.ca/~brunner/Rfunctions/Wtest.txt")
thetahat = coef(fit6); thetahat
LL = rbind(c(0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
           c(0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
           c(0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
           c(0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
           c(0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0))
hh = c(1,1,1,1,1)
Wtest(LL,thetahat,vcov(fit6),hh)
options(scipen=999)
Wtest(LL,thetahat,vcov(fit6),hh)

# Which ones are different from one?

pt6 = parTable(fit6); dim(pt6)
z = as.numeric( (pt6[,14]-1)/pt6[,15] )
# Extract only meaningful z statistics (lambda_j)
z = z[c(7,9,11,13,15)]
names(z) = c('lambda2', 'lambda4', 'lambda6', 'lambda8', 'lambda10')
z
pt6[c(7,9,11,13,15),14] # Corresponding theta-hats


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 20. Principal Components on the Diversity Data
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# https://www.utstat.toronto.edu/brunner/data/legal/DiversityExplore.xlsx
# The replication data are in DiversityReplic.xlsx
ddata = read_excel("DiversityExplore.xlsx") # Read local copy
ddata = as.data.frame(ddata) # Instead of a "tibble"
dim(ddata); head(ddata)

quest = as.matrix(ddata[,2:41]); head(quest)
Sigma_hat = cor(quest); round(Sigma_hat,3)

eigenSigma = eigen(Sigma_hat); ls(eigenSigma)
eigenSigma$values

lambda_hat = eigenSigma$values
lambda_hat/40 # Proportions of explained variance
cumsum(lambda_hat/40) # Cumulative sum


# Principal Components
Z = scale(quest) # Standardize columns (This is a 500 x 40 matrix)
C_hat = eigenSigma$vectors 
Y_hat = Z %*% C_hat # Sample principal components
# Looking at the variance-covariance matrix of the principal components,
round(var(Y_hat), 4) # Should equal D

# I would say either keep just one, or keep 7
y = Y_hat[,1:7] # The first 7 components explain about 70% of the variance.
zy = cor(Z,y); round(zy,3)

# This is hard to look at. Sign of an eigenvector is arbitrary.

y = -Y_hat[,1:7] 
zy = cor(Z,y); round(zy,3)

# Now principal components the easy way
# help(prcomp) # Better than princomp
pc = prcomp(quest, scale = T)
ls(pc) # What's in pc?
    # center has the sample means before standardization
    # rotation is C-hat
    # scale has the standard deviations before standardization
    # sdev has estimated standard deviations of the components
    # x is a matrix of the principal components: Y-hat

pc$sdev^2 # Eigenvalues
lambda_hat # For comparison
dim(pc$x) # x is Y-hat

pc7 = prcomp(quest, scale = T, rank = 7) # Retain seven principal components
round(pc7$rotation, 4) # Just the first 7 columns of C-hat
head(pc7$x) # There should be 7 columns

round(cor(quest,pc7$x),4) # Correlations of variables with components

# Look at the a priori scale Commitment to the Organization
pcOrg = prcomp(quest[,1:10], scale = T) # First 10 questions
pcOrg$sdev^2  # Eigenvalues
round(cor(quest[,1:10],pcOrg$x[,1:3]),4) # Correlations of variables with components

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 22. Exploratory Factor Analysis with R
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
#           Diversity data
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# Exploratory Factor Analysis of the Diversity data
rm(list=ls())
# install.packages("readxl", dependencies = TRUE) # Only need to do this once
library(readxl)
# Download the data and put it in your working directory.
# https://www.utstat.toronto.edu/brunner/data/legal/DiversityExplore.xlsx
# The replication data are in DiversityReplic.xlsx
ddata = read_excel("DiversityExplore.xlsx") # Read local copy
ddata = as.data.frame(ddata) # Instead of a "tibble"
dim(ddata); # head(ddata)

quest = as.matrix(ddata[,2:41]); head(quest)

# Check eigenvalues and scree plot
pc = prcomp(quest, scale = T)
# ls(pc) # What's in pc?
    # center has the sample means before standardization
    # rotation is C-hat
    # scale has the standard deviations before standardization
    # sdev has estimated standard deviations of the components
    # x is a matrix of the principal components: Y-hat

Eigenvalue = pc$sdev^2; Eigenvalue
# Seven eigenvalues greater than one

# Scree plot
plot(1:40,Eigenvalue, xlab = "Principal Component", type = "l", 
     main = "Scree Plot of the Diversity Data")

# Zoom in on the first ten
plot(1:10,Eigenvalue[1:10], type = "l", 
    xaxp = c(1,10,9), # Tick marks on x axis
    xlab = "Principal Component", ylab = "Eigenvalue",
     main = "Scree Plot of the Diversity Data")

# Scree plot suggests 6 factors.

# help(factanal)
fa1 = factanal(quest,factors=6) # rotation="varimax" is the default
fa1

# Loadings below 0.1 are blanked out by default
L1 = fa1$loadings; print(L1,cutoff=0.3, sort="True")
# Check SS loadings = 7.853 for Factor One
sum(L1[,1]^2)

# This is such a pretty picture. What if no rotation?
fa2 = factanal(quest,factors=6, rotation="none"); L2 = fa2$loadings
print(L2,cutoff=0.3, sort="True")

# Varimax is much nicer. 

ls(fa1)

# It's common to specify a rotation (or accept the default) in the initial fit, but one can also fit a model without rotation, and then rotate the factors as a separate step.


varimax(L2)
fa1$rotmat

varimaxL2 = varimax(L2); print(varimaxL2$loadings, cutoff=0.3, sort="True")

# Scales

# I am thinking that the a priori scales are pretty good. In fact, they are amazing.
    # Are they really uncorrelated?
    # What factor(s) underlie them?

# head(quest)

Com = apply(quest[,1:10],MARGIN=1, FUN=sum) # Commitment to Organization
RelC = apply(quest[,11:15],MARGIN=1, FUN=sum) # Relations with Colleagues
RelM = apply(quest[,16:27],MARGIN=1, FUN=sum) # Relations with Management
Fair = apply(quest[,28:33],MARGIN=1, FUN=sum) # Fair opportunities for advancement
Sat = apply(quest[,34:37],MARGIN=1, FUN=sum) # Job satisfaction
SMdiv = apply(quest[,38:40],MARGIN=1, FUN=sum) # Sr. Man commitment to diversity

# Checking
# head(cbind(quest[,1:10],Com))
# head(cbind(quest[,11:15],RelC))
# head(cbind(quest[,16:27],RelM))
# head(cbind(quest[,28:33],Fair))
# head(cbind(quest[,34:37],Sat))
# head(cbind(quest[,38:40],SM))

scaledat = cbind(Com, RelC, RelM, Fair, Sat, SMdiv)
head(scaledat); dim(scaledat)
corrmat = cor(scaledat); corrmat
# Under H0: rho = 0, t = r * sqrt(n-2) / sqrt(1-r^2) ~ t(n-2)
critval = qt(0.975,498); critval
# Calculate all the t statistics
n = 500; tmat = corrmat * sqrt(n-2) / sqrt(1-corrmat^2)
round(tmat,3)

# Factor analysis: How many factors?

pc2 = prcomp(scaledat, scale = T)
Eigenvalue = pc2$sdev^2; Eigenvalue

# Scree plot
plot(1:6,Eigenvalue, xlab = "Principal Component", type = "l", 
     main = "Scree Plot of the Diversity Scales")
# cor(scaledat,pc2$x) # Correlations of variables with principal components

# Either one factor or two

one = factanal(scaledat,factors=1); L.1 = one$loadings
print(one,cutoff=0.3, sort="True")

two = factanal(scaledat,factors=2)
print(two,cutoff=0.3, sort="True")

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
#           Simulated data
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

rm(list=ls())

# Factor analysis of simulated data
# Two orthogonal factors, normal distribution

rm(list=ls())
n = 50000 # Huge sample size
# True factor loadings have a simple structure like varimax (All communalities = 0.49)
# Factor loadings
L11 = 0.7; L12 = 0.0
L21 = 0.7; L22 = 0.0
L31 = 0.7; L32 = 0.0
L41 = 0.7; L42 = 0.0
L51 = 0.0; L52 = 0.7
L61 = 0.0; L62 = 0.7
L71 = 0.0; L72 = 0.7
L81 = 0.0; L82 = 0.7
# Error Variances
v1 = 1 - L11**2 - L12**2
v2 = 1 - L21**2 - L22**2
v3 = 1 - L31**2 - L32**2
v4 = 1 - L41**2 - L42**2
v5 = 1 - L51**2 - L52**2
v6 = 1 - L61**2 - L62**2
v7 = 1 - L71**2 - L72**2
v8 = 1 - L81**2 - L82**2
# Generate data
set.seed(9999)
F1 = rnorm(n,0,1); F2 = rnorm(n,0,1)
d1 = L11*F1 + L12*F2 + rnorm(n,0,sqrt(v1))
d2 = L21*F1 + L22*F2 + rnorm(n,0,sqrt(v2))
d3 = L31*F1 + L32*F2 + rnorm(n,0,sqrt(v3))
d4 = L41*F1 + L42*F2 + rnorm(n,0,sqrt(v4))
d5 = L51*F1 + L52*F2 + rnorm(n,0,sqrt(v5))
d6 = L61*F1 + L62*F2 + rnorm(n,0,sqrt(v6))
d7 = L71*F1 + L72*F2 + rnorm(n,0,sqrt(v7))
d8 = L81*F1 + L82*F2 + rnorm(n,0,sqrt(v8))
dmat = cbind(d1,d2,d3,d4,d5,d6,d7,d8)

factanal(dmat,factors=2,rotation='varimax')

# The truth again for comparison
Lambda = rbind(c(L11,L12),  
               c(L21,L22), 
               c(L31,L32), 
               c(L41,L42), 
               c(L51,L52), 
               c(L61,L62), 
               c(L71,L72), 
               c(L81,L82)  )
Lambda

# Example 2: Truth is not like varimax (All communalities = 0.50)
# Factor loadings
L11 = 0.5; L12 = -0.5
L21 = 0.5; L22 = -0.5
L31 = 0.5; L32 = -0.5
L41 = 0.5; L42 = -0.5
L51 = 0.5; L52 =  0.5
L61 = 0.5; L62 =  0.5
L71 = 0.5; L72 =  0.5
L81 = 0.5; L82 =  0.5
# Error Variances
v1 = 1 - L11**2 - L12**2
v2 = 1 - L21**2 - L22**2
v3 = 1 - L31**2 - L32**2
v4 = 1 - L41**2 - L42**2
v5 = 1 - L51**2 - L52**2
v6 = 1 - L61**2 - L62**2
v7 = 1 - L71**2 - L72**2
v8 = 1 - L81**2 - L82**2
# Generate data
set.seed(8888)
F1 = rnorm(n,0,1); F2 = rnorm(n,0,1)
d1 = L11*F1 + L12*F2 + rnorm(n,0,sqrt(v1))
d2 = L21*F1 + L22*F2 + rnorm(n,0,sqrt(v2))
d3 = L31*F1 + L32*F2 + rnorm(n,0,sqrt(v3))
d4 = L41*F1 + L42*F2 + rnorm(n,0,sqrt(v4))
d5 = L51*F1 + L52*F2 + rnorm(n,0,sqrt(v5))
d6 = L61*F1 + L62*F2 + rnorm(n,0,sqrt(v6))
d7 = L71*F1 + L72*F2 + rnorm(n,0,sqrt(v7))
d8 = L81*F1 + L82*F2 + rnorm(n,0,sqrt(v8))
dmat = cbind(d1,d2,d3,d4,d5,d6,d7,d8)

notsimple = factanal(dmat,factors=2,rotation='varimax'); notsimple

# Is there a possible rotation that would get us close to the truth?
# Procrustes rotation
# install.packages("MCMCpack", dependencies=TRUE) # Only need to do this once
library(MCMCpack)
# help(procrustes)
# Estimated factor loadings for the second example
L = notsimple$loadings; print(L,cutoff=0) 
Lambda = rbind(c(L11,L12),                # True factor loadings
               c(L21,L22), 
               c(L31,L32), 
               c(L41,L42), 
               c(L51,L52), 
               c(L61,L62), 
               c(L71,L72), 
               c(L81,L82)  )
Lambda # True Lambda -- How close can we get to this?
# Rotate X to approximate Xstar. It's orthogonal: X R ~ Xstar
pro = procrustes(X = L, Xstar = Lambda) 
round(pro$X.new,2)

round( L %*% pro$R , 2) # Rotating "manually"
pro$R %*% t(pro$R) # R is an orthogonal matrix

# Now try a Procrustes rotation of something unrelated to the truth
M  = rbind(c(0.30,0.64),
           c(0.30,0.64),
           c(0.30,0.64),
           c(0.30,0.64),
           c(0.30,0.64),
           c(0.30,0.64),
           c(0.30,0.64),
           c(0.30,0.64)  ); M
procrustes(X = L, Xstar = M)$X.new

round(pro$X.new,2) # Repeating for comparison


# The truth is very close to a member of the set of factor matrices that can be reached by an orthogonal rotation.  The problem is that we don't know which one.

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 24. Rotating Principal Components with R: Diversity data
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

rm(list=ls())
# install.packages("readxl", dependencies = TRUE) # Only need to do this once
library(readxl)
# Download the data and put it in your working directory.
# https://www.utstat.toronto.edu/brunner/data/legal/DiversityExplore.xlsx
# The replication data are in DiversityReplic.xlsx
ddata = read_excel("DiversityExplore.xlsx") # Read local copy
ddata = as.data.frame(ddata) # Instead of a "tibble"
dim(ddata); head(ddata)
# Saved DiversityWorkspace file

quest = as.matrix(ddata[,2:41])
pc7 = prcomp(quest, scale = T, rank = 7) # Retain seven principal components
ls(pc7)
# The seven principal components are in pc7$x
L = cor(quest,pc7$x) # It really should be Lhat
# round(L,4)  
vm7 = varimax(L); L2 = vm7$loadings

print(L2,cutoff=0.3)
sum(L2[,3]^2)    # Amount of variance explained by component three.
sum(L2[,3]^2)/40 # Proportion of variance explained by component three.

# If we had the questionnaire, we might be able to understand PC7
# It explains only 3% of the variance. Try dropping it.

pc6 = prcomp(quest, scale = T, rank = 6)
L = cor(quest,pc6$x)
vm6 = varimax(L); L2 = vm6$loadings
print(L2,cutoff=0.3)

# It's always okay to reverse signs in a column.
# It just reverses the meaning of the component.
# Reverse columns 2, 4 and 5, and then name the components.
L2[,c(2,4,5)] = -L2[,c(2,4,5)]
colnames(L2) = c("RelMan", "CommitOrg", "RelCol", "SMCD", "JobSat", "FairOp")
print(L2,cutoff=0.3)

# Produce rotated components for future use

# Display equations here. LateX is
# \begin{eqnarray*}
#     \mathbf{z} & = & \mathbf{Lf} + \mathbf{e} \\ 
#     & = &  \mathbf{L} \, \mathbf{R}^\top\mathbf{R} \, \mathbf{f} + \mathbf{e}  \\
#     & = &  (\mathbf{L} \, \mathbf{R}^\top) (\mathbf{R} \, \mathbf{f}) + \mathbf{e}  \\ 
#     & = &  \mathbf{L}_2 \mathbf{f}^\prime + \mathbf{e}
# \end{eqnarray*} % 24 pt

n = dim(quest)[1]
f = scale(pc6$x) * sqrt(n/(n-1)) # Divide by n, not n-1
fprime = f %*% vm6$rotmat
fprime[,c(2,4,5)] = -fprime[,c(2,4,5)]
colnames(fprime) = colnames(L2)
# Verify L2 = cor(data, fprime)
round(cor(quest,fprime),3)

# Now use the components
data.frame(1:47,colnames(ddata))

reduced_data = cbind(ddata[,42:47], fprime)
head(reduced_data)
summary(reduced_data)

# The demographic variables are character. Fix.
# Mostly make variables numeric indicators
reduced_data = within(reduced_data, 
    {
    Gender = as.numeric(Gender)
    VisMinority  = as.numeric(VisMinority)
    EDUCLevel  = as.numeric(EDUCLevel)
    # Make MaritalStatus a factor, married = 2 is reference category
    MaritalStatus  = factor(MaritalStatus)
    contrasts(MaritalStatus) = contr.treatment(4, base = 2) 
    Age  = as.numeric(Age)
    CAN_Foreign_Born = as.numeric(CAN_Foreign_Born)
    })

table(reduced_data$MaritalStatus, useNA = "ifany")
# Codes are 1 = Never married, 2 = Married, 3 = Divorced or separated,
#           4 = Widowed

head(reduced_data)

# Quick look at rotated PCs by binary demographics
democor = cor(reduced_data[,c(1:3,5,6)], reduced_data[,7:12], use = "pairwise.complete.obs")
round(democor,3)

# Eliminate all data with any missing values.
noNAs = na.omit(reduced_data); dim(noNAs)
r = cor( noNAs[,c(1:3,5,6)], noNAs[,7:12] )
round(r , 3)

# Test: Are those correlations non-zero?
n = dim(noNAs)[1]
ttable = r * sqrt(n-2) / sqrt(1-r^2)
round(ttable,3)
critval = qt(0.975,n-2); critval

# Bonferroni correcting for 30 tests: Divide alpha by 30
a = 0.05/30
Ccritval = qt(1-a/2,n-2); Ccritval

# Visible minority and job satisfaction: What is the actual mean difference?
# Had r = -0.168, t = -3.553
t.test(JobSat ~ VisMinority, var.equal = TRUE, data = reduced_data)

# What does a difference of 0.36 SDs look like?
z = seq(-3,3,length = 100)
Density = dnorm(z, mean=0.108); d2 = dnorm(z, mean = -0.251)
plot(z,Density, type = "l") # That's an L
lines(z,d2,lty = 1); title("Difference of 0.36 SD Units")


# One regression
satisf = lm(JobSat ~ Gender + VisMinority + EDUCLevel + 
                      MaritalStatus + Age + CAN_Foreign_Born, data = reduced_data)
summary(satisf)

# Well, one more
sat2 = update(satisf, . ~ . - CAN_Foreign_Born) # Eliminate the Americans
summary(sat2)


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 27. Confirmatory Factor Analysis with R: Body-Mind data
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# CFA of Body-mind data

# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)

rm(list=ls())
bodymind = read.table("http://www.utstat.toronto.edu/~brunner/openSEM/data/bodymind.data.txt")
bodymind = read.table("bodymind.data.txt") # Local copy
head(bodymind)

# Need a numeric dummy variable for sex/gender
n = dim(bodymind)[1]; n
bodymind = within(bodymind,{
    Sex = numeric(n)
    Sex[sex == "F"] = 1 # Makes 0=M, 1=F
}) # End within bodymind
head(bodymind)

# Observe that the parameters of the sub-model with Sex and size are identifiable.

# In mod1, name all the parameters and use the lavaan function.
mod1 = "########################################################################
        # Measurement model
        # -----------------
        size  =~ 1.0*height + lambda2*weight
        smart =~ 1.0*verbal + lambda4*progmat + lambda5*reason
        # Variances and covariances
        # -------------------------
        Sex ~~ phi11*Sex + phi12*size + phi13*smart
                  size ~~ phi22*size + phi23*smart
                          smart ~~ phi33*smart
        height  ~~ omega1*height    # Var(e1) = omega1
        weight  ~~ omega2*weight    # Var(e2) = omega2
        verbal  ~~ omega3*verbal    # Var(e3) = omega3
        progmat ~~ omega4*progmat   # Var(e4) = omega4
        reason  ~~ omega5*reason    # Var(e5) = omega5
        ########################################################################
        " # End of mod1

fit1 = lavaan(mod1, data=bodymind)
varTable(fit1)

# It may be okay, but re-express height in metres
bodymind = within(bodymind,{height = height/10}) 
fit1 = lavaan(mod1, data=bodymind)
summary(fit1)

# Variance of Sex is really p(1-p) -- Bernoulli. 
# Estimates are well behaved. I think it's okay.

# By the way, the model does not fit. This is an unpleasant surprise, but these are real data.

# Fit the model more easily with cfa. Sex is included in the last line.
mod2 = "size  =~ height + weight
        smart =~ verbal + progmat + reason
        Sex ~~ size + smart" # Last line is unusual.
        
fit2 = cfa(mod2, data=bodymind)
summary(fit2)

# Parameter estimates match perfectly.

# "Standardized parameter estimates"  (Adds two columns ...)
summary(fit2, standardized=TRUE)

# Note: Calculation of estimates for the standardized models is described in the text.

# The only problem with Std.lv and Std.all is that you don't get standard errors. 

BMdat = bodymind[,c(2:4, 9:11)]
kor = cor(BMdat); round(kor,3)

fit3 = cfa(mod2, sample.cov=kor, sample.nobs=80, std.lv=TRUE, sample.cov.rescale=FALSE)
summary(fit3, standardized=TRUE)
parameterEstimates(fit3)

# Perfect match, no complaints
# Note how std.lv=TRUE wisely un-did the setting of factor loadings to one.

# Why is the model not fitting?
# Look at the correlation matrix again.
round(kor,3)

# Residuals are model-implied covariances (correlations), 
# minus observed covariances (correlations).

lavResiduals(fit2, se=TRUE) # cor by default, se is optional
fitted(fit2) # Sigma(thetahat)

# How about an additional link between Sex and progmat?

# Parameters are identifiable ...


mod3 = "size  =~ height + weight
        smart =~ verbal + progmat + reason
        Sex ~~ size + smart + progmat"
fit4 = cfa(mod3, data=bodymind)
summary(fit4)
# That was ugly and strange. Try fuller model specification

mod4 = "########################################################################
        # Measurement model
        # -----------------
        size  =~ 1.0*height + lambda2*weight
        smart =~ 1.0*verbal + lambda4*progmat + lambda5*reason
        # Variances and covariances
        # -------------------------
        Sex ~~ phi11*Sex + phi12*size + phi13*smart
                   size ~~ phi22*size + phi23*smart
                               smart ~~ phi33*smart
        Sex ~~ c*progmat # Covariance with error term
        height  ~~ omega1*height    # Var(e1) = omega1
        weight  ~~ omega2*weight    # Var(e2) = omega2
        verbal  ~~ omega3*verbal    # Var(e3) = omega3
        progmat ~~ omega4*progmat   # Var(e4) = omega4
        reason  ~~ omega5*reason    # Var(e5) = omega5
        ########################################################################
        " # End of mod4
fit4 = lavaan(mod4, data=bodymind)
# Tried other things ...

# Post to the group and display that

# Make the link from smart to progmat a "regression."

mod5 = "########################################################################
        # Regressions
        # -------------
        progmat ~ lambda4 * smart
        # Measurement model
        # -----------------
        size  =~ 1.0*height + lambda2*weight
        smart =~ 1.0*verbal + lambda5*reason
        # Variances and covariances
        # -------------------------
        Sex ~~ phi11*Sex + phi12*size + phi13*smart
                   size ~~ phi22*size + phi23*smart
                               smart ~~ phi33*smart
        Sex ~~ c*progmat # Covariance with error term -- an epsilon this time.
        height  ~~ omega1*height    # Var(e1) = omega1
        weight  ~~ omega2*weight    # Var(e2) = omega2
        verbal  ~~ omega3*verbal    # Var(e3) = omega3
        progmat ~~ omega4*progmat   # Var(e4) = omega4
        reason  ~~ omega5*reason    # Var(e5) = omega5
        ########################################################################
        " # End of mod5
fit5 = lavaan(mod5, data=bodymind)
summary(fit5)

# Add functions for the CORRELATIONS between factors

mod6 = "########################################################################
        # Regressions
        # -------------
        progmat ~ lambda4 * smart
        # Measurement model
        # -----------------
        size  =~ 1.0*height + lambda2*weight
        smart =~ 1.0*verbal + lambda5*reason
        # Variances and covariances
        # -------------------------
        Sex ~~ phi11*Sex + phi12*size + phi13*smart
                   size ~~ phi22*size + phi23*smart
                               smart ~~ phi33*smart
        Sex ~~ c*progmat # Covariance with error term -- an epsilon this time.
        height  ~~ omega1*height    # Var(e1) = omega1
        weight  ~~ omega2*weight    # Var(e2) = omega2
        verbal  ~~ omega3*verbal    # Var(e3) = omega3
        progmat ~~ omega4*progmat   # Var(e4) = omega4
        reason  ~~ omega5*reason    # Var(e5) = omega5
        # Correlations between factors
        # ----------------------------
        corr12 := phi12/sqrt(phi11*phi22)
        corr13 := phi13/sqrt(phi11*phi33)
        corr23 := phi23/sqrt(phi22*phi33)
        ########################################################################
        " # End of mod6
fit6 = lavaan(mod6, data=bodymind)
summary(fit6, standardized=TRUE)
# parameterEstimates(fit6)

# Bring in the head size variables. No need for symbols. 

mod7 = "########################################################################
        # Regressions
        # -------------
        progmat ~ lambda4 * smart
            headlng ~ Sex + size + smart
            headbrd ~ Sex + size + smart
            headcir ~ Sex + size + smart
            bizyg   ~ Sex + size + smart
        # Measurement model
        # -----------------
        size  =~ 1.0*height + lambda2*weight
        smart =~ 1.0*verbal + lambda5*reason
        # Variances and covariances
        # -------------------------
        Sex ~~ phi11*Sex + phi12*size + phi13*smart
                   size ~~ phi22*size + phi23*smart
                               smart ~~ phi33*smart
        Sex ~~ c*progmat # Covariance with error term -- an epsilon this time.
        height  ~~ omega1*height    # Var(e1) = omega1
        weight  ~~ omega2*weight    # Var(e2) = omega2
        verbal  ~~ omega3*verbal    # Var(e3) = omega3
        progmat ~~ omega4*progmat   # Var(e4) = omega4
        reason  ~~ omega5*reason    # Var(e5) = omega5
            # Variances and covariances between error terms of head variables
            # ---------------------------------------------------------------
            headlng ~~ headlng + headbrd + headcir + bizyg
                       headbrd ~~ headbrd + headcir + bizyg
                                  headcir ~~ headcir + bizyg
                                             bizyg ~~ bizyg
        # Correlations between factors
        # ----------------------------
        corr12 := phi12/sqrt(phi11*phi22)
        corr13 := phi13/sqrt(phi11*phi33)
        corr23 := phi23/sqrt(phi22*phi33)
        ########################################################################
        " # End of mod7
fit8 = lavaan(mod7, data=bodymind); summary(fit8)
parameterEstimates(fit8)

# fit8b = lavaan(mod7, se="bootstrap", data=bodymind)
# parameterEstimates(fit8b)
# parameterEstimates(fit8) # For comparison -- no bootstrap


# I am not really comfortable with arrows coming from Sex.

mod8 = "########################################################################
        # Regressions
        # -------------
        progmat ~ lambda4 * smart
        headlng ~ size + smart
        headbrd ~ size + smart
        headcir ~ size + smart
        bizyg   ~ size + smart
        # Measurement model
        # -----------------
        size  =~ 1.0*height + lambda2*weight
        smart =~ 1.0*verbal + lambda5*reason
        # Variances and covariances
        # -------------------------
        Sex ~~ phi11*Sex + phi12*size + phi13*smart
                   size ~~ phi22*size + phi23*smart
                               smart ~~ phi33*smart
        Sex ~~ c*progmat # Covariance with error term -- an epsilon this time.
        height  ~~ omega1*height    # Var(e1) = omega1
        weight  ~~ omega2*weight    # Var(e2) = omega2
        verbal  ~~ omega3*verbal    # Var(e3) = omega3
        progmat ~~ omega4*progmat   # Var(e4) = omega4
        reason  ~~ omega5*reason    # Var(e5) = omega5
        # Variances and covariances between error terms of head variables
        # ---------------------------------------------------------------
        headlng ~~ headlng + headbrd + headcir + bizyg
                   headbrd ~~ headbrd + headcir + bizyg
                              headcir ~~ headcir + bizyg
                                         bizyg ~~ bizyg
        # Correlations between factors
        # ----------------------------
        corr12 := phi12/sqrt(phi11*phi22)
        corr13 := phi13/sqrt(phi11*phi33)
        corr23 := phi23/sqrt(phi22*phi33)
        ########################################################################
        " # End of mod8

fit9 = lavaan(mod8, data=bodymind); summary(fit9)


anova(fit8,fit9) # Should be 4 df

# Sex is not normally distributed. Try bootstrap.

fit10 = lavaan(mod7, data=bodymind, se="bootstrap"); # summary(fit10)
parameterEstimates(fit10)

parameterEstimates(fit8) # For comparison


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 30. Multi-Group SEM with R

# Groups

# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)

rm(list=ls())
bodymind = read.table("http://www.utstat.toronto.edu/~brunner/openSEM/data/bodymind.data.txt")
# bodymind = read.table("bodymind.data.txt") # Local copy
head(bodymind)

# Basic two-factor model
mod1 = "Size  =~ height + weight
        Smart =~ verbal + progmat + reason"

fit0 = cfa(mod1, data=bodymind); show(fit0)

# Include sex as a grouping factor. 
# By default, the same model is fitted in all groups.
# Different parameter estimates for M and F
fit1 = cfa(mod1, data=bodymind, group = "sex")
summary(fit1)

# Notice that intercepts are included. Model fit should not be affected.
# Also notice Size ~~ Smart is significant for F but not M.

# Test equal fit. Disregard intercepts and just set the following equal across groups:

# lambda2, lambda4, lambda5, phi11, phi12, phi22, omega1, ..., omega5
# There are eleven parameters, should be 11 df.

# Parameters can be set equal by giving them the same names, but this is more convenient.
fit2 = cfa(mod1, data=bodymind, group = "sex",
       group.equal = c("loadings", "residuals", "lv.variances", "lv.covariances"))
summary(fit2)

anova(fit1,fit2)

# Data are more likely when the model is less restricted, so fit1-fit2
2*(logLik(fit1) - logLik(fit2) )

# You might think the G^2 test of fit for fit0 and fit2 should be the same, but it can't be. Fit0 does not know about the separate sample covariance matrices for M and F.

# Test difference in just Size ~~ Smart

mod2 = "Size  =~ height + weight
        Smart =~ verbal + progmat + reason
        Size ~~ c(phi12,phi12)*Smart" # Vector of parameter names
        
fit3 = cfa(mod2, data=bodymind, group = "sex")
summary(fit3)

anova(fit1,fit3)

# There are also tricks for setting a lot of parameters equal across groups, and then freeing some of them, allowing them to be different. A google search for lavaan tutorial takes you to the right document.

# Try radically different models for M and F, just as an experiment.

Twomods = 
"
group: 1
progmat ~ verbal + theta*reason 

group: 2
Size  =~ height + weight
Smart =~ verbal + progmat + theta*reason
"

fit4 = cfa(Twomods, data=bodymind, group = "sex")
summary(fit4)

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# 31. Analysis of Unstructured Covariance Matrices with R

rm(list=ls())
# install.packages("lavaan", dependencies = TRUE) # Only need to do this once
library(lavaan)

longIQ = read.table("https://www.utstat.toronto.edu/brunner/data/illegal/origIQ.data.txt")
colnames(longIQ) = c("amEduc", "bmIQ", "IQ2", "IQ4", "IQ8", "IQ13")
head(longIQ)
dim(longIQ)
# summary(longIQ)

r = round(cor(longIQ), 3); r
# Under H0: rho = 0, t = r * sqrt(n-2) / sqrt(1-r^2) ~ t(n-2)
n = dim(longIQ)[1]; n
critval = qt(0.975,n-2); critval
tstat = r * sqrt(n-2) / sqrt(1-r^2); round(tstat,3)

# In this model, the only parameters are variances and covariances
# of the observable variables.
covmod0 = "amEduc ~~ amEduc + bmIQ  + IQ2 +  IQ4 +  IQ8 + IQ13
                      bmIQ ~~ bmIQ  + IQ2 +  IQ4 +  IQ8 + IQ13
                               IQ2 ~~ IQ2 +  IQ4 +  IQ8 + IQ13
                                      IQ4 ~~ IQ4 +  IQ8 + IQ13
                                             IQ8 ~~ IQ8 + IQ13 
                                                  IQ13 ~~ IQ13"
fit0 = lavaan(covmod0,data=longIQ)
summary(fit0)

# Okay, that's kind of reasonable. Try to make them correlations.

fit0.1 = lavaan(covmod0, data=longIQ, std.ov=TRUE)
# summary(fit0.1)

fitted(fit0.1) # Sigma(thetahat)
r # For comparison
fitted(fit0.1)$cov * n/(n-1)

# So asymptotically it's fine, but I want the MLEs to be sample correlations.
# Standardize the variables myself.

standIQ = scale(longIQ) * sqrt(n/(n-1))
fit0.2 = lavaan(covmod0, data=standIQ)
fitted(fit0.2); r # Sigma(thetahat) should be correlation matrix

# Now want names for parameters so I can test differences, etc.

covmod1 = 
"amEduc ~~ amEduc + rho12*bmIQ + rho13*IQ2 + rho14*IQ4 + rho15*IQ8 + rho16*IQ13
                  bmIQ ~~ bmIQ + rho23*IQ2 + rho24*IQ4 + rho25*IQ8 + rho26*IQ13
                                IQ2 ~~ IQ2 + rho34*IQ4 + rho35*IQ8 + rho36*IQ13
                                            IQ4 ~~ IQ4 + rho45*IQ8 + rho46*IQ13
                                                        IQ8 ~~ IQ8 + rho56*IQ13 
                                                                   IQ13 ~~ IQ13"
fit1 = lavaan(covmod1, data=standIQ) # This will be the full model.
parameterEstimates(fit1)
# fitted(fit1); r # Checks

# Likelihood ratio test of relationship between adoptive mom's education 
# and kid's IQ at any age
noAM = lavaan(covmod1, data=standIQ, constraints = "rho13==0
                                                    rho14==0
                                                    rho15==0
                                                    rho16==0")
anova(noAM,fit1)

# Likelihood ratio test of relationship between birth mom's IQ 
# and kid's IQ at any age

noBM = lavaan(covmod1, data=standIQ, constraints = "rho23==0
                                                    rho24==0
                                                    rho25==0
                                                    rho26==0")
anova(noBM,fit1)
# Multivariate regression yields p = 0.022

# ntheory = lm(cbind(IQ2, IQ4, IQ8, IQ13) ~ bmIQ, data = longIQ)
# summary(ntheory) # t statistics agree with matrix tstat
# anova(ntheory)

# Likelihood ratio test: Is birth mother's IQ correlated with child's IQ 
# differently at different ages?

BMsame = lavaan(covmod1, data=standIQ, constraints = "rho23==rho24
                                                      rho24==rho25
                                                      rho25==rho26")
anova(BMsame,fit1)

# This is surprising. Try a Wald test
# For Wald tests: Wtest = function(L,Tn,Vn,h=0) # H0: L theta = h
source("https://www.utstat.toronto.edu/brunner/openSEM/fun/Wtest.txt")

Vhat = vcov(fit1) # Asymptotic covariance matrix of the estimates
rhohat = coef(fit1); rhohat
length(rhohat)
rbind(names(rhohat))

L0 = matrix(0,3,21)
L0[1,8]  = 1;  L0[1,9]  = -1 # rho23==rho24
L0[2,9]  = 1;  L0[2,10] = -1 # rho24==rho25
L0[3,10] = 1;  L0[3,11] = -1 # rho25==rho26

Wtest(L0,rhohat,Vhat)

# Just test 2 years old versus 13

Two_vs_13 = matrix(0,1,21); Two_vs_13[1,8] = 1; Two_vs_13[1,11] = -1
Wtest(Two_vs_13,rhohat,Vhat)

# However, Bonferroni correcting for 6 pairwise tests, need p < ...
0.05/6

# One could bootstrap, but IQ scores are well known to be normally distributed.


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #