Confirmatory Factor Analysis
This mostly follows Bollen (1989) for maximum likelihood estimation of a confirmatory factor analysis. In the following example we will examine a situation where there are two underlying (correlated) latent variables for 8 observed responses. The code as is will only work with this toy data set. Setup uses the psych and mvtnorm packages, and results are checked against the lavaan package.
Data Setup
For the data we will simulate observed variables with specific loadings on two latent constructs (factors).
library(tidyverse)
set.seed(123)
# loading matrix
= matrix(
lambda c(1.0, 0.5, 0.8, 0.6, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 1.0, 0.7, 0.6, 0.8),
nrow = 2,
byrow = TRUE
)
# correlation of factors
= matrix(c(1, .25, .25, 1), nrow = 2, byrow = TRUE)
phi
# factors and some noise
= mvtnorm::rmvnorm(1000, mean = rep(0, 2), sigma = phi, "chol")
factors = mvtnorm::rmvnorm(1000, sigma = diag(8))
e
# observed responses
= 0 + factors%*%lambda + e
y
# Examine
#dim(y)
::describe(y) psych
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 1000 0.05 1.44 0.05 0.05 1.42 -5.13 4.51 9.63 0.00 0.01 0.05
X2 2 1000 0.00 1.08 -0.01 0.00 1.04 -3.34 3.25 6.59 0.00 -0.06 0.03
X3 3 1000 0.01 1.27 0.05 0.01 1.17 -5.33 4.09 9.42 -0.06 0.32 0.04
X4 4 1000 0.00 1.14 -0.03 -0.01 1.13 -3.85 3.98 7.83 0.10 0.16 0.04
X5 5 1000 0.04 1.43 0.10 0.05 1.39 -4.43 5.21 9.63 -0.02 0.07 0.05
X6 6 1000 -0.02 1.22 -0.01 -0.02 1.27 -3.35 4.68 8.03 0.04 -0.10 0.04
X7 7 1000 0.02 1.14 0.02 0.01 1.10 -3.08 3.66 6.74 0.11 0.00 0.04
X8 8 1000 0.01 1.29 0.02 0.01 1.24 -3.68 4.50 8.18 -0.02 0.10 0.04
# round(cor(y), 2)
# see the factor structure
::cor.plot(cor(y)) psych
# example exploratory fa
#psych::fa(y, nfactors=2, rotate="oblimin")
Functions
We will have two separate estimation functions, one for the covariance matrix, and another for the correlation matrix.
# measurement model, covariance approach
# trace function, strangely absent from base R
<- function(mat) {
tr sum(diag(mat), na.rm = TRUE)
}
<- function (parms, data) {
cfa_cov # Arguments-
# parms: initial values (named)
# data: raw data
# Extract parameters by name
= c(1, parms[grep('l1', names(parms))]) # loadings for factor 1
l1 = c(1, parms[grep('l2', names(parms))]) # loadings for factor 2
l2
= parms[grep('cov', names(parms))] # factor covariance, variances
cov0
# Covariance matrix
= cov(data)*((nrow(data)-1)/nrow(data)) # ML covariance div by N rather than N-1, the multiplier adjusts
S
# loading estimates
= cbind(
lambda c(l1, rep(0,length(l2))),
c(rep(0,length(l1)), l2)
)
# disturbances
= parms[grep('dist', names(parms))]
dist_init = diag(dist_init)
disturbs
# factor correlation
= matrix(c(cov0[1], cov0[2], cov0[2], cov0[3]), 2, 2) #factor cov/correlation matrix
phi_init
# other calculations and log likelihood
= lambda%*%phi_init%*%t(lambda) + disturbs
sigtheta
# in Bollen p + q (but for the purposes of this just p) = tr(data)
= dim(data)[2]
pq
# a reduced version; Bollen 1989 p.107
# ll = -(log(det(sigtheta)) + tr(S%*%solve(sigtheta)) - log(det(S)) - pq)
# this should be the same as Mplus H0 log likelihood
= ( (-nrow(data)*pq/2) * log(2*pi) ) -
ll nrow(data)/2) * ( log(det(sigtheta)) + tr(S%*%solve(sigtheta)) )
(
-ll
}
We can use the correlation matrix for standardized results. Lines correspond to those in cfa_cov
.
<- function (parms, data) {
cfa_cor
= parms[grep('l1', names(parms))] # loadings for factor 1
l1 = parms[grep('l2', names(parms))] # loadings for factor 2
l2 = parms[grep('cor', names(parms))] # factor correlation
cor0
= cor(data)
S
= cbind(
lambda c(l1, rep(0,length(l2))),
c(rep(0,length(l1)), l2)
)
= parms[grep('dist', names(parms))]
dist_init = diag(dist_init)
disturbs
= matrix(c(1, cor0, cor0, 1), ncol=2)
phi_init
= lambda%*%phi_init%*%t(lambda) + disturbs
sigtheta = dim(data)[2]
pq
#ll = ( log(det(sigtheta)) + tr(S%*%solve(sigtheta)) - log(det(S)) - pq )
= ( (-nrow(data)*pq/2) * log(2*pi) ) -
ll nrow(data)/2) * ( log(det(sigtheta)) + tr(S%*%solve(sigtheta)) )
(
-ll
}
Estimation
Corresponding to the functions, we will get results based on the covariance and correlation matrix respectively.
Raw
Set initial values.
= c(rep(1, 6), rep(.05, 8), rep(.5, 3))
par_init_cov names(par_init_cov) = rep(c('l1','l2', 'dist', 'cov'), c(3, 3, 8, 3))
Estimate and extract.
= optim(
fit_cov par = par_init_cov,
fn = cfa_cov,
data = y,
method = "L-BFGS-B",
lower = 0
)
= data.frame(
loadings_cov f1 = c(1, fit_cov$par[1:3], rep(0, 4)),
f2 = c(rep(0, 4), 1, fit_cov$par[4:6])
)
= fit_cov$par[7:14] disturbances_cov
Standardized
= c(rep(1, 8), rep(.05, 8), 0) #for cor
par_init_cor names(par_init_cor) = rep(c('l1', 'l2', 'dist', 'cor'), c(4, 4, 8, 1))
= optim(
fit_cor par = par_init_cor,
fn = cfa_cor,
data = y,
method = "L-BFGS-B",
lower = 0,
upper = 1
)
= matrix(
loadings_cor c(fit_cor$par[1:4], rep(0, 4), rep(0, 4), fit_cor$par[5:8]),
ncol = 2
)
= fit_cor$par[9:16] disturbances_cor
Comparison
Gather results for summarizing.
= list(
results raw = list(
loadings = round(data.frame(loadings_cov, Variances = disturbances_cov), 3),
cov.fact = round(matrix(c(fit_cov$par[c(15, 16, 16, 17)]), ncol = 2) , 3)
),
standardized = list(
loadings = round(
data.frame(
loadings_cor,Variances = disturbances_cor,
Rsq = (1 - disturbances_cor)
3),
), cor.fact = round(matrix(c(1, fit_cor$par[c(17, 17)], 1), ncol = 2), 3)
),
# note inclusion of intercepts for total number of par
fit_lav = data.frame(
ll = fit_cov$value,
AIC = 2*fit_cov$value + 2 * (length(par_init_cov) + ncol(y)),
BIC = 2*fit_cov$value + log(nrow(y)) * (length(par_init_cov) + ncol(y))
)
)
results
$raw
$raw$loadings
f1 f2 Variances
1 1.000 0.000 1.073
2 0.459 0.000 0.955
3 0.836 0.000 0.908
4 0.570 0.000 0.961
5 0.000 1.000 1.047
6 0.000 0.739 0.941
7 0.000 0.575 0.972
8 0.000 0.803 1.034
$raw$cov.fact
[,1] [,2]
[1,] 1.006 0.185
[2,] 0.185 0.989
$standardized
$standardized$loadings
X1 X2 Variances Rsq
1 0.696 0.000 0.516 0.484
2 0.426 0.000 0.819 0.181
3 0.661 0.000 0.563 0.437
4 0.504 0.000 0.746 0.254
5 0.000 0.697 0.514 0.486
6 0.000 0.604 0.636 0.364
7 0.000 0.502 0.748 0.252
8 0.000 0.618 0.618 0.382
$standardized$cor.fact
[,1] [,2]
[1,] 1.000 0.186
[2,] 0.186 1.000
$fit_lav
ll AIC BIC
1 12497.68 25045.37 25168.06
Compare with lavaan.
library(lavaan)
= data.frame(y)
y
= '
model F1 =~ X1 + X2 + X3 + X4
F2 =~ X5 + X6 + X7 + X8
'
= cfa(
fit_lav
model,data = y,
mimic = 'Mplus',
estimator = 'ML'
)
= cfa(
fit_lav_std
model,data = y,
mimic = 'Mplus',
estimator = 'ML',
std.lv = TRUE,
std.ov = TRUE
)
# note that lavaan does not count the intercepts among the free params for
# AIC/BIC by default, (can get its result via -2 * as.numeric(lls) + k *
# attr(lls, "df")), but the mimic='Mplus' should have them correspond to optim's
# results
summary(fit_lav,
fit.measures = TRUE,
standardized = TRUE)
lavaan 0.6-7 ended normally after 30 iterations
Estimator ML
Optimization method NLMINB
Number of free parameters 25
Number of observations 1000
Number of missing patterns 1
Model Test User Model:
Test statistic 25.586
Degrees of freedom 19
P-value (Chi-square) 0.142
Model Test Baseline Model:
Test statistic 1229.322
Degrees of freedom 28
P-value 0.000
User Model versus Baseline Model:
Comparative Fit Index (CFI) 0.995
Tucker-Lewis Index (TLI) 0.992
Loglikelihood and Information Criteria:
Loglikelihood user model (H0) -12497.683
Loglikelihood unrestricted model (H1) -12484.890
Akaike (AIC) 25045.366
Bayesian (BIC) 25168.060
Sample-size adjusted Bayesian (BIC) 25088.658
Root Mean Square Error of Approximation:
RMSEA 0.019
90 Percent confidence interval - lower 0.000
90 Percent confidence interval - upper 0.035
P-value RMSEA <= 0.05 1.000
Standardized Root Mean Square Residual:
SRMR 0.018
Parameter Estimates:
Standard errors Standard
Information Observed
Observed information based on Hessian
Latent Variables:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
F1 =~
X1 1.000 1.003 0.696
X2 0.459 0.046 10.043 0.000 0.460 0.426
X3 0.836 0.066 12.590 0.000 0.839 0.661
X4 0.570 0.050 11.450 0.000 0.572 0.504
F2 =~
X5 1.000 0.994 0.697
X6 0.739 0.056 13.239 0.000 0.735 0.604
X7 0.575 0.048 12.071 0.000 0.572 0.502
X8 0.803 0.060 13.386 0.000 0.799 0.618
Covariances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
F1 ~~
F2 0.185 0.046 4.014 0.000 0.186 0.186
Intercepts:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
.X1 0.054 0.046 1.173 0.241 0.054 0.037
.X2 -0.004 0.034 -0.104 0.917 -0.004 -0.003
.X3 0.007 0.040 0.182 0.855 0.007 0.006
.X4 0.004 0.036 0.113 0.910 0.004 0.004
.X5 0.044 0.045 0.964 0.335 0.044 0.031
.X6 -0.019 0.038 -0.504 0.614 -0.019 -0.016
.X7 0.024 0.036 0.674 0.500 0.024 0.021
.X8 0.010 0.041 0.247 0.805 0.010 0.008
F1 0.000 0.000 0.000
F2 0.000 0.000 0.000
Variances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
.X1 1.073 0.087 12.356 0.000 1.073 0.516
.X2 0.955 0.047 20.128 0.000 0.955 0.819
.X3 0.908 0.065 13.887 0.000 0.908 0.563
.X4 0.961 0.051 18.885 0.000 0.961 0.746
.X5 1.048 0.079 13.336 0.000 1.048 0.514
.X6 0.941 0.056 16.867 0.000 0.941 0.636
.X7 0.972 0.050 19.255 0.000 0.972 0.748
.X8 1.034 0.063 16.411 0.000 1.034 0.618
F1 1.006 0.108 9.351 0.000 1.000 1.000
F2 0.989 0.100 9.853 0.000 1.000 1.000
Mplus
If you have access to Mplus you can use Mplus Automation to prepare the data. The following code is in Mplus syntax and will produce the above model.
library(MplusAutomation)
prepareMplusData(data.frame(y), "factsim.dat")
MODEL:
F1 BY X1-X4;
F2 BY X5-X8;
results:
STDYX;
Source
Original code available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/cfa.R