Expectation-Maximization
Mixture Model
The following code is based on algorithms noted in Murphy, 2012, Machine Learning: A Probabilistic Perspective, specifically Chapter 11, Section 4.
Data Setup
This example uses Old Faithful geyser eruptions. It fits a univariate mixture to either eruption time or waiting time; the next example will handle both variables at once, i.e. a multivariate normal mixture. The geyser version of the data is supposedly more accurate, though it seems to have arbitrarily assigned some duration values. See this source also, but that only has intervals. Some July 1995 data is available.
library(tidyverse)
# faithful data set is in base R
data(faithful)
head(faithful)
eruptions waiting
1 3.600 79
2 1.800 54
3 3.333 74
4 2.283 62
5 4.533 85
6 2.883 55
eruptions  = as.matrix(faithful[, 1, drop = FALSE])
wait_times = as.matrix(faithful[, 2, drop = FALSE])
Function
The fitting function.
em_mixture <- function(
  params,
  X,
  clusters = 2,
  tol      = .00001,
  maxits   = 100,
  showits  = TRUE
) {
  # Arguments are starting parameters (means, variances, cluster probability),
  # data, number of clusters desired, tolerance, maximum iterations, and whether
  # to show iterations

  # Starting points
  N     = nrow(X)
  nams  = names(params)
  mu    = params$mu
  var   = params$var
  probs = params$probs

  # Other initializations
  # initialize cluster 'responsibilities', i.e. probability of cluster
  # membership for each observation i
  ri = matrix(0, ncol = clusters, nrow = N)
  it = 0
  converged = FALSE

  if (showits)                                  # Show iterations
    cat(paste("Iterations of EM:", "\n"))

  while ((!converged) & (it < maxits)) {
    probsOld = probs
    muOld    = mu
    varOld   = var
    riOld    = ri

    # E
    # Compute responsibilities
    for (k in 1:clusters) {
      ri[, k] = probs[k] * dnorm(X, mu[k], sd = sqrt(var[k]), log = FALSE)
    }

    ri = ri/rowSums(ri)

    # M
    rk    = colSums(ri)          # rk is the weighted average cluster membership size
    probs = rk/N
    mu    = (t(X) %*% ri) / rk
    var   = (t(X^2) %*% ri) / rk - mu^2

    # could do mu and var via log likelihood here, but this is more straightforward

    parmlistold     = rbind(probsOld, muOld, varOld)
    parmlistcurrent = rbind(probs, mu, var)

    it = it + 1

    # if showits true, & it = 1 or divisible by 5, print message
    if (showits & it == 1 | it %% 5 == 0)
      cat(paste(format(it), "...", "\n", sep = ""))

    converged = max(abs(parmlistold - parmlistcurrent)) <= tol
  }

  clust = which(round(ri) == 1, arr.ind = TRUE) # create cluster membership
  clust = clust[order(clust[, 1]), 2]           # order according to row rather than cluster

  out = list(
    probs   = probs,
    mu      = mu,
    var     = var,
    resp    = ri,
    cluster = clust
  )

  out
}
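To recap what each pass of the function does: the E step computes each observation's responsibility for each cluster,

\[
r_{ik} = \frac{\pi_k \, \mathcal{N}(x_i \mid \mu_k, \sigma_k^2)}{\sum_{j=1}^K \pi_j \, \mathcal{N}(x_i \mid \mu_j, \sigma_j^2)}
\]

and the M step updates the parameters with responsibility-weighted moments, using \(r_k = \sum_{i=1}^N r_{ik}\):

\[
\pi_k = \frac{r_k}{N}, \qquad
\mu_k = \frac{1}{r_k} \sum_{i=1}^N r_{ik}\, x_i, \qquad
\sigma_k^2 = \frac{1}{r_k} \sum_{i=1}^N r_{ik}\, x_i^2 - \mu_k^2
\]

The last expression is the \(E[x^2] - E[x]^2\) form of the variance seen in the code.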
Estimation
Starting parameters require a mean, variance, and class probability for each cluster. Note that the starting values for the means must be within the data range or the function will break.
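As an aside, one way to avoid hand-picking starting means entirely is to base them on quantiles, which fall inside the data range by construction. A minimal sketch (start_values_q is illustrative, not part of the original code):

# data-driven starting values; quantiles always fall within the observed range
start_values_q = list(
  mu    = unname(quantile(faithful$eruptions, probs = c(.25, .75))),
  var   = rep(var(faithful$eruptions), 2),
  probs = c(.5, .5)
)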
start_values_1 = list(mu = c(2, 5),
                      var = c(1, 1),
                      probs = c(.5, .5))

start_values_2 = list(mu = c(50, 90),
                      var = c(1, 15),
                      probs = c(.5, .5))

mix_erupt = em_mixture(start_values_1, X = eruptions, tol = 1e-8)
Iterations of EM:
1...
5...
10...
15...
20...
25...
30...
mix_waiting = em_mixture(start_values_2, X = wait_times, tol = 1e-8)
Iterations of EM:
1...
5...
10...
15...
20...
25...
30...
35...
40...
45...
50...
55...
Comparison
Compare to flexmix package results.
library(flexmix)
flex_erupt = flexmix(eruptions ~ 1,
                     k = 2,
                     control = list(tolerance = 1e-8, iter.max = 100))

flex_wait = flexmix(wait_times ~ 1,
                    k = 2,
                    control = list(tolerance = 1e-8, iter.max = 100))
The following provides the means, standard deviations, and probabilities of group membership. Note that the cluster labels are arbitrary, so cluster 1 for one model may be cluster 2 in another.
Eruptions
First we’ll compare results on the eruptions variable.
# note: sqrt(var) gives standard deviations, matching flexmix's sigma
mean_var = rbind(mix_erupt$mu, sqrt(mix_erupt$var))
rownames(mean_var) = c('means', 'sds')
colnames(mean_var) = c('cluster 1', 'cluster 2')

mean_var_flex = parameters(flex_erupt)
rownames(mean_var_flex) = c('means', 'sds')
colnames(mean_var_flex) = c('cluster 1 flex', 'cluster 2 flex')

prob_membership      = mix_erupt$probs
prob_membership_flex = flex_erupt@size / sum(flex_erupt@size)

list(
  params       = cbind(mean_var, mean_var_flex),
  clusterprobs = cbind(prob_membership, prob_membership_flex)
)
$params
      cluster 1 cluster 2 cluster 1 flex cluster 2 flex
means 2.0186078 4.2733434      2.0186378      4.2733671
sds   0.2356218 0.4370631      0.2361087      0.4378364

$clusterprobs
  prob_membership prob_membership_flex
1       0.3484046            0.3492647
2       0.6515954            0.6507353
Waiting
Now we compare the result for waiting times.
mean_var = rbind(mix_waiting$mu, sqrt(mix_waiting$var))
rownames(mean_var) = c('means', 'sds')
colnames(mean_var) = c('cluster 1', 'cluster 2')

mean_var_flex = parameters(flex_wait)
rownames(mean_var_flex) = c('means', 'sds')
colnames(mean_var_flex) = c('cluster 1 flex', 'cluster 2 flex')

prob_membership      = mix_waiting$probs
prob_membership_flex = flex_wait@size / sum(flex_wait@size)

list(
  params       = cbind(mean_var, mean_var_flex),
  clusterprobs = cbind(prob_membership, prob_membership_flex)
)
$params
      cluster 1 cluster 2 cluster 1 flex cluster 2 flex
means 54.614856 80.091069      54.616631      80.090988
sds    5.871219  5.867734       5.884843       5.879324

$clusterprobs
  prob_membership prob_membership_flex
1       0.3608861            0.3639706
2       0.6391139            0.6360294
Visualization
Points are colored by class membership, followed by the probability of being in cluster 1.
Supplemental Example
This uses the MASS version of the data (geyser), which has the columns reversed. On initial inspection the two don't look even remotely like the same data: geyser is even more heavily rounded, and the apparent relationship is the opposite. It turns out geyser is offset by 1, such that duration 1 should be coupled with waiting 2, and so on. Even then, the rounding of durations at 2 and 4 (and whatever division was done on duration) makes this fairly poor data. I've cleaned this up a little bit in case someone wants to play with it for additional practice, but it's not evaluated.
data(geyser, package = 'MASS')

geyser = data.frame(duration = geyser$duration[-299], waiting = geyser$waiting[-1])

# compare to faithful
library(patchwork)

qplot(data = faithful, x = eruptions, y = waiting, alpha = I(.25)) /
qplot(data = geyser,   x = duration,  y = waiting, alpha = I(.25))

X3 = matrix(geyser[, 1])
X4 = matrix(geyser[, 2])

# MASS version
test3 = em_mixture(start_values_1, X = X3, tol = 1e-8)
test4 = em_mixture(start_values_2, X = X4, tol = 1e-8)

flexmod3 = flexmix(X3 ~ 1,
                   k = 2,
                   control = list(tolerance = 1e-8, iter.max = 100))

flexmod4 = flexmix(X4 ~ 1,
                   k = 2,
                   control = list(tolerance = 1e-8, iter.max = 100))

# note variability differences compared to the faithful dataset

# Eruptions/Duration
mean_var = rbind(test3$mu, sqrt(test3$var))
rownames(mean_var) = c('means', 'sds')

mean_var_flex = parameters(flexmod3)
rownames(mean_var_flex) = c('means', 'sds')

prob_membership      = test3$probs
prob_membership_flex = flexmod3@size / sum(flexmod3@size)

list(
  params       = cbind(mean_var, mean_var_flex),
  clusterprobs = cbind(prob_membership, prob_membership_flex)
)

# Waiting
mean_var = rbind(test4$mu, sqrt(test4$var))
rownames(mean_var) = c('means', 'sds')

mean_var_flex = parameters(flexmod4)
rownames(mean_var_flex) = c('means', 'sds')

prob_membership      = test4$probs
prob_membership_flex = flexmod4@size / sum(flexmod4@size)

list(
  params       = cbind(mean_var, mean_var_flex),
  clusterprobs = cbind(prob_membership, prob_membership_flex)
)

# Some plots
library(ggplot2)

qplot(x = eruptions, y = waiting, data = faithful)

ggplot(aes(x = eruptions, y = waiting), data = faithful) +
  geom_point(aes(color = factor(mix_waiting$cluster)))

ggplot(aes(x = eruptions, y = waiting), data = faithful) +
  geom_point(aes(color = mix_waiting$resp[, 1]))
Source
Original code available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/EM%20Examples/EM%20Mixture.R
Multivariate Mixture Model
The following code is based on algorithms noted in Murphy, 2012, Machine Learning: A Probabilistic Perspective, specifically Chapter 11, Section 4.
Function
This estimating function will be used for both examples.
em_mixture <- function(
  params,
  X,
  clusters = 2,
  tol      = .00001,
  maxits   = 100,
  showits  = TRUE
) {
  # Arguments are
  # params: starting parameters (means, covariances, cluster probability)
  # X: data
  # clusters: number of clusters desired
  # tol: tolerance
  # maxits: maximum iterations
  # showits: whether to show iterations

  require(mvtnorm)

  # Starting points
  N     = nrow(X)
  mu    = params$mu
  var   = params$var
  probs = params$probs

  # initializations
  # cluster 'responsibilities', i.e. probability of cluster membership for each
  # observation i
  ri = matrix(0, ncol = clusters, nrow = N)
  ll = 0              # log likelihood
  it = 0              # iteration count
  converged = FALSE   # convergence

  # Show iterations if showits == true
  if (showits)
    cat(paste("Iterations of EM:", "\n"))

  while (!converged & it < maxits) {
    probsOld = probs
    # muOld  = mu     # use direct values or loglike for convergence check
    # varOld = var
    llOld = ll
    riOld = ri

    ### E
    # Compute responsibilities
    for (k in 1:clusters) {
      ri[, k] = probs[k] * dmvnorm(X, mu[k, ], sigma = var[[k]], log = FALSE)
    }

    ri = ri/rowSums(ri)

    ### M
    rk    = colSums(ri)         # rk is weighted average cluster membership size
    probs = rk/N

    for (k in 1:clusters) {
      varmat = matrix(0, ncol = ncol(X), nrow = ncol(X))  # initialize to sum matrices

      for (i in 1:N) {
        varmat = varmat + ri[i, k] * X[i, ] %*% t(X[i, ])
      }

      mu[k, ]  = (t(X) %*% ri[, k]) / rk[k]
      var[[k]] = varmat/rk[k] - mu[k, ] %*% t(mu[k, ])

      ll[k] = -.5 * sum(ri[, k] * dmvnorm(X, mu[k, ], sigma = var[[k]], log = TRUE))
    }

    ll = sum(ll)

    # compare old to current for convergence
    parmlistold     = c(llOld, probsOld)   # c(muOld, unlist(varOld), probsOld)
    parmlistcurrent = c(ll, probs)         # c(mu, unlist(var), probs)

    it = it + 1

    # if showits true, & it = 1 or divisible by 5, print message
    if (showits & it == 1 | it %% 5 == 0)
      cat(paste(format(it), "...", "\n", sep = ""))

    converged = min(abs(parmlistold - parmlistcurrent)) <= tol
  }

  clust = which(round(ri) == 1, arr.ind = TRUE)  # create cluster membership
  clust = clust[order(clust[, 1]), 2]            # order according to row rather than cluster

  list(
    probs   = probs,
    mu      = mu,
    var     = var,
    resp    = ri,
    cluster = clust,
    ll      = ll
  )
}
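The M step here is the multivariate analogue of the univariate updates, with the covariance computed in scatter-matrix form:

\[
\mu_k = \frac{1}{r_k} \sum_{i=1}^N r_{ik}\, x_i, \qquad
\Sigma_k = \frac{1}{r_k} \sum_{i=1}^N r_{ik}\, x_i x_i^\top - \mu_k \mu_k^\top
\]

which is exactly what the varmat accumulation loop computes.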
Example 1: Old Faithful
This example uses the Old Faithful geyser eruptions as before, and can be compared to the univariate code from the previous chapter. See also http://www.geyserstudy.org/geyser.aspx?pGeyserNo=OLDFAITHFUL
Data Setup
library(tidyverse)
data("faithful")
Estimation
Create starting values and estimate.
mustart  = rbind(c(3, 60), c(3, 60.1))   # must be at least slightly different
covstart = list(cov(faithful), cov(faithful))
probs    = c(.01, .99)

# params is a list of mu, var, and probs
starts = list(mu = mustart, var = covstart, probs = probs)
mix_faithful = em_mixture(
  params   = starts,
  X        = as.matrix(faithful),
  clusters = 2,
  tol      = 1e-12,
  maxits   = 1500,
  showits  = TRUE
)
Iterations of EM:
1...
5...
10...
15...
20...
25...
30...
35...
40...
45...
50...
55...
60...
65...
70...
75...
80...
str(mix_faithful)
List of 6
$ probs : num [1:2] 0.356 0.644
$ mu : num [1:2, 1:2] 2.04 4.29 54.48 79.97
$ var :List of 2
..$ : num [1:2, 1:2] 0.0692 0.4352 0.4352 33.6973
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : NULL
.. .. ..$ : chr [1:2] "eruptions" "waiting"
..$ : num [1:2, 1:2] 0.17 0.941 0.941 36.046
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : NULL
.. .. ..$ : chr [1:2] "eruptions" "waiting"
$ resp : num [1:272, 1:2] 2.59e-09 1.00 8.42e-06 1.00 1.00e-21 ...
$ cluster: int [1:272] 2 1 2 1 2 1 2 2 1 2 ...
$ ll : num 477
Visualize.
Comparison
Compare to mclust results. Options are set to be more similar to the settings demonstrated.
library(mclust)
mix_mclust = mclust::Mclust(
  faithful[, 1:2],
  2,
  modelNames = 'VVV',
  control    = emControl(tol = 1e-12)
)
detach(package:mclust)
# str(mix_mclust, 1)
Compare means.
t(mix_faithful$mu)
[,1] [,2]
[1,] 2.036388 4.289662
[2,] 54.478516 79.968115
mix_mclust$parameters$mean
[,1] [,2]
eruptions 4.289662 2.036388
waiting 79.968115 54.478517
Compare variances.
mix_faithful$var
[[1]]
eruptions waiting
[1,] 0.06916767 0.4351676
[2,] 0.43516762 33.6972821
[[2]]
eruptions waiting
[1,] 0.1699684 0.9406093
[2,] 0.9406093 36.0462113
mix_mclust$parameters$variance$sigma
, , 1
eruptions waiting
eruptions 0.1699684 0.9406089
waiting 0.9406089 36.0462071
, , 2
eruptions waiting
eruptions 0.06916769 0.4351678
waiting 0.43516784 33.6972835
Compare classifications, reversing the labels in case the arbitrary labeling of one solution is the opposite of the other.
table(mix_faithful$cluster, mix_mclust$classification)
1 2
1 0 97
2 175 0
table(ifelse(mix_faithful$cluster == 2, 1, 2),
      mix_mclust$classification)
1 2
1 175 0
2 0 97
# compare responsibilities; reverse one if arbitrary numbering of one of them is opposite
# cbind(round(mix_faithful$resp[,1], 2), round(mix_mclust$z[,2], 2)) # cluster '1'
# cbind(round(mix_faithful$resp[,2], 2), round(mix_mclust$z[,1], 2)) # cluster '2'
Example 2: Iris
Data Setup
Set up the data.
iris2 = iris %>% select(-Species)
Estimation
Run and examine. We add noise to our starting values; the function is notably sensitive to starts, but we don't want to cheat too badly. (A random-restart alternative is sketched after the results below.)
mustart = iris %>%
  group_by(Species) %>%
  summarise(across(.fns = function(x) mean(x) + runif(1, 0, .5))) %>%
  select(-Species) %>%
  as.matrix()

# use purrr::map due to mclust::map masking
covstart = iris %>%
  split(.$Species) %>%
  purrr::map(select, -Species) %>%
  purrr::map(function(x) cov(x) + diag(runif(4, 0, .5)))

probs = c(.1, .2, .7)

starts = list(mu = mustart, var = covstart, probs = probs)
mix_mclust_iris = em_mixture(
  params   = starts,
  X        = as.matrix(iris2),
  clusters = 3,
  tol      = 1e-8,
  maxits   = 1500,
  showits  = T
)
Iterations of EM:
1...
5...
table(mix_mclust_iris$cluster, iris$Species)
setosa versicolor virginica
1 50 0 0
2 0 50 7
3 0 0 43
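Given the noted sensitivity to starting values, a common remedy is to run EM from several random starts and keep the best solution. A minimal sketch of that idea (the restart loop below is illustrative, not part of the original code, and ignores the possibility of degenerate clusters):

set.seed(1234)

# random observations as means, pooled covariance and equal weights for each start
fits = purrr::map(1:5, function(i) {
  starts_i = list(
    mu    = as.matrix(iris2[sample(nrow(iris2), 3), ]),
    var   = replicate(3, cov(iris2), simplify = FALSE),
    probs = rep(1/3, 3)
  )

  em_mixture(starts_i, X = as.matrix(iris2), clusters = 3, showits = FALSE)
})

# the ll returned here is a scaled negative weighted log likelihood, so smaller is better
best = fits[[which.min(purrr::map_dbl(fits, 'll'))]]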
Comparison
Compare to mclust results.
library(mclust)
mclust_iris = mclust::Mclust(iris[, 1:4], 3)

table(mclust_iris$classification, iris$Species)
setosa versicolor virginica
1 50 0 0
2 0 45 0
3 0 5 50
detach(package:mclust)
Source
Original code available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/EM%20Examples/EM%20Mixture%20MV.R
Probit Model
The following regards models for a binary response. See Murphy, 2012, Machine Learning: A Probabilistic Perspective, Chapter 11, Section 4.
Data Setup
library(tidyverse)
admission = haven::read_dta("https://stats.idre.ucla.edu/stat/stata/dae/binary.dta")
Probit via Maximum Likelihood
Function
We'll start with a basic maximum likelihood function for a standard probit. See the logistic regression and the previous chapter on probit models for comparison.
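For reference, with linear predictor \(\mu_i = x_i^\top \beta\) and standard normal CDF \(\Phi\), the log likelihood being maximized is

\[
\ell(\beta) = \sum_{i=1}^N \Big[\, y_i \log \Phi(\mu_i) + (1 - y_i) \log \Phi(-\mu_i) \,\Big]
\]

which is what the function below computes (via the equivalent dbinom form) before negating it for optim.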
probit_mle <- function(params, X, y) {
  # Arguments are starting parameters (coefficients), model matrix, response

  b  = params
  mu = X %*% b   # linear predictor

  # compute the log likelihood either way
  # ll = sum(y * pnorm(mu, log.p = TRUE) + (1 - y) * pnorm(-mu, log.p = TRUE))
  ll = sum(dbinom(y, 1, prob = pnorm(mu), log = TRUE))

  -ll
}
Estimation
Estimate with optim.
# input data
X = as.matrix(cbind(1, admission[, 2:4]))
y = as.matrix(admission[, 1])

init = c(0, 0, 0, 0)

# Can set tolerance really low to duplicate glm result
fit_mle = optim(
  par = init,
  fn  = probit_mle,
  X   = X,
  y   = y,
  control = list(maxit = 1000, reltol = 1e-12)
)

# extract coefficients
coefs_mle = fit_mle$par
Comparison
Compare to glm.
fit_glm = glm(
  admit ~ gre + gpa + rank,
  family  = binomial(link = "probit"),
  control = list(maxit = 500, epsilon = 1e-8),
  data    = admission
)

summary(fit_glm)
Call:
glm(formula = admit ~ gre + gpa + rank, family = binomial(link = "probit"),
data = admission, control = list(maxit = 500, epsilon = 1e-08))
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5626 -0.8920 -0.6403 1.1631 2.2097
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.0915039 0.6718360 -3.113 0.00185 **
gre 0.0013982 0.0006487 2.156 0.03112 *
gpa 0.4643599 0.1950263 2.381 0.01727 *
rank -0.3317117 0.0745524 -4.449 8.61e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 499.98 on 399 degrees of freedom
Residual deviance: 459.48 on 396 degrees of freedom
AIC: 467.48
Number of Fisher Scoring iterations: 4
coefs_glm = coef(fit_glm)
Compare.
|           | (Intercept) | gre   | gpa   | rank   |
|-----------|-------------|-------|-------|--------|
| coefs_mle | -2.092      | 0.001 | 0.464 | -0.332 |
| coefs_glm | -2.092      | 0.001 | 0.464 | -0.332 |
EM for Latent Variable Approach to Probit
Now for the EM approach, which assumes a continuous latent variable underlying the binary target.
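The E step replaces the latent variable with its conditional expectation, which is the mean of a normal distribution truncated at zero. With \(\mu_i = x_i^\top \beta\) and \(z_i \sim N(\mu_i, 1)\),

\[
E[z_i \mid y_i = 1] = \mu_i + \frac{\phi(\mu_i)}{\Phi(\mu_i)}, \qquad
E[z_i \mid y_i = 0] = \mu_i - \frac{\phi(\mu_i)}{\Phi(-\mu_i)}
\]

The M step is then just a least squares regression of these expectations on \(X\), as seen in the function below.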
Function
em_probit <- function(
  params,
  X,
  y,
  tol     = .00001,
  maxits  = 100,
  showits = TRUE
) {
  # Arguments
  # params: starting parameters (coefficients)
  # X: model matrix
  # y: response
  # tol: tolerance
  # maxits: maximum iterations
  # showits: whether to show iterations

  # starting points
  b  = params
  mu = X %*% b
  it = 0
  converged = FALSE
  z  = rnorm(length(y))   # z is the latent variable ~ N(0, 1)

  # Show iterations
  if (showits)
    cat(paste("Iterations of EM:", "\n"))

  # while no convergence and we haven't reached our max iterations do this stuff
  while ((!converged) & (it < maxits)) {
    z_old = z   # create 'old' values for comparison

    # E step: create a new z based on current values
    z = ifelse(
      y == 1,
      mu + dnorm(mu) / pnorm(mu),
      mu - dnorm(mu) / pnorm(-mu)
    )

    # M step: estimate b
    b  = solve(t(X) %*% X) %*% t(X) %*% z
    mu = X %*% b

    ll = sum(y * pnorm(mu, log.p = TRUE) + (1 - y) * pnorm(-mu, log.p = TRUE))

    it = it + 1

    if (showits & (it == 1 | it %% 5 == 0))
      cat(paste(format(it), "...", "\n", sep = ""))

    converged = max(abs(z_old - z)) <= tol
  }

  # Show last iteration
  if (showits)
    cat(paste0(format(it), "...", "\n"))

  list(b = t(b), ll = ll)
}
Estimation
Use the same setup and starting values to estimate the parameters.
# can lower tolerance to duplicate glm result
fit_em = em_probit(
  params = init,
  X      = X,
  y      = y,
  tol    = 1e-12,
  maxit  = 100
)
Iterations of EM:
1...
5...
10...
15...
20...
25...
30...
35...
40...
45...
50...
51...
# fit_em

coefs_em = fit_em$b
Comparison
Compare all results.
| fit | (Intercept) | gre   | gpa   | rank   | logLik  |
|-----|-------------|-------|-------|--------|---------|
| glm | -2.092      | 0.001 | 0.464 | -0.332 | -229.74 |
| mle | -2.092      | 0.001 | 0.464 | -0.332 | 229.74  |
| em  | -2.092      | 0.001 | 0.464 | -0.332 | -229.74 |
Visualize
Show estimates over niter iterations and visualize.
X2 = X
X2[, 2:3] = scale(X2[, 2:3])

niter = 20

fit_em = map_df(1:niter, function(x)
  as_tibble(
    em_probit(
      params  = init,
      X       = X2,
      y       = y,
      tol     = 1e-8,
      maxit   = x,
      showits = F
    )$b
  )
)

gdat = fit_em %>%
  rowid_to_column('iter') %>%
  pivot_longer(-iter, names_to = 'coef') %>%
  mutate(
    coef = factor(coef, labels = c('Intercept', 'gre', 'gpa', 'rank'))
  ) %>%
  arrange(iter, coef)
Source
Original code available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/EM%20Examples/EM%20algorithm%20for%20probit%20example.R
PCA
The following is an EM algorithm for principal components analysis. See Murphy, 2012, Machine Learning: A Probabilistic Perspective, Section 12.2.5. Some of the constructed object is based on output from the pca function used below.
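The updates themselves are quite compact. Writing the transposed data as \(\tilde{X} = X^\top\) (a \(D \times N\) matrix), the two steps in the loop below are

\[
\text{E:}\quad Z = (W^\top W)^{-1} W^\top \tilde{X}, \qquad
\text{M:}\quad W = \tilde{X} Z^\top (Z Z^\top)^{-1}
\]

i.e. alternating least squares solves for the latent scores and the loadings.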
Data Setup
The state.x77 data set is from base R, which includes various state demographics. We will first standardize the data.

library(tidyverse)

X = scale(state.x77)
Function
The estimating function. Note that it uses orth from pracma, but I show the core of the underlying code if you don’t want to install it.
# orth <- function(M) {
# svdM = svd(M)
# U = svdM$u
# s = svdM$d
# tol = max(dim(M)) * max(s) * .Machine$double.eps
# r = sum(s > tol)
#
# U[, 1:r, drop = FALSE]
# }
em_pca <- function(
  X,
  n_comp  = 2,
  tol     = .00001,
  maxits  = 100,
  showits = TRUE
) {
  # Arguments
  # X: numeric data
  # n_comp: number of components
  # tol: tolerance level
  # maxits: maximum iterations
  # showits: show iterations

  # starting points and other initializations
  N  = nrow(X)
  D  = ncol(X)
  L  = n_comp
  Xt = t(X)
  Z  = t(replicate(L, rnorm(N)))   # latent variables
  W  = replicate(L, rnorm(D))      # loadings

  it = 0
  converged = FALSE

  if (showits)
    cat(paste("Iterations of EM:", "\n"))

  # while no convergence and we haven't reached our max iterations do this stuff
  while ((!converged) & (it < maxits)) {
    Z_old = Z   # create 'old' values for comparison

    Z = solve(t(W) %*% W) %*% crossprod(W, Xt)   # E
    W = Xt %*% t(Z) %*% solve(tcrossprod(Z))     # M

    it = it + 1

    # if showits, show first and every 5th iteration
    if (showits & (it == 1 | it %% 5 == 0))
      cat(paste(format(it), "...", "\n", sep = ""))

    converged = max(abs(Z_old - Z)) <= tol
  }

  # calculate reconstruction error
  Xrecon_em = W %*% Z
  reconerr  = sum((Xrecon_em - t(X))^2)

  # orthogonalize
  W     = pracma::orth(W)   # for orthonormal basis of W; pcaMethods package has this also
  evs   = eigen(cov(X %*% W))
  evals = evs$values
  evecs = evs$vectors

  W = W %*% evecs
  Z = X %*% W

  if (showits)   # Show last iteration
    cat(paste0(format(it), "...", "\n"))

  list(
    scores    = Z,
    loadings  = W,
    reconerr  = reconerr,
    Xrecon_em = t(Xrecon_em)
  )
}
Estimation
fit_em = em_pca(
  X      = X,
  n_comp = 2,
  tol    = 1e-12,
  maxit  = 1000
)
Iterations of EM:
1...
5...
10...
15...
20...
25...
30...
35...
40...
45...
50...
55...
60...
65...
70...
74...
str(fit_em) # examine results
List of 4
$ scores : num [1:50, 1:2] 3.79 -1.053 0.867 2.382 0.241 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
.. ..$ : NULL
$ loadings : num [1:8, 1:2] 0.126 -0.299 0.468 -0.412 0.444 ...
$ reconerr : num 136
$ Xrecon_em: num [1:50, 1:8] 0.383 2.109 0.416 -0.228 1.472 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
.. ..$ : chr [1:8] "Population" "Income" "Illiteracy" "Life Exp" ...
Comparison
Extract reconstructed values and loadings for comparison.
Xrecon_em   = fit_em$Xrecon_em
loadings_em = fit_em$loadings
scores_em   = fit_em$scores
Compare results to output from pcaMethods, which also has probabilistic PCA (demonstrated next). Note that the signs of the loadings/scores may differ, but the results should otherwise be comparable.
library(pcaMethods) # install via BiocManager::install("pcaMethods")
fit_pcam = pca(
  X,
  nPcs   = 2,
  method = 'svd',
  scale  = 'none',
  center = FALSE
)

loadings_pcam = loadings(fit_pcam)
scores_pcam   = scores(fit_pcam)
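As an aside, since component signs are arbitrary, you can flip columns to a common orientation before comparing solutions. The following helper is illustrative, not part of the original code or pcaMethods; it flips any column whose largest-magnitude entry is negative.

# align arbitrary component signs: make the largest-magnitude entry in each
# column positive (illustrative helper)
align_signs <- function(mat) {
  flip = apply(mat, 2, function(w) sign(w[which.max(abs(w))]))
  sweep(mat, 2, flip, `*`)
}

# e.g. compare align_signs(loadings_em) with align_signs(loadings_pcam)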
Compare loadings and scores.
sum((abs(loadings_pcam) - abs(loadings_em))^2)
[1] 2.166071e-24
cbind(scores_pcam, data.frame(EM = scores_em)) %>%
head()
PC1 PC2 EM.1 EM.2
Alabama 3.7898873 -0.2347790 3.7898873 -0.2347790
Alaska -1.0531355 5.4561751 -1.0531355 5.4561751
Arizona 0.8674288 0.7450615 0.8674288 0.7450615
Arkansas 2.3817776 -1.2883437 2.3817776 -1.2883437
California 0.2413815 3.5095228 0.2413815 3.5095228
Colorado -2.0621814 0.5056639 -2.0621814 0.5056639
Calculate mean squared reconstruction error and compare.
Xrecon_pcam = scores_pcam %*% t(loadings_pcam)
mean((Xrecon_em - X)^2)
[1] 0.3392252
mean((Xrecon_pcam - X)^2)
[1] 0.3392252
mean(abs(Xrecon_pcam - Xrecon_em))
[1] 6.109616e-13
Visualize
# qplot(Xrecon_pcam[,1], X[,1])
# qplot(Xrecon_pcam[,2], X[,2])
qplot(Xrecon_em[,1], Xrecon_pcam[,1])
Source
Original code available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/EM%20Examples/EM%20for%20pca.R
Probabilistic PCA
The following is an EM algorithm for probabilistic principal components analysis. Based on Tipping and Bishop, 1999, and also Murphy, 2012, Machine Learning: A Probabilistic Perspective, with some code snippets inspired by the ppca function used below. See also the standard PCA chapter.
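As a quick reminder of the model: with centered data, each observation is assumed to arise as

\[
x_i = W z_i + \epsilon_i, \qquad z_i \sim N(0, I_L), \qquad \epsilon_i \sim N(0, \sigma^2 I_D)
\]

so that marginally \(x_i \sim N(0, W W^\top + \sigma^2 I_D)\). Standard PCA is recovered in the limit \(\sigma^2 \to 0\).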
Data Setup
The state.x77 data set is from base R, which includes various state demographics. We will first standardize the data.

library(tidyverse)

X = scale(state.x77)
Function
The estimating function. Note that it uses orth from pracma, but I show the core of the underlying code if you don’t want to install it.
orth <- function(M) {
  svdM = svd(M)
  U    = svdM$u
  s    = svdM$d
  tol  = max(dim(M)) * max(s) * .Machine$double.eps
  r    = sum(s > tol)

  U[, 1:r, drop = FALSE]
}
em_ppca <- function(
  X,
  n_comp  = 2,
  tol     = .00001,
  maxits  = 100,
  showits = TRUE
) {
  # Arguments
  # X: numeric data
  # n_comp: number of components
  # tol: tolerance level
  # maxits: maximum iterations
  # showits: show iterations

  # require(pracma)

  tr <- function(x) sum(diag(x), na.rm = TRUE)   # matrix trace

  # starting points and other initializations
  N = nrow(X)
  D = ncol(X)
  L = n_comp

  S = (1/N) * t(X) %*% X

  evals = eigen(S)$values
  evecs = eigen(S)$vectors

  V      = evecs[, 1:L]
  Lambda = diag(evals[1:L])

  # latent variables
  Z = t(replicate(L, rnorm(N)))

  # variance; average variance associated with discarded dimensions
  sigma2 = 1/(D - L) * sum(evals[(L + 1):D])

  # loadings; this and the sigma2 starting point will be near the final estimate
  W = V %*% chol(Lambda - sigma2 * diag(L)) %*% diag(L)

  it = 0
  converged = FALSE
  ll = 0

  # Show iterations
  if (showits)
    cat(paste("Iterations of EM:", "\n"))

  while ((!converged) & (it < maxits)) {
    # create 'old' values for comparison
    if (exists('W_new')) {
      W_old = W_new
    }
    else {
      W_old = W
    }

    ll_old = ll

    Psi = sigma2 * diag(L)
    M   = t(W_old) %*% W_old + Psi

    # E and M
    W_new  = S %*% W_old %*% solve(Psi + solve(M) %*% t(W_old) %*% S %*% W_old)
    sigma2 = 1/D * tr(S - S %*% W_old %*% solve(M) %*% t(W_new))

    Z  = solve(M) %*% t(W_new) %*% t(X)
    ZZ = sigma2 * solve(M) + Z %*% t(Z)

    # log likelihood as in the paper
    # ll = .5*sigma2*D + .5*tr(ZZ) + .5*sigma2 * X%*%t(X) -
    #   1/sigma2 * t(Z)%*%t(W_new)%*%t(X) + .5*sigma2 * tr(t(W_new)%*%W_new%*%ZZ)
    # ll = -sum(ll)

    # more straightforward
    ll = dnorm(X, mean = t(W_new %*% Z), sd = sqrt(sigma2), log = TRUE)
    ll = -sum(ll)

    it = it + 1

    # if showits, show first and every 5th iteration
    if (showits & (it == 1 | it %% 5 == 0))
      cat(paste(format(it), "...", "\n", sep = ""))

    converged = max(abs(ll_old - ll)) <= tol
  }

  W     = pracma::orth(W_new)   # for orthonormal basis of W; pcaMethods package has this also
  evs   = eigen(cov(X %*% W))
  evecs = evs$vectors

  W = W %*% evecs
  Z = X %*% W

  Xrecon   = Z %*% t(W)
  reconerr = sum((Xrecon - X)^2)

  if (showits)   # Show last iteration
    cat(paste0(format(it), "...", "\n"))

  list(
    scores   = Z,
    loadings = W,
    Xrecon   = Xrecon,
    reconerr = reconerr,
    ll       = ll,
    sigma2   = sigma2
  )
}
Estimation
fit_em = em_ppca(
  X      = X,
  n_comp = 2,
  tol    = 1e-12,
  maxit  = 100
)
Iterations of EM:
1...
2...
str(fit_em)
List of 6
$ scores : num [1:50, 1:2] 3.79 -1.053 0.867 2.382 0.241 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
.. ..$ : NULL
$ loadings: num [1:8, 1:2] 0.126 -0.299 0.468 -0.412 0.444 ...
$ Xrecon : num [1:50, 1:8] 0.383 2.109 0.416 -0.228 1.472 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
.. ..$ : NULL
$ reconerr: num 136
$ ll : num 369
$ sigma2 : num 0.452
Comparison
Extract reconstructed values and loadings for comparison.
Xrecon      = fit_em$Xrecon
loadings_em = fit_em$loadings
scores_em   = fit_em$scores
Compare to standard pca on full data set if desired.
standard_pca = princomp(scale(state.x77))

scores_standard_pca   = standard_pca$scores[, 1:2]
loadings_standard_pca = standard_pca$loadings[, 1:2]
Xrecon_standard_pca   = scores_standard_pca %*% t(loadings_standard_pca)
Compare results to output from pcaMethods, this time using its probabilistic PCA method. Note that the signs of the loadings/scores may differ, but the results should otherwise be comparable.
library(pcaMethods)
fit_pcam = pca(
  X,
  nPcs      = 2,
  threshold = 1e-8,
  method    = 'ppca',
  scale     = 'none',
  center    = FALSE
)

loadings_pcam = loadings(fit_pcam)
scores_pcam   = scores(fit_pcam)
Compare loadings and scores.
round(cbind(loadings_pcam, loadings_em, loadings_standard_pca), 3)
PC1 PC2 Comp.1 Comp.2
Population 0.126 0.411 0.126 0.411 0.126 0.411
Income -0.299 0.519 -0.299 0.519 -0.299 0.519
Illiteracy 0.468 0.053 0.468 0.053 0.468 0.053
Life Exp -0.412 -0.082 -0.412 -0.082 -0.412 -0.082
Murder 0.444 0.307 0.444 0.307 0.444 0.307
HS Grad -0.425 0.299 -0.425 0.299 -0.425 0.299
Frost -0.357 -0.154 -0.357 -0.154 -0.357 -0.154
Area -0.033 0.588 -0.033 0.588 -0.033 0.588
sum((abs(loadings_pcam) - abs(loadings_em)) ^ 2)
[1] 1.324404e-18
cbind(scores_pcam, data.frame(EM = scores_em)) %>%
head()
PC1 PC2 EM.1 EM.2
Alabama 3.7898873 -0.2347790 3.7898873 -0.2347790
Alaska -1.0531355 5.4561751 -1.0531355 5.4561751
Arizona 0.8674288 0.7450615 0.8674288 0.7450615
Arkansas 2.3817776 -1.2883437 2.3817776 -1.2883437
California 0.2413815 3.5095228 0.2413815 3.5095228
Colorado -2.0621814 0.5056639 -2.0621814 0.5056639
Compare reconstructed data sets.
Xrecon_pcam = scores_pcam %*% t(loadings_pcam)
mean((Xrecon_pcam - X)^2)
[1] 0.3392252
mean(abs(Xrecon_pcam - Xrecon))
[1] 3.905797e-10
Visualize
Show the reconstructed vs. observed data.
Compare component scores.
Missing Data Example
A slightly revised approach can be taken in the case of missing values.
Data Setup
# create some missing values
set.seed(123)
X_miss  = X
NAindex = sample(length(X), 20)

X_miss[NAindex] = NA
Function
This estimating function largely follows the previous one, with an additional step that fills in the missing values with their projected values at each iteration.
em_ppca_miss <- function(
  X,
  n_comp  = 2,
  tol     = .00001,
  maxits  = 100,
  showits = TRUE
) {
  # Arguments
  # X: numeric data
  # n_comp: number of components
  # tol: tolerance level
  # maxits: maximum iterations
  # showits: show iterations

  # require(pracma) # for orthonormal basis of W; pcaMethods package has this
  # also, see the basic orth function above

  tr <- function(x) sum(diag(x), na.rm = TRUE)   # matrix trace

  # starting points and other initializations
  X_orig = X
  N   = nrow(X_orig)
  D   = ncol(X_orig)
  L   = n_comp
  NAs = is.na(X_orig)

  X[NAs] = 0

  S = (1/N) * t(X) %*% X

  evals = eigen(S)$values
  evecs = eigen(S)$vectors

  V      = evecs[, 1:L]
  Lambda = diag(evals[1:L])

  # latent variables
  Z = t(replicate(L, rnorm(N)))

  # variance; average variance associated with discarded dimensions
  sigma2 = 1/(D - L) * sum(evals[(L + 1):D])

  # loadings
  W = V %*% chol(Lambda - sigma2 * diag(L)) %*% diag(L)

  it = 0
  converged = FALSE
  ll = 0

  # Show iterations
  if (showits)
    cat(paste("Iterations of EM:", "\n"))

  while ((!converged) & (it < maxits)) {
    if (exists('W_new')) {
      W_old = W_new
    }
    else {
      W_old = W
    }

    ll_old = ll

    # deal with missingness via projection
    proj  = t(W_old %*% Z)
    X_new = X_orig
    X_new[NAs] = proj[NAs]
    X = X_new

    Psi = sigma2 * diag(L)
    M   = t(W_old) %*% W_old + Psi

    # E and M
    W_new  = S %*% W_old %*% solve(Psi + solve(M) %*% t(W_old) %*% S %*% W_old)
    sigma2 = 1/D * tr(S - S %*% W_old %*% solve(M) %*% t(W_new))

    Z = solve(M) %*% t(W_new) %*% t(X)

    # log likelihood as in the paper
    # ZZ = sigma2*solve(M) + Z%*%t(Z)
    # ll = .5*sigma2*D + .5*tr(ZZ) + .5*sigma2 * X%*%t(X) -
    #   1/sigma2 * t(Z)%*%t(W_new)%*%t(X) + .5*sigma2 * tr(t(W_new)%*%W_new%*%ZZ)
    # ll = -sum(ll)

    # more straightforward
    ll = dnorm(X, mean = t(W_new %*% Z), sd = sqrt(sigma2), log = TRUE)
    ll = -sum(ll)

    it = it + 1

    # if showits, show first and every 5th iteration
    if (showits & (it == 1 | it %% 5 == 0))
      cat(paste(format(it), "...", "\n", sep = ""))

    converged = max(abs(ll_old - ll)) <= tol
  }

  W     = pracma::orth(W_new)   # for orthonormal basis of W
  evs   = eigen(cov(X %*% W))
  evecs = evs$vectors

  W = W %*% evecs
  Z = X %*% W

  Xrecon   = Z %*% t(W)
  reconerr = sum((Xrecon - X)^2)

  if (showits)   # Show last iteration
    cat(paste0(format(it), "...", "\n"))

  list(
    scores   = Z,
    loadings = W,
    Xrecon   = Xrecon,
    reconerr = reconerr,
    ll       = ll,
    sigma2   = sigma2
  )
}
Estimation
Run the PCA.
fit_em_miss = em_ppca_miss(
  X      = X_miss,
  n_comp = 2,
  tol    = 1e-8,
  maxit  = 100
)
Iterations of EM:
1...
5...
10...
15...
19...
str(fit_em_miss)
List of 6
$ scores : num [1:50, 1:2] 3.79 -1.07 0.86 2.35 0.24 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
.. ..$ : NULL
$ loadings: num [1:8, 1:2] 0.133 -0.299 0.422 -0.431 0.464 ...
$ Xrecon : num [1:50, 1:8] 0.414 1.998 0.448 -0.2 1.521 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
.. ..$ : NULL
$ reconerr: num 130
$ ll : num 368
$ sigma2 : num 0.475
Comparison
Extract reconstructed values and loadings for comparison.
Xrecon      = fit_em_miss$Xrecon
loadings_em = fit_em_miss$loadings
scores_em   = fit_em_miss$scores
Compare to standard pca on full data set if desired.
standard_pca = princomp(scale(state.x77))

scores_standard_pca   = standard_pca$scores[, 1:2]
loadings_standard_pca = standard_pca$loadings[, 1:2]
Xrecon_standard_pca   = scores_standard_pca %*% t(loadings_standard_pca)
Compare results to output from pcaMethods, whose probabilistic PCA also handles the missing values. Note that the signs of the loadings/scores may differ, but the results should otherwise be comparable.
library(pcaMethods)
fit_pcam = pca(
  X_miss,
  nPcs      = 2,
  threshold = 1e-8,
  method    = 'ppca',
  scale     = 'none',
  center    = FALSE
)

loadings_pcam = loadings(fit_pcam)
scores_pcam   = scores(fit_pcam)
Compare loadings and scores.
round(cbind(loadings_pcam, loadings_em, loadings_standard_pca), 3)
PC1 PC2 Comp.1 Comp.2
Population -0.128 -0.416 0.133 0.396 0.126 0.411
Income 0.305 -0.492 -0.299 0.537 -0.299 0.519
Illiteracy -0.434 -0.046 0.422 0.076 0.468 0.053
Life Exp 0.432 0.077 -0.431 -0.080 -0.412 -0.082
Murder -0.464 -0.284 0.464 0.290 0.444 0.307
HS Grad 0.424 -0.288 -0.433 0.318 -0.425 0.299
Frost 0.346 0.170 -0.353 -0.185 -0.357 -0.154
Area 0.041 -0.620 -0.037 0.569 -0.033 0.588
sum((abs(loadings_pcam) - abs(loadings_em)) ^ 2)
[1] 0.00738241
cbind(scores_pcam, data.frame(EM = scores_em)) %>%
head()
PC1 PC2 EM.1 EM.2
Alabama -3.7959034 0.2274074 3.7935243 -0.2316100
Alaska 1.0806631 -5.4908857 -1.0714535 5.4128832
Arizona -0.8672862 -0.7758984 0.8599622 0.8425754
Arkansas -2.3599676 1.2461202 2.3527582 -1.3002423
California -0.1899324 -4.0171723 0.2399266 3.7656674
Colorado 1.4732282 -0.4045470 -1.4689157 0.4135982
Compare reconstructed data sets.
Xrecon_pcam = scores_pcam %*% t(loadings_pcam)
mean((Xrecon_pcam[NAindex]-X[NAindex])^2)
[1] 0.5047378
mean(abs(Xrecon_pcam - Xrecon))
[1] 0.02893291
Visualize
Visualize as before.
Source
Original code available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/EM%20Examples/EM%20algorithm%20for%20ppca.R
Original code for the missing data example available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/EM%20Examples/EM%20algorithm%20for%20ppca%20with%20missing.R
State Space Model
The following regards chapter 11 in Statistical Modeling and Computation, the first example for an unobserved components model. The data regards inflation based on the U.S. consumer price index (\(\textrm{infl} = 400 \cdot \log(cpi_t / cpi_{t-1})\)), from the second quarter of 1947 to the second quarter of 2011. You can acquire the data here or in the Datasets repo. Just note that it has two mystery columns and one mystery row presumably supplied by Excel. You can also get the CPI data yourself at the Bureau of Labor Statistics in a frustrating fashion, or in a much easier fashion here.
For the following, I use n instead of t or T, because those are the transpose function and TRUE in R. The model is basically \(y_n = \tau_n + \epsilon_n\), with \(\epsilon_n \sim N(0, \sigma^2)\), and \(\tau_n = \tau_{n-1} + \upsilon_n\), with \(\upsilon_n \sim N(0, \omega^2)\). Thus each \(y_n\) is associated with a latent variable that follows a random walk over time. \(\omega^2\) serves as a smoothing parameter, which itself may be estimated but which is fixed in the following. See the text for more details.
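In the implementation below, the random walk prior is expressed jointly. With \(H\) the \(n \times n\) first-difference matrix (ones on the diagonal, minus ones on the first subdiagonal) and \(\Omega = \mathrm{diag}(\omega_0^2, \omega^2, \ldots, \omega^2)\), we have \(H\tau \sim N(0, \Omega)\), so \(\tau \sim N\big(0, (H^\top \Omega^{-1} H)^{-1}\big)\). The E step then uses the standard Gaussian posterior

\[
\tau \mid y, \sigma^2 \sim N\big(K^{-1} y / \sigma^2, \; K^{-1}\big), \qquad
K = H^\top \Omega^{-1} H + I_n / \sigma^2
\]

and the M step updates \(\sigma^2 = \frac{1}{n}\big(\mathrm{tr}(K^{-1}) + (y - \tau)^\top (y - \tau)\big)\), where the trace is computed from the eigenvalues of \(K\) in the code.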
Data Setup
library(tidyverse)
d = read_csv(
  'https://raw.githubusercontent.com/m-clark/Datasets/master/us%20cpi/USCPI.csv',
  col_names = FALSE
)

inflation = as.matrix(d$X1)

summary(inflation)
summary(inflation)
V1
Min. :-9.557
1st Qu.: 1.843
Median : 3.248
Mean : 3.634
3rd Qu.: 4.819
Max. :15.931
Function
EM function for a state space model.
em_state_space <- function(
  params,
  y,
  omega2_0,
  omega2,
  tol     = .00001,
  maxits  = 100,
  showits = FALSE
) {
  # Arguments
  # params: starting parameters (variance as 'sigma2')
  # y: data
  # omega2_0: latent variance for the first period (a noisier starting variance)
  # omega2: latent variance
  # tol: tolerance
  # maxits: maximum iterations
  # showits: whether to show iterations

  # Not really needed here, but would be a good idea generally to take advantage
  # of sparse representation for large data
  # require(spam)   # see usage below

  # Starting points
  n      = length(y)
  sigma2 = params$sigma2

  # Other initializations
  H = diag(n)

  for (i in 1:(ncol(H) - 1)) {
    H[i + 1, i] = -1
  }

  Omega2 = spam::as.spam(diag(omega2, n))
  Omega2[1, 1] = omega2_0

  H = spam::as.spam(H)
  HinvOmega2H = t(H) %*% spam::chol2inv(spam::chol(Omega2)) %*% H  # tau ~ N(0, (H' Omega2^-1 H)^-1)

  it = 0
  converged = FALSE

  if (showits)   # Show iterations
    cat(paste("Iterations of EM:", "\n"))

  while ((!converged) & (it < maxits)) {
    sigma2Old    = sigma2[1]
    Sigma2invOld = diag(n)/sigma2Old

    K   = HinvOmega2H + Sigma2invOld   # E
    tau = solve(K, y/sigma2Old)        # tau | y, sigma2_old, omega2 ~ N(K^-1 y/sigma2, K^-1)
    K_inv_tr = sum(1/eigen(K)$values)

    sigma2 = 1/n * (K_inv_tr + crossprod(y - tau))   # M

    converged = max(abs(sigma2 - sigma2Old)) <= tol

    it = it + 1

    # if showits true, & it = 1 or divisible by 5, print message
    if (showits & it == 1 | it %% 5 == 0)
      cat(paste(format(it), "...", "\n", sep = ""))
  }

  # recompute at the final sigma2
  Kfinal   = HinvOmega2H + diag(n)/sigma2[1]
  taufinal = solve(Kfinal, (y/sigma2[1]))

  list(sigma2 = sigma2, tau = taufinal)
}
Estimation
ss_mod_1 = em_state_space(
  params   = data.frame(sigma2 = var(inflation)),
  y        = inflation,
  tol      = 1e-10,
  omega2_0 = 9,
  omega2   = 1^2
)
5...
10...
15...
20...
25...
30...
ss_mod_.5 = em_state_space(
  params   = data.frame(sigma2 = var(inflation)),
  y        = inflation,
  tol      = 1e-10,
  omega2_0 = 9,
  omega2   = .5^2
)
5...
10...
15...
20...
# more smooth
ss_mod_.1 = em_state_space(
  params   = data.frame(sigma2 = var(inflation)),
  y        = inflation,
  tol      = 1e-10,
  omega2_0 = 9,
  omega2   = .1^2
)
5...
10...
ss_mod_1$sigma2

         [,1]
[1,] 2.765182

ss_mod_.5$sigma2

         [,1]
[1,] 4.404707

ss_mod_.1$sigma2

         [,1]
[1,] 7.489429
Visualization
library(lubridate)
series = ymd(
  paste0(
    rep(1947:2014, e = 4),
    '-',
    c('01', '04', '07', '10'),
    '-',
    '01'
  )
)
The following corresponds to Fig. 11.1 in the text. We'll also compare to a generalized additive model fit via geom_smooth (greenish line). We can see the .1 model (light blue) is overly smooth.
library(tidyverse)
data.frame(
  series    = series[1:length(inflation)],
  inflation = inflation,
  Mod_1     = ss_mod_1$tau,
  Mod_.5    = ss_mod_.5$tau,
  Mod_.1    = ss_mod_.1$tau
) %>%
  ggplot(aes(x = series, y = inflation)) +
  geom_point(color = 'gray50', alpha = .25) +
  geom_line(aes(y = Mod_1),  color = '#ff5500') +
  geom_line(aes(y = Mod_.5), color = '#9E1B34') +
  geom_line(aes(y = Mod_.1), color = '#00aaff') +
  geom_smooth(
    formula = y ~ s(x, bs = 'gp', k = 50),
    se      = FALSE,
    color   = '#1b9e86',
    method  = 'gam',
    size    = .5
  ) +
  scale_x_date(date_breaks = '10 years') +
  labs(x = '')