Bayesian Nonparametric Models
The following provides some conceptual code for the Chinese restaurant process and the Indian buffet process, which apply to categorical latent variables and to continuous or combinations of categorical latent variables, respectively. For more detail, see the Bayesian nonparametric section of my structural equation modeling document.
Chinese Restaurant Process
To start, we have a couple of functions demonstrating the Chinese restaurant process. The first is succinct and more conceptual, but notably slower.
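For reference, the seating rule both functions implement is the standard Chinese restaurant process: customer $i$ joins an existing table in proportion to how many customers are already seated there, or starts a new table in proportion to $\alpha$,

$$
P(\text{table } k) = \frac{n_k}{i - 1 + \alpha}, \qquad
P(\text{new table}) = \frac{\alpha}{i - 1 + \alpha},
$$

where $n_k$ is the number of customers already at table $k$.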
crp <- function(alpha, n) {
  table_assignments = 1
  
  for (i in 2:n){
    table_counts = table(table_assignments)      # counts of table assignments
    nt           = length(table_counts)          # number of tables
    table_prob   = table_counts/(i - 1 + alpha)  # probabilities of previous table assignments
    
    # sample assignment based on probability of current tables and potential next table
    current_table_assignment = sample(1:(nt + 1), 1, prob = c(table_prob, 1 - sum(table_prob)))
    
    # concatenate new to previous table assignments
    table_assignments = c(table_assignments, current_table_assignment)
  }
  
  table_assignments
}
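A single draw is just a vector of table labels, one per customer, so tabulating it gives the cluster sizes; the particular labels and sizes below depend on the seed.

set.seed(1)

table(crp(alpha = 1, n = 100))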
The following function is similar to the restaurant function at https://github.com/mcdickenson/shinyapps, and is notably faster, since it preallocates the assignment vector and updates running table counts rather than recomputing them at each iteration.
crpF <- function(alpha, n) {
  table_assignments = c(1, rep(NA, n - 1))
  table_counts      = 1
  
  for (i in 2:n){
    init       = c(table_counts, alpha)
    table_prob = init/sum(init)
    
    current_table_assignment = sample(seq_along(init), 1, prob = table_prob)
    
    table_assignments[i] = current_table_assignment
    
    if (current_table_assignment == length(init)) {
      table_counts[current_table_assignment] = 1
    } else {
      table_counts[current_table_assignment] = table_counts[current_table_assignment] + 1
    }
  }
  
  table_assignments
}
# library(microbenchmark)
# test = microbenchmark(crp(alpha = 1, n = 1000),
# crpF(alpha = 1, n = 1000), times = 100)
# test
# ggplot2::autoplot(test)
Visualize some examples at a given setting.
out = replicate(5, crpF(alpha = 1, n = 500), simplify = FALSE)

library(tidyverse)

map_df(
  out,
  function(x) data.frame(table(x)),
  .id = 'result'
) %>% 
  rename(cluster = x) %>% 
  ggplot(aes(cluster, Freq)) +
  geom_col() +
  facet_grid(~ result)
Visualize cluster membership. With smaller alpha, there is a greater tendency to stick to fewer clusters.
set.seed(123)

n = 100

crp_1     = crp(alpha = 1, n = n)
crp_1_mat = matrix(0, nrow = n, ncol = n_distinct(crp_1))

for (i in 1:n_distinct(crp_1)) {
  crp_1_mat[, i] = ifelse(crp_1 == i, 1, 0)
}

crp_4     = crp(alpha = 5, n = n)
crp_4_mat = matrix(0, nrow = n, ncol = n_distinct(crp_4))

for (i in 1:n_distinct(crp_4)) {
  crp_4_mat[, i] = ifelse(crp_4 == i, 1, 0)
}
heatmaply::heatmaply(
  crp_1_mat,
  Rowv   = FALSE,
  Colv   = FALSE,
  colors = scico::scico(n = 256, alpha = 1, begin = 0, end = 1),
  width  = 400
)

heatmaply::heatmaply(
  crp_4_mat,
  Rowv   = FALSE,
  Colv   = FALSE,
  colors = scico::scico(n = 256, alpha = 1, begin = 0, end = 1),
  width  = 400
)
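As a rough check on the role of alpha, the expected number of occupied tables after n customers is the sum of alpha/(alpha + i - 1) over i = 1 to n, which grows on the order of alpha*log(n). The following sketch compares that expectation to a simulated average using the crpF function above; the helper names, settings, and number of replications are arbitrary.

n_customers = 500
n_sims      = 200

# closed-form expectation for the number of occupied tables
expected_tables  = function(alpha, n) sum(alpha / (alpha + 0:(n - 1)))

# average number of distinct tables across simulated draws
simulated_tables = function(alpha, n, nsim) {
  mean(replicate(nsim, n_distinct(crpF(alpha = alpha, n = n))))
}

sapply(c(.5, 1, 5), function(a) {
  c(
    alpha     = a,
    expected  = expected_tables(a, n_customers),
    simulated = simulated_tables(a, n_customers, n_sims)
  )
})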
Indian Buffet Process
The following demonstrates the Indian buffet process for continuous latent variable settings.
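The generative rule the function implements: the first customer samples a $\text{Poisson}(\alpha)$ number of dishes, and each subsequent customer $i$ takes a previously sampled dish $k$ with probability proportional to its popularity, then tries a Poisson number of new dishes,

$$
P(\text{customer } i \text{ takes dish } k) = \frac{m_k}{i}, \qquad
\text{new dishes} \sim \text{Poisson}\!\left(\frac{\alpha}{i}\right),
$$

where $m_k$ is the number of previous customers who sampled dish $k$.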
ibp <- function(alpha, N){
  # preallocate assignments with upper bound of N*alpha number of latent factors
  assignments = matrix(NA, nrow = N, ncol = N*alpha)
  
  # start with some dishes/assignments
  dishes = rpois(1, alpha)
  zeroes = ncol(assignments) - dishes   # fill in the rest of potential dishes
  
  assignments[1, ] = c(rep(1, dishes), rep(0, zeroes))
  
  for (i in 2:N){
    prev = i - 1
    
    # esoteric line that gets the last dish sampled without a search for it
    last_previously_sampled_dish = sum(colSums(assignments[1:prev, , drop = FALSE]) > 0)
    
    # initialize
    dishes_previously_sampled = matrix(0, nrow = 1, ncol = last_previously_sampled_dish)
    
    # calculate probability of sampling from previous dishes
    dish_prob = colSums(assignments[1:prev, 1:last_previously_sampled_dish, drop = FALSE]) / i
    
    dishes_previously_sampled[1, ] = rbinom(
      n    = last_previously_sampled_dish,
      size = 1,
      prob = dish_prob
    )
    
    # sample new dishes and assign based on results
    new_dishes = rpois(1, alpha/i)
    zeroes     = ncol(assignments) - (last_previously_sampled_dish + new_dishes)
    
    assignments[i, ] = c(dishes_previously_sampled, rep(1, new_dishes), rep(0, zeroes))
  }
  
  # return only the dimensions sampled, counting over all rows so that
  # any new dishes introduced by the final customer are retained
  last_sampled_dish = sum(colSums(assignments) > 0)
  
  assignments[, 1:last_sampled_dish]
}
As before, we can compare different settings.
set.seed(123)

ibp_1 = ibp(1, 100)
ibp_4 = ibp(5, 100)
heatmaply::heatmaply(
  ibp_1,
  Rowv   = FALSE,
  Colv   = FALSE,
  colors = scico::scico(n = 256, alpha = 1, begin = 0, end = 1),
  width  = 400
)

heatmaply::heatmaply(
  ibp_4,
  Rowv   = FALSE,
  Colv   = FALSE,
  colors = scico::scico(n = 256, alpha = 1, begin = 0, end = 1),
  width  = 400
)
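Two standard properties of the IBP provide a quick sanity check: the number of dishes each customer samples is marginally Poisson(alpha), and the total number of dishes is Poisson with mean alpha times the harmonic number of N. The sketch below compares these to the single draws above, so expect some simulation noise.

# average dishes per customer; should be roughly alpha
mean(rowSums(ibp_1))  # alpha = 1
mean(rowSums(ibp_4))  # alpha = 5

# total dishes vs. alpha * sum(1/i) for N = 100
harmonic_100 = sum(1 / (1:100))

c(observed = ncol(ibp_1), expected = 1 * harmonic_100)
c(observed = ncol(ibp_4), expected = 5 * harmonic_100)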
Source
Original code available at: https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/crp.R