# Hamiltonian Monte Carlo

The following demonstrates Hamiltonian Monte Carlo, the technique that Stan uses, and which is a different estimation approach than the Gibbs sampler in BUGS/JAGS. If you are interested in the details enough to be reading this, I highly recommend Betancourt’s conceptual introduction to HMC. This example is largely based on the code in the appendix of Gelman.

• Gelman et al. 2013. Bayesian Data Analysis. 3rd ed.

## Data Setup

As with the Metropolis–Hastings chapter, we use some data for a standard linear regression.

library(tidyverse)

# fix the RNG stream so the simulated data are reproducible
set.seed(8675309)

# simulate an N x K matrix of standard-normal covariates
N = 250
K = 3

covariates = matrix(rnorm(N * K), nrow = N, ncol = K)
colnames(covariates) = c('X1', 'X2', 'X3')

# model matrix: prepend a constant column for the intercept
X = cbind(Intercept = 1, covariates)

# generate the target as a linear function of the covariates plus
# normal noise with sd = sigma
coefs = c(5, .2, -1.5, .9)
sigma = 2
mu = X %*% coefs
y  = rnorm(N, mu, sigma)

# equivalently:
# y = 5 + .2*X1 - 1.5*X2 + .9*X3 + rnorm(N, mean = 0, sd = 2)

# least-squares fit kept for comparison with the HMC estimates later
fit_lm = lm(y ~ ., data = data.frame(X[, -1]))
# summary(fit_lm)

## Functions

log_posterior <- function(X, y, th) {
  # Log posterior (up to an additive constant) for linear regression with
  # priors: beta_j ~ N(0, sd = 10) independently, and
  # sigma^2 ~ inverse-gamma(.001, .001).
  #
  # Args:
  #   X:  the model matrix (first column is the intercept)
  #   y:  the target vector
  #   th: theta, the current parameter estimates c(beta, sigma);
  #       sigma is the LAST element
  #
  # Returns: the log posterior density (a 1x1 matrix, or -Inf when the
  #   scale parameter is invalid).

  beta   = th[-length(th)]            # reg coefs to be estimated
  sigma  = th[length(th)]             # sigma to be estimated
  sigma2 = sigma^2

  # scale parameter must be positive, so the posterior density is zero
  # (log density -Inf) if the sampler jumps to sigma <= 0
  if (is.nan(sigma) || sigma <= 0) {
    return(-Inf)
  }

  mu = X %*% beta

  # prior hyperparameters; the prior precision matrix is sized from beta
  # rather than hard-coded, so the function works for any K
  priorbvarinv = diag(1/100, length(beta))
  prioralpha   = priorbeta = .001

  # log posterior in this conjugate setting: conceptually (log) likelihood
  # + (log) priors, dropping terms constant in the parameters.
  # (See the commented block below for a readable dmvnorm/dgamma version.)
  -.5*nrow(X)*log(sigma2) - (.5*(1/sigma2) * (crossprod(y-mu))) +
  -.5*ncol(X)*log(sigma2) - (.5*(1/sigma2) * (t(beta) %*% priorbvarinv %*% beta)) +
  -(prioralpha + 1)*log(sigma2) + log(sigma2) - priorbeta/sigma2

  # alternative, more transparent formulation:
  # ll      = mvtnorm::dmvnorm(y, mean=mu, sigma=diag(sigma2, length(y)), log=TRUE)
  # priorb  = mvtnorm::dmvnorm(beta, mean=rep(0, length(beta)), sigma=diag(100, length(beta)), log=TRUE)
  # priors2 = dgamma(1/sigma2, prioralpha, priorbeta, log=TRUE)
  # ll + priorb + priors2
}

The following is the numerical gradient function as given in BDA3 p. 602. It has the same arguments as the log posterior function.

gradient_theta <- function(X, y, th) {
  # Central finite-difference approximation to the gradient of the log
  # posterior (as given in BDA3, p. 602). Same arguments as log_posterior.
  eps  = .0001
  grad = numeric(length(th))

  for (k in seq_along(th)) {
    # perturbation vector: eps in position k, zero elsewhere
    bump    = replace(numeric(length(th)), k, eps)
    grad[k] = (log_posterior(X, y, th + bump) -
               log_posterior(X, y, th - bump)) / (2 * eps)
  }

  grad
}

The following is a function for a single HMC iteration. ϵ and L are drawn randomly at each iteration to explore other areas of the posterior (starting with epsilon0 and L0); The mass matrix M, expressed as a vector, is a bit of a magic number in this setting. It regards the mass of a particle whose position is represented by $$\theta$$, and momentum by $$\phi$$. See the sampling section of the Stan manual for more detail.

hmc_iteration <- function(X, y, th, epsilon, L, M) {
  # One HMC transition: draw a momentum, simulate Hamiltonian dynamics with
  # the leapfrog integrator, then accept/reject with a Metropolis step.
  #
  # Args:
  #   X, y:    data, passed through to log_posterior/gradient_theta
  #   th:      current parameter vector c(beta, sigma)
  #   epsilon: the stepsize
  #   L:       the number of leapfrog steps
  #   M:       a diagonal mass matrix, supplied as a vector
  #
  # Returns: list(th = new parameter estimates, p_jump = acceptance probability)

  # initialization
  M_inv = 1/M
  d   = length(th)
  phi = rnorm(d, 0, sqrt(M))   # momentum draw
  th_old = th

  # log joint density (log posterior minus kinetic energy) at the start
  log_p_old = log_posterior(X, y, th) - .5*sum(M_inv * phi^2)

  # leapfrog: initial half step for the momentum...
  phi = phi + .5 * epsilon * gradient_theta(X, y, th)

  # ...then alternating full steps for position and momentum, with the
  # final momentum update being a half step (the l == L case)
  for (l in 1:L) {
    th = th + epsilon*M_inv*phi
    # scalar condition, so use if/else rather than ifelse()
    step_frac = if (l == L) .5 else 1
    phi = phi + step_frac * epsilon * gradient_theta(X, y, th)
  }

  # here we get into standard MCMC stuff, jump or not based on a draw from a
  # proposal distribution; negating phi makes the proposal symmetric (it has
  # no numeric effect below since phi enters only through phi^2)
  phi = -phi
  log_p_star = log_posterior(X, y, th) - .5*sum(M_inv * phi^2)
  r = exp(log_p_star - log_p_old)

  # a NaN ratio (e.g. -Inf - -Inf) means the trajectory diverged: reject
  if (is.nan(r)) r = 0

  p_jump = min(r, 1)

  th_new = if (runif(1) < p_jump) th else th_old

  # returns estimates and acceptance rate
  list(th = th_new, p_jump = p_jump)
}

Main HMC function.

hmc_run <- function(starts, iter, warmup, epsilon_0, L_0, M, X, y) {
  # Run one or more HMC chains by repeatedly calling hmc_iteration.
  #
  # Args:
  #   starts:    matrix of starting values; each row is one chain
  #   iter:      total number of simulations for each chain
  #   warmup:    how many initial iterations to ignore for inference
  #   epsilon_0: the baseline stepsize (jittered each iteration)
  #   L_0:       the baseline number of leapfrog steps (jittered each iteration)
  #   M:         the mass vector
  #   X, y:      data, passed through to hmc_iteration
  #
  # Returns: list(sims = iter x chains x d array of draws,
  #               p_jump = iter x chains matrix of acceptance probabilities)
  chains = nrow(starts)
  d = ncol(starts)
  sims = array(NA,
               c(iter, chains, d),
               dimnames = list(NULL, NULL, colnames(starts)))
  p_jump = matrix(NA, iter, chains)

  for (j in 1:chains) {
    th = starts[j, ]

    for (t in 1:iter) {
      # draw epsilon and L around their baselines to explore other areas
      # of the posterior
      epsilon = runif(1, 0, 2*epsilon_0)
      L       = ceiling(2*L_0*runif(1))

      temp = hmc_iteration(X, y, th, epsilon, L, M)

      p_jump[t, j] = temp$p_jump
      sims[t, j, ] = temp$th

      th = temp$th
    }
  }

  # post-warmup acceptance rate, one value per chain (drop = FALSE keeps
  # the matrix shape when there is a single chain)
  acc = round(colMeans(p_jump[(warmup + 1):iter, , drop = FALSE]), 3)
  message('Avg acceptance probability for each chain: ',
          paste(acc, collapse = ', '), '\n')

  list(sims = sims, p_jump = p_jump)
}

## Estimation

With the primary functions in place, we set the starting values and choose other settings for the HMC process. The coefficient starting values are based on random draws from a uniform distribution, while $$\sigma$$ is set to a value of one in each case. We'll have 1000 total draws with warm-up set to 500. I don't have any thinning option here, but that could be added or simply done as part of the coda package preparation.

# Starting values and mcmc settings
parnames = c(paste0('beta[', 1:4, ']'), 'sigma')
d = length(parnames)
chains = 2

theta_start = t(replicate(chains, c(runif(d - 1, -1, 1), 1)))
colnames(theta_start) = parnames

nsim = 1000
wu   = 500

We can fiddle with these sampling parameters to get a desirable acceptance rate of around .80. The following work well with the data we have.

stepsize = .08
nLeap    = 10
vars     = rep(1, 5)
mass_vector = 1 / vars

We are now ready to run the model. On my machine and with the above settings, it took a couple seconds. Once complete we can use the coda package if desired as we have done before.

# Run the model
fit_hmc = hmc_run(
  starts    = theta_start,
  iter      = nsim,
  warmup    = wu,
  epsilon_0 = stepsize,
  L_0       = nLeap,
  M         = mass_vector,
  X         = X,
  y         = y
)

# str(fit_hmc, 1)

Using coda, we can get some nice summary information. Results not shown.

library(coda)

theta = as.mcmc.list(list(as.mcmc(fit_hmc$sims[(wu + 1):nsim, 1, ]),
                          as.mcmc(fit_hmc$sims[(wu + 1):nsim, 2, ])))

# summary(theta)

fit_summary = summary(theta)$statistics[, 'Mean']

# extract the posterior-mean point estimates by parameter
beta_est  = fit_summary[1:4]
sigma_est = fit_summary['sigma']   # fix: was the whole vector, not the sigma element

# log_posterior(X, y, fit_summary)

Instead we can use rstan’s monitor function on fit_hmc\$sims to produce typical Stan output.

Table 1: log posterior = -301.716
parameter mean sd 2.5% 97.5% n_eff Rhat Bulk_ESS Tail_ESS
beta[1] 4.898 0.128 4.645 5.139 505 1.006 521 305
beta[2] 0.077 0.133 -0.192 0.334 560 1.012 566 240
beta[3] -1.472 0.122 -1.710 -1.224 628 1.010 630 457
beta[4] 0.825 0.111 0.612 1.042 618 1.000 630 569
sigma 2.013 0.090 1.844 2.204 682 1.004 693 425

## Comparison

Our estimates look pretty good, and inspection of the diagnostics would show good mixing and convergence as well. At this point we can compare it to the Stan output. For the following, I use the same inverse gamma prior and tweaked the control options for a little bit more similarity, but that’s not necessary.

data {                            // Data block
int<lower = 1> N;               // Sample size
int<lower = 1> K;               // Dimension of model matrix (incl. intercept)
matrix [N, K]  X;               // Model Matrix
vector[N] y;                    // Target variable
}

parameters {                      // Parameters block; declarations only
vector[K] beta;                 // Coefficient vector
real<lower = 0> sigma;          // Error scale, constrained positive
}

model {                           // Model block
vector[N] mu;

mu = X * beta;                  // Creation of linear predictor

// priors; beta matches the N(0, sd = 10) prior used in the R code
beta  ~ normal(0, 10);
sigma ~ inv_gamma(.001, .001);  // NOTE(review): R code places the inverse-gamma on sigma^2, not sigma — confirm intended

// likelihood
y ~ normal(mu, sigma);
}

Here are the results.

Table 2: log posterior = -1270.794
term estimate std.error conf.low conf.high
beta 1.421 3.336 -1.761 5.095
beta 1.075 0.818 -0.121 1.710
beta 0.220 1.474 -1.667 1.480
beta -0.094 0.786 -0.738 1.013
sigma 2.034 0.064 1.889 2.199

And finally, the standard least squares fit (Residual standard error = sigma).

Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.898 0.1284 38.13 3.026e-105
X1 0.08408 0.1296 0.6488 0.5171
X2 -1.469 0.1261 -11.64 3.049e-25
X3 0.8196 0.1207 6.793 8.207e-11
Fitting linear model: y ~ .
Observations Residual Std. Error $$R^2$$ Adjusted $$R^2$$
250 2.021 0.4524 0.4458