Bayesian Beta Regression

The following provides an example of beta regression using Stan/rstan, with comparison to results with R’s betareg package.

Data Setup

Several data sets from are available betareg to play with, but as they are a bit problematic in one way or another I instead focus on a simple simulated data set.

library(tidyverse)
library(betareg)

# Data for assessing the contribution of non-verbal IQ to children's reading
# skills in dyslexic and non-dyslexic children.
# issue: 30% of data has a value of .99
# data("ReadingSkills")
# ?ReadingSkills
# y = ReadingSkills$accuracy 
# 
# brmod = betareg(accuracy ~ dyslexia + iq, data = ReadingSkills)
# X = cbind(1, scale(model.matrix(brmod)[,c('dyslexia','iq')], scale=F))


# or this, issue: ignores batch effects
# data("GasolineYield")
# ?GasolineYield
#
# y = GasolineYield$yield
# X = cbind(1, scale(GasolineYield[,c('gravity','pressure','temp')]))

# yet another data option, issue: only two binary predictors
# data(WeatherTask)
# ?WeatherTask
#
# y = WeatherTask$agreement
# brmod = betareg(agreement ~ priming + eliciting, data = WeatherTask)
# X = model.matrix(brmod)

# simulated data; probably a better illustration, or at least better behaved one.
set.seed(1234)

N    = 500                    # Sample size
x_1  = rnorm(N)               # Predictors
x_2  = rnorm(N)
X    = cbind(1, x_1, x_2)
beta = c(-1, .2, -.3)
mu   = plogis(X %*% beta)  # add noise if desired + rnorm(N, sd=.01)
phi  = 10

A = mu * phi
B = (1 - mu) * phi

y = rbeta(N, A, B)

d = data.frame(x_1, x_2, y)

qplot(y, geom='density')

Model Code

data {
  int<lower=1> N;                      // sample size
  int<lower=1> K;                      // K predictors
  vector<lower=0,upper=1>[N] y;        // response 
  matrix[N,K] X;                       // predictor matrix
}

parameters {
  vector[K] theta;                     // reg coefficients
  real<lower=0> phi;                   // dispersion parameter
}

transformed parameters{
  vector[K] beta;

  beta = theta * 5;                    // same as beta ~ normal(0, 5); fairly diffuse
}

model {
  // model calculations
  vector[N] LP;                        // linear predictor
  vector[N] mu;                        // transformed linear predictor
  vector[N] A;                         // parameter for beta distn
  vector[N] B;                         // parameter for beta distn

  LP = X * beta;
  
  for (i in 1:N) { 
    mu[i] = inv_logit(LP[i]);   
  }

  A = mu * phi;
  B = (1.0 - mu) * phi;

  // priors
  theta ~ normal(0, 1);   
  phi ~ cauchy(0, 5);                  // different options for phi  
  //phi ~ inv_gamma(.001, .001);
  //phi ~ uniform(0, 500);             // put upper on phi if using this

  // likelihood
  y ~ beta(A, B);
}

generated quantities {
  vector[N] y_rep;
  
  for (i in 1:N) { 
    real mu;
    real A;
    real B;
    
    mu = inv_logit(X[i] * beta);   
    
    A = mu * phi;
    B = (1.0 - mu) * phi;
    
    y_rep[i] = beta_rng(A, B); 
  }
}

Estimation

We create a data list for Stan and estimate the model.

# Stan data list
stan_data = list(N = length(y),
                 K = ncol(X),
                 y = y,
                 X = X)

library(rstan)

fit = sampling(
  bayes_beta,
  data = stan_data,
  thin = 4,
  verbose = FALSE
)


# model for later comparison
brmod = betareg(y ~ ., data = d)

Comparison

Estimates are almost idential in this particular case.

print(
  fit,
  pars = c('beta', 'phi'),
  digits_summary = 3,
  probs = c(.025, .5, .975)
)
Inference for Stan model: 71bce272e309aa5260f24d407b92d24c.
4 chains, each with iter=2000; warmup=1000; thin=4; 
post-warmup draws per chain=250, total post-warmup draws=1000.

          mean se_mean    sd   2.5%    50%  97.5% n_eff  Rhat
beta[1] -0.988   0.001 0.030 -1.048 -0.987 -0.928   973 0.999
beta[2]  0.126   0.001 0.029  0.070  0.124  0.184   930 1.003
beta[3] -0.327   0.001 0.031 -0.387 -0.327 -0.262   930 1.003
phi     10.599   0.022 0.673  9.302 10.591 11.892   952 0.999

Samples were drawn using NUTS(diag_e) at Wed Nov 25 17:33:34 2020.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at 
convergence, Rhat=1).
summary(brmod)

Call:
betareg(formula = y ~ ., data = d)

Standardized weighted residuals 2:
    Min      1Q  Median      3Q     Max 
-4.1584 -0.5925  0.0781  0.6632  2.6250 

Coefficients (mean model with logit link):
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.98934    0.02982  -33.18  < 2e-16 ***
x_1          0.12662    0.02801    4.52 6.18e-06 ***
x_2         -0.32758    0.03049  -10.74  < 2e-16 ***

Phi coefficients (precision model with identity link):
      Estimate Std. Error z value Pr(>|z|)    
(phi)  10.6731     0.6545   16.31   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Type of estimator: ML (maximum likelihood)
Log-likelihood: 339.1 on 4 Df
Pseudo R-squared: 0.2057
Number of iterations: 12 (BFGS) + 2 (Fisher scoring) 

Visualization

Posterior predictive check.

library(bayesplot)

pp_check(
  stan_data$y,
  rstan::extract(fit, par = 'y_rep')$y_rep[1:10, ], 
  fun = 'dens_overlay'
)