Hurdle Model

Hurdle models are applied to situations in which target data has relatively many of one value, usually zero, to go along with the other observed values. They are two-part models, a logistic model for whether an observation is zero or not, and a count model for the other part. The key distinction from the usual ‘zero-inflated’ count models, is that the count distribution does not contribute to the excess zeros. While the typical application is count data, the approach can be applied to any distribution in theory.

Poisson

Data Setup

Here we import a simple data set. The example comes from the Stata help file for zinb command. One can compare results with hnblogit command in Stata.

library(tidyverse)

fish = haven::read_dta("http://www.stata-press.com/data/r11/fish.dta")

Function

The likelihood function is of two parts, one a logistic model, the other, a poisson count model.

hurdle_poisson_ll <- function(y, X, par) {
  # Extract parameters
  logitpars = par[grep('logit', names(par))]
  poispars  = par[grep('pois', names(par))]
  
  # Logit model part
  Xlogit = X
  ylogit = ifelse(y == 0, 0, 1)
  
  LPlogit = Xlogit %*% logitpars
  mulogit = plogis(LPlogit)
  
  # Calculate the likelihood
  logliklogit = -sum( ylogit*log(mulogit) + (1 - ylogit)*log(1 - mulogit) )  
  
  # Poisson part
  Xpois = X[y > 0, ]
  ypois = y[y > 0]
  
  mupois = exp(Xpois %*% poispars)
  
  # Calculate the likelihood
  loglik0    = -mupois
  loglikpois = -sum(dpois(ypois, lambda = mupois, log = TRUE)) + 
    sum(log(1 - exp(loglik0)))
  
  # combine likelihoods
  loglik = loglikpois + logliklogit
  loglik
}

Get some starting values from glm For these functions, and create a named vector for them.

init_mod = glm(
  count ~ persons + livebait,
  data   = fish,
  family = poisson,
  x = TRUE,
  y = TRUE
)

starts = c(logit = coef(init_mod), pois = coef(init_mod))  

Estimation

Use optim. to estimate parameters. I fiddle with some options to reproduce the hurdle function as much as possible.

fit = optim(
  par = starts,
  fn  = hurdle_poisson_ll,
  X   = init_mod$x,
  y   = init_mod$y,
  control = list(maxit = 5000, reltol = 1e-12),
  hessian = TRUE
)
# fit

Extract the elements from the output to create a summary table.

B  = fit$par
se = sqrt(diag(solve(fit$hessian)))
Z  = B/se
p  = ifelse(Z >= 0, pnorm(Z, lower = FALSE)*2, pnorm(Z)*2)
summary_table = round(data.frame(B, se, Z, p), 3)

list(summary = summary_table, ll = fit$value)
$summary
                       B    se      Z     p
logit.(Intercept) -1.417 0.491 -2.888 0.004
logit.persons      0.206 0.117  1.761 0.078
logit.livebait     0.711 0.403  1.766 0.077
pois.(Intercept)  -2.057 0.341 -6.035 0.000
pois.persons       0.750 0.043 17.378 0.000
pois.livebait      1.851 0.307  6.023 0.000

$ll
[1] 882.2514

Comparison

Compare to hurdle from pscl package.

library(pscl)

fit_pscl = hurdle(
  count ~ persons + livebait,
  data = fish,
  zero.dist = "binomial",
  dist = "poisson"
)
coef B se Z p
pscl X.Intercept. -2.057 0.341 -6.035 0.000
pscl persons 0.750 0.043 17.378 0.000
pscl livebait 1.851 0.307 6.023 0.000
pscl X.Intercept..1 -1.417 0.491 -2.888 0.004
pscl persons.1 0.206 0.117 1.762 0.078
pscl livebait.1 0.711 0.403 1.765 0.077
hurdle_poisson_ll logit.(Intercept) -1.417 0.491 -2.888 0.004
hurdle_poisson_ll logit.persons 0.206 0.117 1.761 0.078
hurdle_poisson_ll logit.livebait 0.711 0.403 1.766 0.077
hurdle_poisson_ll pois.(Intercept) -2.057 0.341 -6.035 0.000
hurdle_poisson_ll pois.persons 0.750 0.043 17.378 0.000
hurdle_poisson_ll pois.livebait 1.851 0.307 6.023 0.000

Negative Binomial

Function

The likelihood function.

hurdle_nb_ll <- function(y, X, par) {
  # Extract parameters
  logitpars  = par[grep('logit', names(par))]
  NegBinpars = par[grep('NegBin', names(par))]
  
  theta = exp(par[grep('theta', names(par))])
  
  # Logit model part
  Xlogit = X
  ylogit = ifelse(y == 0, 0, 1)
  
  LPlogit = Xlogit%*%logitpars
  mulogit =  plogis(LPlogit)
  
  # Calculate the likelihood
  logliklogit = -sum( ylogit*log(mulogit) + (1 - ylogit)*log(1 - mulogit) )
  
  #NB part
  XNB = X[y > 0, ]
  yNB = y[y > 0]
  
  muNB = exp(XNB %*% NegBinpars)
  
  # Calculate the likelihood
  loglik0  = dnbinom(0,   mu = muNB, size = theta, log = TRUE)
  loglik1  = dnbinom(yNB, mu = muNB, size = theta, log = TRUE)
  loglikNB = -( sum(loglik1) - sum(log(1 - exp(loglik0))) )
  
  # combine likelihoods
  loglik = loglikNB + logliklogit
  loglik
}

Estimation

starts =  c(
  logit  = coef(init_mod),
  NegBin = coef(init_mod),
  theta  = 1
)

fit_nb = optim(
  par = starts,
  fn  = hurdle_nb_ll,
  X   = init_mod$x,
  y   = init_mod$y,
  control = list(maxit = 5000, reltol = 1e-12),
  method  = "BFGS",
  hessian = TRUE
)
# fit_nb 

B  = fit_nb$par
se = sqrt(diag(solve(fit_nb$hessian)))
Z  = B/se
p  = ifelse(Z >= 0, pnorm(Z, lower = FALSE)*2, pnorm(Z)*2)

summary_table = round(data.frame(B, se, Z, p), 3)

list(summary = summary_table, ll = fit_nb$value)
$summary
                        B    se      Z     p
logit.(Intercept)  -1.417 0.491 -2.888 0.004
logit.persons       0.206 0.117  1.762 0.078
logit.livebait      0.711 0.403  1.765 0.077
NegBin.(Intercept) -3.461 0.869 -3.984 0.000
NegBin.persons      0.941 0.153  6.154 0.000
NegBin.livebait     1.985 0.639  3.109 0.002
theta              -1.301 0.576 -2.257 0.024

$ll
[1] 439.3686

Comparison

fit_pscl = hurdle(
  count ~ persons + livebait,
  data = fish,
  zero.dist = "binomial",
  dist = "negbin"
)

# summary(fit_pscl)$coefficients
# summary_table
coef B se Z p
pscl X.Intercept. -3.461 0.869 -3.984 0.000
pscl persons 0.941 0.153 6.154 0.000
pscl livebait 1.985 0.639 3.109 0.002
pscl Log.theta. -1.301 0.576 -2.257 0.024
pscl X.Intercept..1 -1.417 0.491 -2.888 0.004
pscl persons.1 0.206 0.117 1.762 0.078
pscl livebait.1 0.711 0.403 1.765 0.077
hurdle_nb_ll logit.(Intercept) -1.417 0.491 -2.888 0.004
hurdle_nb_ll logit.persons 0.206 0.117 1.762 0.078
hurdle_nb_ll logit.livebait 0.711 0.403 1.765 0.077
hurdle_nb_ll NegBin.(Intercept) -3.461 0.869 -3.984 0.000
hurdle_nb_ll NegBin.persons 0.941 0.153 6.154 0.000
hurdle_nb_ll NegBin.livebait 1.985 0.639 3.109 0.002
hurdle_nb_ll theta -1.301 0.576 -2.257 0.024