Hurdle Model

Hurdle models are applied to situations in which target data has relatively many of one value, usually zero, to go along with the other observed values. They are two-part models, a logistic model for whether an observation is zero or not, and a count model for the other part. The key distinction from the usual ‘zero-inflated’ count models, is that the count distribution does not contribute to the excess zeros. While the typical application is count data, the approach can be applied to any distribution in theory.

Poisson

Data Setup

Here we import a simple data set. The example comes from the Stata help file for zinb command. One can compare results with hnblogit command in Stata.

library(tidyverse)

fish = haven::read_dta("http://www.stata-press.com/data/r11/fish.dta")

Function

The likelihood function is of two parts, one a logistic model, the other, a poisson count model.

hurdle_poisson_ll <- function(y, X, par) {
# Extract parameters
logitpars = par[grep('logit', names(par))]
poispars  = par[grep('pois', names(par))]

# Logit model part
Xlogit = X
ylogit = ifelse(y == 0, 0, 1)

LPlogit = Xlogit %*% logitpars
mulogit = plogis(LPlogit)

# Calculate the likelihood
logliklogit = -sum( ylogit*log(mulogit) + (1 - ylogit)*log(1 - mulogit) )

# Poisson part
Xpois = X[y > 0, ]
ypois = y[y > 0]

mupois = exp(Xpois %*% poispars)

# Calculate the likelihood
loglik0    = -mupois
loglikpois = -sum(dpois(ypois, lambda = mupois, log = TRUE)) +
sum(log(1 - exp(loglik0)))

# combine likelihoods
loglik = loglikpois + logliklogit
loglik
}

Get some starting values from glm For these functions, and create a named vector for them.

init_mod = glm(
count ~ persons + livebait,
data   = fish,
family = poisson,
x = TRUE,
y = TRUE
)

starts = c(logit = coef(init_mod), pois = coef(init_mod))

Estimation

Use optim. to estimate parameters. I fiddle with some options to reproduce the hurdle function as much as possible.

fit = optim(
par = starts,
fn  = hurdle_poisson_ll,
X   = init_mod$x, y = init_mod$y,
control = list(maxit = 5000, reltol = 1e-12),
hessian = TRUE
)
# fit

Extract the elements from the output to create a summary table.

B  = fit$par se = sqrt(diag(solve(fit$hessian)))
Z  = B/se
p  = ifelse(Z >= 0, pnorm(Z, lower = FALSE)*2, pnorm(Z)*2)
summary_table = round(data.frame(B, se, Z, p), 3)

list(summary = summary_table, ll = fit$value)$summary
B    se      Z     p
logit.(Intercept) -1.417 0.491 -2.888 0.004
logit.persons      0.206 0.117  1.761 0.078
logit.livebait     0.711 0.403  1.766 0.077
pois.(Intercept)  -2.057 0.341 -6.035 0.000
pois.persons       0.750 0.043 17.378 0.000
pois.livebait      1.851 0.307  6.023 0.000

$ll [1] 882.2514 Comparison Compare to hurdle from pscl package. library(pscl) fit_pscl = hurdle( count ~ persons + livebait, data = fish, zero.dist = "binomial", dist = "poisson" ) coef B se Z p pscl X.Intercept. -2.057 0.341 -6.035 0.000 pscl persons 0.750 0.043 17.378 0.000 pscl livebait 1.851 0.307 6.023 0.000 pscl X.Intercept..1 -1.417 0.491 -2.888 0.004 pscl persons.1 0.206 0.117 1.762 0.078 pscl livebait.1 0.711 0.403 1.765 0.077 hurdle_poisson_ll logit.(Intercept) -1.417 0.491 -2.888 0.004 hurdle_poisson_ll logit.persons 0.206 0.117 1.761 0.078 hurdle_poisson_ll logit.livebait 0.711 0.403 1.766 0.077 hurdle_poisson_ll pois.(Intercept) -2.057 0.341 -6.035 0.000 hurdle_poisson_ll pois.persons 0.750 0.043 17.378 0.000 hurdle_poisson_ll pois.livebait 1.851 0.307 6.023 0.000 Negative Binomial Function The likelihood function. hurdle_nb_ll <- function(y, X, par) { # Extract parameters logitpars = par[grep('logit', names(par))] NegBinpars = par[grep('NegBin', names(par))] theta = exp(par[grep('theta', names(par))]) # Logit model part Xlogit = X ylogit = ifelse(y == 0, 0, 1) LPlogit = Xlogit%*%logitpars mulogit = plogis(LPlogit) # Calculate the likelihood logliklogit = -sum( ylogit*log(mulogit) + (1 - ylogit)*log(1 - mulogit) ) #NB part XNB = X[y > 0, ] yNB = y[y > 0] muNB = exp(XNB %*% NegBinpars) # Calculate the likelihood loglik0 = dnbinom(0, mu = muNB, size = theta, log = TRUE) loglik1 = dnbinom(yNB, mu = muNB, size = theta, log = TRUE) loglikNB = -( sum(loglik1) - sum(log(1 - exp(loglik0))) ) # combine likelihoods loglik = loglikNB + logliklogit loglik } Estimation starts = c( logit = coef(init_mod), NegBin = coef(init_mod), theta = 1 ) fit_nb = optim( par = starts, fn = hurdle_nb_ll, X = init_mod$x,
y   = init_mod$y, control = list(maxit = 5000, reltol = 1e-12), method = "BFGS", hessian = TRUE ) # fit_nb B = fit_nb$par
se = sqrt(diag(solve(fit_nb$hessian))) Z = B/se p = ifelse(Z >= 0, pnorm(Z, lower = FALSE)*2, pnorm(Z)*2) summary_table = round(data.frame(B, se, Z, p), 3) list(summary = summary_table, ll = fit_nb$value)
$summary B se Z p logit.(Intercept) -1.417 0.491 -2.888 0.004 logit.persons 0.206 0.117 1.762 0.078 logit.livebait 0.711 0.403 1.765 0.077 NegBin.(Intercept) -3.461 0.869 -3.984 0.000 NegBin.persons 0.941 0.153 6.154 0.000 NegBin.livebait 1.985 0.639 3.109 0.002 theta -1.301 0.576 -2.257 0.024$ll
[1] 439.3686

Comparison

fit_pscl = hurdle(
count ~ persons + livebait,
data = fish,
zero.dist = "binomial",
dist = "negbin"
)

# summary(fit_pscl)\$coefficients
# summary_table
coef B se Z p
pscl X.Intercept. -3.461 0.869 -3.984 0.000
pscl persons 0.941 0.153 6.154 0.000
pscl livebait 1.985 0.639 3.109 0.002
pscl Log.theta. -1.301 0.576 -2.257 0.024
pscl X.Intercept..1 -1.417 0.491 -2.888 0.004
pscl persons.1 0.206 0.117 1.762 0.078
pscl livebait.1 0.711 0.403 1.765 0.077
hurdle_nb_ll logit.(Intercept) -1.417 0.491 -2.888 0.004
hurdle_nb_ll logit.persons 0.206 0.117 1.762 0.078
hurdle_nb_ll logit.livebait 0.711 0.403 1.765 0.077
hurdle_nb_ll NegBin.(Intercept) -3.461 0.869 -3.984 0.000
hurdle_nb_ll NegBin.persons 0.941 0.153 6.154 0.000
hurdle_nb_ll NegBin.livebait 1.985 0.639 3.109 0.002
hurdle_nb_ll theta -1.301 0.576 -2.257 0.024