Hurdle Model
Hurdle models are applied to situations in which target data has relatively many of one value, usually zero, to go along with the other observed values. They are two-part models, a logistic model for whether an observation is zero or not, and a count model for the other part. The key distinction from the usual ‘zero-inflated’ count models, is that the count distribution does not contribute to the excess zeros. While the typical application is count data, the approach can be applied to any distribution in theory.
Poisson
Data Setup
Here we import a simple data set. The example comes from the Stata help file for zinb command. One can compare results with hnblogit command in Stata.
library(tidyverse)
fish = haven::read_dta("http://www.stata-press.com/data/r11/fish.dta")Function
The likelihood function is of two parts, one a logistic model, the other, a poisson count model.
hurdle_poisson_ll <- function(y, X, par) {
# Extract parameters
logitpars = par[grep('logit', names(par))]
poispars = par[grep('pois', names(par))]
# Logit model part
Xlogit = X
ylogit = ifelse(y == 0, 0, 1)
LPlogit = Xlogit %*% logitpars
mulogit = plogis(LPlogit)
# Calculate the likelihood
logliklogit = -sum( ylogit*log(mulogit) + (1 - ylogit)*log(1 - mulogit) )
# Poisson part
Xpois = X[y > 0, ]
ypois = y[y > 0]
mupois = exp(Xpois %*% poispars)
# Calculate the likelihood
loglik0 = -mupois
loglikpois = -sum(dpois(ypois, lambda = mupois, log = TRUE)) +
sum(log(1 - exp(loglik0)))
# combine likelihoods
loglik = loglikpois + logliklogit
loglik
}Get some starting values from glm For these functions, and create a named vector for them.
init_mod = glm(
count ~ persons + livebait,
data = fish,
family = poisson,
x = TRUE,
y = TRUE
)
starts = c(logit = coef(init_mod), pois = coef(init_mod)) Estimation
Use optim. to estimate parameters. I fiddle with some options to reproduce the hurdle function as much as possible.
fit = optim(
par = starts,
fn = hurdle_poisson_ll,
X = init_mod$x,
y = init_mod$y,
control = list(maxit = 5000, reltol = 1e-12),
hessian = TRUE
)
# fitExtract the elements from the output to create a summary table.
B = fit$par
se = sqrt(diag(solve(fit$hessian)))
Z = B/se
p = ifelse(Z >= 0, pnorm(Z, lower = FALSE)*2, pnorm(Z)*2)
summary_table = round(data.frame(B, se, Z, p), 3)
list(summary = summary_table, ll = fit$value)$summary
B se Z p
logit.(Intercept) -1.417 0.491 -2.888 0.004
logit.persons 0.206 0.117 1.761 0.078
logit.livebait 0.711 0.403 1.766 0.077
pois.(Intercept) -2.057 0.341 -6.035 0.000
pois.persons 0.750 0.043 17.378 0.000
pois.livebait 1.851 0.307 6.023 0.000
$ll
[1] 882.2514
Comparison
Compare to hurdle from pscl package.
library(pscl)
fit_pscl = hurdle(
count ~ persons + livebait,
data = fish,
zero.dist = "binomial",
dist = "poisson"
)| coef | B | se | Z | p | |
|---|---|---|---|---|---|
| pscl | X.Intercept. | -2.057 | 0.341 | -6.035 | 0.000 |
| pscl | persons | 0.750 | 0.043 | 17.378 | 0.000 |
| pscl | livebait | 1.851 | 0.307 | 6.023 | 0.000 |
| pscl | X.Intercept..1 | -1.417 | 0.491 | -2.888 | 0.004 |
| pscl | persons.1 | 0.206 | 0.117 | 1.762 | 0.078 |
| pscl | livebait.1 | 0.711 | 0.403 | 1.765 | 0.077 |
| hurdle_poisson_ll | logit.(Intercept) | -1.417 | 0.491 | -2.888 | 0.004 |
| hurdle_poisson_ll | logit.persons | 0.206 | 0.117 | 1.761 | 0.078 |
| hurdle_poisson_ll | logit.livebait | 0.711 | 0.403 | 1.766 | 0.077 |
| hurdle_poisson_ll | pois.(Intercept) | -2.057 | 0.341 | -6.035 | 0.000 |
| hurdle_poisson_ll | pois.persons | 0.750 | 0.043 | 17.378 | 0.000 |
| hurdle_poisson_ll | pois.livebait | 1.851 | 0.307 | 6.023 | 0.000 |
Negative Binomial
Function
The likelihood function.
hurdle_nb_ll <- function(y, X, par) {
# Extract parameters
logitpars = par[grep('logit', names(par))]
NegBinpars = par[grep('NegBin', names(par))]
theta = exp(par[grep('theta', names(par))])
# Logit model part
Xlogit = X
ylogit = ifelse(y == 0, 0, 1)
LPlogit = Xlogit%*%logitpars
mulogit = plogis(LPlogit)
# Calculate the likelihood
logliklogit = -sum( ylogit*log(mulogit) + (1 - ylogit)*log(1 - mulogit) )
#NB part
XNB = X[y > 0, ]
yNB = y[y > 0]
muNB = exp(XNB %*% NegBinpars)
# Calculate the likelihood
loglik0 = dnbinom(0, mu = muNB, size = theta, log = TRUE)
loglik1 = dnbinom(yNB, mu = muNB, size = theta, log = TRUE)
loglikNB = -( sum(loglik1) - sum(log(1 - exp(loglik0))) )
# combine likelihoods
loglik = loglikNB + logliklogit
loglik
}Estimation
starts = c(
logit = coef(init_mod),
NegBin = coef(init_mod),
theta = 1
)
fit_nb = optim(
par = starts,
fn = hurdle_nb_ll,
X = init_mod$x,
y = init_mod$y,
control = list(maxit = 5000, reltol = 1e-12),
method = "BFGS",
hessian = TRUE
)
# fit_nb
B = fit_nb$par
se = sqrt(diag(solve(fit_nb$hessian)))
Z = B/se
p = ifelse(Z >= 0, pnorm(Z, lower = FALSE)*2, pnorm(Z)*2)
summary_table = round(data.frame(B, se, Z, p), 3)
list(summary = summary_table, ll = fit_nb$value)$summary
B se Z p
logit.(Intercept) -1.417 0.491 -2.888 0.004
logit.persons 0.206 0.117 1.762 0.078
logit.livebait 0.711 0.403 1.765 0.077
NegBin.(Intercept) -3.461 0.869 -3.984 0.000
NegBin.persons 0.941 0.153 6.154 0.000
NegBin.livebait 1.985 0.639 3.109 0.002
theta -1.301 0.576 -2.257 0.024
$ll
[1] 439.3686
Comparison
fit_pscl = hurdle(
count ~ persons + livebait,
data = fish,
zero.dist = "binomial",
dist = "negbin"
)
# summary(fit_pscl)$coefficients
# summary_table| coef | B | se | Z | p | |
|---|---|---|---|---|---|
| pscl | X.Intercept. | -3.461 | 0.869 | -3.984 | 0.000 |
| pscl | persons | 0.941 | 0.153 | 6.154 | 0.000 |
| pscl | livebait | 1.985 | 0.639 | 3.109 | 0.002 |
| pscl | Log.theta. | -1.301 | 0.576 | -2.257 | 0.024 |
| pscl | X.Intercept..1 | -1.417 | 0.491 | -2.888 | 0.004 |
| pscl | persons.1 | 0.206 | 0.117 | 1.762 | 0.078 |
| pscl | livebait.1 | 0.711 | 0.403 | 1.765 | 0.077 |
| hurdle_nb_ll | logit.(Intercept) | -1.417 | 0.491 | -2.888 | 0.004 |
| hurdle_nb_ll | logit.persons | 0.206 | 0.117 | 1.762 | 0.078 |
| hurdle_nb_ll | logit.livebait | 0.711 | 0.403 | 1.765 | 0.077 |
| hurdle_nb_ll | NegBin.(Intercept) | -3.461 | 0.869 | -3.984 | 0.000 |
| hurdle_nb_ll | NegBin.persons | 0.941 | 0.153 | 6.154 | 0.000 |
| hurdle_nb_ll | NegBin.livebait | 1.985 | 0.639 | 3.109 | 0.002 |
| hurdle_nb_ll | theta | -1.301 | 0.576 | -2.257 | 0.024 |
Source
Original code available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/hurdle.R