Hurdle Model
Hurdle models are applied to situations in which target data has relatively many of one value, usually zero, to go along with the other observed values. They are two-part models, a logistic model for whether an observation is zero or not, and a count model for the other part. The key distinction from the usual ‘zero-inflated’ count models, is that the count distribution does not contribute to the excess zeros. While the typical application is count data, the approach can be applied to any distribution in theory.
Poisson
Data Setup
Here we import a simple data set. The example comes from the Stata help file for zinb command. One can compare results with hnblogit
command in Stata.
library(tidyverse)
= haven::read_dta("http://www.stata-press.com/data/r11/fish.dta") fish
Function
The likelihood function is of two parts, one a logistic model, the other, a poisson count model.
<- function(y, X, par) {
hurdle_poisson_ll # Extract parameters
= par[grep('logit', names(par))]
logitpars = par[grep('pois', names(par))]
poispars
# Logit model part
= X
Xlogit = ifelse(y == 0, 0, 1)
ylogit
= Xlogit %*% logitpars
LPlogit = plogis(LPlogit)
mulogit
# Calculate the likelihood
= -sum( ylogit*log(mulogit) + (1 - ylogit)*log(1 - mulogit) )
logliklogit
# Poisson part
= X[y > 0, ]
Xpois = y[y > 0]
ypois
= exp(Xpois %*% poispars)
mupois
# Calculate the likelihood
= -mupois
loglik0 = -sum(dpois(ypois, lambda = mupois, log = TRUE)) +
loglikpois sum(log(1 - exp(loglik0)))
# combine likelihoods
= loglikpois + logliklogit
loglik
loglik }
Get some starting values from glm For these functions, and create a named vector for them.
= glm(
init_mod ~ persons + livebait,
count data = fish,
family = poisson,
x = TRUE,
y = TRUE
)
= c(logit = coef(init_mod), pois = coef(init_mod)) starts
Estimation
Use optim. to estimate parameters. I fiddle with some options to reproduce the hurdle function as much as possible.
= optim(
fit par = starts,
fn = hurdle_poisson_ll,
X = init_mod$x,
y = init_mod$y,
control = list(maxit = 5000, reltol = 1e-12),
hessian = TRUE
)# fit
Extract the elements from the output to create a summary table.
= fit$par
B = sqrt(diag(solve(fit$hessian)))
se = B/se
Z = ifelse(Z >= 0, pnorm(Z, lower = FALSE)*2, pnorm(Z)*2)
p = round(data.frame(B, se, Z, p), 3)
summary_table
list(summary = summary_table, ll = fit$value)
$summary
B se Z p
logit.(Intercept) -1.417 0.491 -2.888 0.004
logit.persons 0.206 0.117 1.761 0.078
logit.livebait 0.711 0.403 1.766 0.077
pois.(Intercept) -2.057 0.341 -6.035 0.000
pois.persons 0.750 0.043 17.378 0.000
pois.livebait 1.851 0.307 6.023 0.000
$ll
[1] 882.2514
Comparison
Compare to hurdle from pscl package.
library(pscl)
= hurdle(
fit_pscl ~ persons + livebait,
count data = fish,
zero.dist = "binomial",
dist = "poisson"
)
coef | B | se | Z | p | |
---|---|---|---|---|---|
pscl | X.Intercept. | -2.057 | 0.341 | -6.035 | 0.000 |
pscl | persons | 0.750 | 0.043 | 17.378 | 0.000 |
pscl | livebait | 1.851 | 0.307 | 6.023 | 0.000 |
pscl | X.Intercept..1 | -1.417 | 0.491 | -2.888 | 0.004 |
pscl | persons.1 | 0.206 | 0.117 | 1.762 | 0.078 |
pscl | livebait.1 | 0.711 | 0.403 | 1.765 | 0.077 |
hurdle_poisson_ll | logit.(Intercept) | -1.417 | 0.491 | -2.888 | 0.004 |
hurdle_poisson_ll | logit.persons | 0.206 | 0.117 | 1.761 | 0.078 |
hurdle_poisson_ll | logit.livebait | 0.711 | 0.403 | 1.766 | 0.077 |
hurdle_poisson_ll | pois.(Intercept) | -2.057 | 0.341 | -6.035 | 0.000 |
hurdle_poisson_ll | pois.persons | 0.750 | 0.043 | 17.378 | 0.000 |
hurdle_poisson_ll | pois.livebait | 1.851 | 0.307 | 6.023 | 0.000 |
Negative Binomial
Function
The likelihood function.
<- function(y, X, par) {
hurdle_nb_ll # Extract parameters
= par[grep('logit', names(par))]
logitpars = par[grep('NegBin', names(par))]
NegBinpars
= exp(par[grep('theta', names(par))])
theta
# Logit model part
= X
Xlogit = ifelse(y == 0, 0, 1)
ylogit
= Xlogit%*%logitpars
LPlogit = plogis(LPlogit)
mulogit
# Calculate the likelihood
= -sum( ylogit*log(mulogit) + (1 - ylogit)*log(1 - mulogit) )
logliklogit
#NB part
= X[y > 0, ]
XNB = y[y > 0]
yNB
= exp(XNB %*% NegBinpars)
muNB
# Calculate the likelihood
= dnbinom(0, mu = muNB, size = theta, log = TRUE)
loglik0 = dnbinom(yNB, mu = muNB, size = theta, log = TRUE)
loglik1 = -( sum(loglik1) - sum(log(1 - exp(loglik0))) )
loglikNB
# combine likelihoods
= loglikNB + logliklogit
loglik
loglik }
Estimation
= c(
starts logit = coef(init_mod),
NegBin = coef(init_mod),
theta = 1
)
= optim(
fit_nb par = starts,
fn = hurdle_nb_ll,
X = init_mod$x,
y = init_mod$y,
control = list(maxit = 5000, reltol = 1e-12),
method = "BFGS",
hessian = TRUE
)# fit_nb
= fit_nb$par
B = sqrt(diag(solve(fit_nb$hessian)))
se = B/se
Z = ifelse(Z >= 0, pnorm(Z, lower = FALSE)*2, pnorm(Z)*2)
p
= round(data.frame(B, se, Z, p), 3)
summary_table
list(summary = summary_table, ll = fit_nb$value)
$summary
B se Z p
logit.(Intercept) -1.417 0.491 -2.888 0.004
logit.persons 0.206 0.117 1.762 0.078
logit.livebait 0.711 0.403 1.765 0.077
NegBin.(Intercept) -3.461 0.869 -3.984 0.000
NegBin.persons 0.941 0.153 6.154 0.000
NegBin.livebait 1.985 0.639 3.109 0.002
theta -1.301 0.576 -2.257 0.024
$ll
[1] 439.3686
Comparison
= hurdle(
fit_pscl ~ persons + livebait,
count data = fish,
zero.dist = "binomial",
dist = "negbin"
)
# summary(fit_pscl)$coefficients
# summary_table
coef | B | se | Z | p | |
---|---|---|---|---|---|
pscl | X.Intercept. | -3.461 | 0.869 | -3.984 | 0.000 |
pscl | persons | 0.941 | 0.153 | 6.154 | 0.000 |
pscl | livebait | 1.985 | 0.639 | 3.109 | 0.002 |
pscl | Log.theta. | -1.301 | 0.576 | -2.257 | 0.024 |
pscl | X.Intercept..1 | -1.417 | 0.491 | -2.888 | 0.004 |
pscl | persons.1 | 0.206 | 0.117 | 1.762 | 0.078 |
pscl | livebait.1 | 0.711 | 0.403 | 1.765 | 0.077 |
hurdle_nb_ll | logit.(Intercept) | -1.417 | 0.491 | -2.888 | 0.004 |
hurdle_nb_ll | logit.persons | 0.206 | 0.117 | 1.762 | 0.078 |
hurdle_nb_ll | logit.livebait | 0.711 | 0.403 | 1.765 | 0.077 |
hurdle_nb_ll | NegBin.(Intercept) | -3.461 | 0.869 | -3.984 | 0.000 |
hurdle_nb_ll | NegBin.persons | 0.941 | 0.153 | 6.154 | 0.000 |
hurdle_nb_ll | NegBin.livebait | 1.985 | 0.639 | 3.109 | 0.002 |
hurdle_nb_ll | theta | -1.301 | 0.576 | -2.257 | 0.024 |
Source
Original code available at https://github.com/m-clark/Miscellaneous-R-Code/blob/master/ModelFitting/hurdle.R