Newton and IRLS

Here we demonstrate Newton’s and Iterated Reweighted Least Squares approaches with a logistic regression model.

For the following, I had Murphy’s PML text open and more or less followed the algorithms in chapter 8. Note that for Newton’s method, this doesn’t implement a line search to find a more optimal stepsize at a given iteration.

Data Setup

Predict graduate school admission based on gre, gpa, and school rank (higher is more prestige). See corresponding demo here. The only difference is that I treat rank as numeric rather than categorical. We will be comparing results to base R glm function, so I will use it to create the model data.

library(tidyverse)

admit = haven::read_dta('https://stats.idre.ucla.edu/stat/stata/dae/binary.dta')

fit_glm = glm(admit ~ gre + gpa + rank, data = admit, family = binomial)

# summary(fit_glm)

X = model.matrix(fit_glm)
y = fit_glm$y

Functions

Newton’s Method

newton <- function(
  X,
  y,
  tol  = 1e-12,
  iter = 500,
  stepsize = .5
  ) {
  
  # Args: 
  # X: model matrix
  # y: target
  # tol: tolerance
  # iter: maximum number of iterations
  # stepsize: (0, 1)
  
  # intialize
  int     = log(mean(y) / (1 - mean(y)))         # intercept
  beta    = c(int, rep(0, ncol(X) - 1))
  currtol = 1
  it = 0
  ll = 0
  
  while (currtol > tol && it < iter) {
    it = it +1
    ll_old = ll
    
    mu = plogis(X %*% beta)[,1]
    g  = crossprod(X, mu-y)               # gradient
    S  = diag(mu*(1-mu)) 
    H  = t(X) %*% S %*% X                 # hessian
    beta = beta - stepsize * solve(H) %*% g

    ll = sum(dbinom(y, prob = mu, size = 1, log = TRUE))
    currtol = abs(ll - ll_old)
  }
  
  list(
    beta = beta,
    iter = it,
    tol  = currtol,
    loglik = ll
  )
}

IRLS

Note that glm is actually using IRLS, so the results from this should be fairly spot on.

irls <- function(X, y, tol = 1e-12, iter = 500) {
  
  # intialize
  int  = log(mean(y) / (1 - mean(y)))   # intercept
  beta = c(int, rep(0, ncol(X) - 1))
  currtol = 1
  it = 0
  ll = 0
  
  while (currtol > tol && it < iter) {
    it = it + 1
    ll_old = ll
    
    eta  = X %*% beta
    mu   = plogis(eta)[,1]
    s    = mu * (1 - mu)
    S    = diag(s)
    z    = eta + (y - mu)/s
    beta = solve(t(X) %*% S %*% X) %*% (t(X) %*% (S %*% z))
    
    ll = sum(
      dbinom(
        y,
        prob = plogis(X %*% beta),
        size = 1,
        log  = TRUE
      )
    )
    
    currtol = abs(ll - ll_old)
  }
  
  list(
    beta = beta,
    iter = it,
    tol  = currtol,
    loglik  = ll,
    weights = plogis(X %*% beta) * (1 - plogis(X %*% beta))
  )
}

Estimation

fit_newton = newton(
  X = X,
  y = y,
  stepsize = .9,
  tol = 1e-8      # tol set to 1e-8 as in glm default
) 

fit_newton
$beta
                    [,1]
(Intercept) -3.449548577
gre          0.002293959
gpa          0.777013649
rank        -0.560031371

$iter
[1] 8

$tol
[1] 2.581544e-10

$loglik
[1] -229.7209
# fit_glm

tol set to 1e-8 as in glm default.

irls_result = irls(X = X, y = y, tol = 1e-8) 

str(irls_result)
List of 5
 $ beta   : num [1:4, 1] -3.44955 0.00229 0.77701 -0.56003
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:4] "(Intercept)" "gre" "gpa" "rank"
  .. ..$ : NULL
 $ iter   : num 4
 $ tol    : num 6e-09
 $ loglik : num -230
 $ weights: num [1:400, 1] 0.1536 0.2168 0.2026 0.1268 0.0884 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:400] "1" "2" "3" "4" ...
  .. ..$ : NULL
# fit_glm

Comparison

Compare all results.

beta1 beta2 beta3 beta4 iter tol loglik
newton -3.45 0.002 0.777 -0.56 8 0 -229.721
irls -3.45 0.002 0.777 -0.56 4 0 -229.721
glm_default -3.45 0.002 0.777 -0.56 4 NA -229.721

Compare weights between the irls and glm results.

head(cbind(irls_result$weights, fit_glm$weights))
        [,1]       [,2]
1 0.15362250 0.15362250
2 0.21679615 0.21679615
3 0.20255723 0.20255724
4 0.12676333 0.12676334
5 0.08835918 0.08835918
6 0.23528108 0.23528108