Nelder-Mead

This is based on the pure Python implementation by François Chollet, also found in the supplemental section. It is mostly an academic exercise on my part, as I'm not sure how often one would use basic Nelder-Mead in practice. In my experience, BFGS and other approaches have been faster, more accurate, and less sensitive to starting values for the types of problems I've played around with, and others who actually spend their time researching such things seem to agree.

There were two issues raised about the original code on GitHub, and I've implemented the suggested corrections with notes. The initial R function is not very R-like, as the goal was to stay close to the original Python, which used a list-based approach, for easier comparison. I also provide a cleaner, more R-like version that uses matrices instead of lists but otherwise sticks to the same approach. For both functions, comparisons are made using the optimx package, but feel free to use base R's optim instead, as sketched below.
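For reference, the optimx calls used throughout translate directly to base R's optim, whose Nelder-Mead control parameters use the same names. A minimal sketch, assuming the objective f and starting values from the first example below:

optim(
  par = c(0, 0, 0),
  fn  = f,
  method  = "Nelder-Mead",
  control = list(
    alpha  = 1,    # reflection
    gamma  = 2,    # expansion
    beta   = 0.5,  # contraction
    maxit  = 1000,
    reltol = 1e-12
  )
)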

Functions

First Version

  • f function to optimize; must return a scalar score and operate over an array of the same dimensions as x_start
  • x_start initial position
  • step look-around radius in the initial step
  • no_improve_thr threshold for what counts as an improvement; see no_improv_break
  • no_improv_break break after no_improv_break iterations with an improvement smaller than no_improve_thr
  • max_iter always break after this number of iterations; set it to 0 to loop indefinitely
  • alpha reflection coefficient of the algorithm (see the Wikipedia page for reference)
  • gamma expansion coefficient of the algorithm (see the Wikipedia page for reference)
  • rho contraction coefficient of the algorithm (see the Wikipedia page for reference)
  • sigma shrink/reduction coefficient of the algorithm (see the Wikipedia page for reference)
  • verbose print iterations?

This function returns the best parameter array and best score.

nelder_mead <- function(
  f, 
  x_start,
  step = 0.1,
  no_improve_thr  = 1e-12,
  no_improv_break = 10,
  max_iter = 0,
  alpha    = 1,
  gamma    = 2,
  rho      = 0.5,
  sigma    = 0.5,
  verbose  = FALSE
  ) {
  # init
  dim = length(x_start)
  prev_best = f(x_start)
  no_improv = 0
  res = list(list(x_start = x_start, prev_best = prev_best))
  
  
  for (i in 1:dim) {
    x = x_start
    x[i]  = x[i] + step
    score = f(x)
    res = append(res, list(list(x_start = x, prev_best = score)))
  }
  
  # simplex iter
  iters = 0
  
  while (TRUE) {
    # order
    idx  = sapply(res, `[[`, 2)
    res  = res[order(idx)]   # ascending order
    best = res[[1]][[2]]
    
    # break after max_iter
    if (max_iter > 0 && iters >= max_iter) return(res[[1]])
    iters = iters + 1
    
    # break after no_improv_break iterations with no improvement
    if (verbose) message(paste('...best so far:', best))
    
    if (best < (prev_best - no_improve_thr)) {
      no_improv = 0
      prev_best = best
    } else {
      no_improv = no_improv + 1
    }
    
    if (no_improv >= no_improv_break) return(res[[1]])
    
    # centroid
    x0 = rep(0, dim)
    for (tup in 1:(length(res)-1)) {
      for (i in 1:dim) {
        x0[i] = x0[i] + res[[tup]][[1]][i] / (length(res)-1)
      }
    }
    
    # reflection
    xr = x0 + alpha * (x0 - res[[length(res)]][[1]])
    rscore = f(xr)
    
    if (res[[1]][[2]] <= rscore && 
        rscore < res[[length(res) - 1]][[2]]) {
      res[[length(res)]] = list(xr, rscore)
      next
    }
    
    # expansion
    if (rscore < res[[1]][[2]]) {
      # xe = x0 + gamma*(x0 - res[[length(res)]][[1]])   # issue noted on GitHub; expand from the reflected point instead
      xe = x0 + gamma * (xr - x0)
      escore = f(xe)
      
      if (escore < rscore) {
        res[[length(res)]] = list(xe, escore)
        next
      } else {
        res[[length(res)]] = list(xr, rscore)
        next
      }
    }
    
    # contraction
    # xc = x0 + rho*(x0 - res[[length(res)]][[1]])  # issue noted on GitHub; inconsistent with Wikipedia's rho values (and optim)
    xc = x0 + rho * (res[[length(res)]][[1]] - x0)
    cscore = f(xc)
    
    if (cscore < res[[length(res)]][[2]]) {
      res[[length(res)]] = list(xc, cscore)
      next
    }
    
    # reduction
    x1   = res[[1]][[1]]
    nres = list()
    
    for (tup in res) {
      redx  = x1 + sigma * (tup[[1]] - x1)
      score = f(redx)
      nres  = append(nres, list(list(redx, score)))
    }
    
    res = nres
  }
  
  res
}
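
Before the fuller example, a quick check on a simple quadratic (purely illustrative) shows the returned list of best parameters and best score:

nelder_mead(function(x) sum((x - c(1, 2))^2), x_start = c(0, 0))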

Example

The function to minimize.

f = function(x) {
  sin(x[1]) * cos(x[2]) * (1 / (abs(x[3]) + 1))
}
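
This has a minimum of -1, attained for example at x1 = -pi/2, x2 = 0, x3 = 0, which gives a quick check on the estimates that follow.

f(c(-pi/2, 0, 0))  # -1, the value the estimates below should approach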

Estimate.

nelder_mead(
  f, 
  c(0, 0, 0), 
  max_iter = 1000, 
  no_improve_thr = 1e-12
)
[[1]]
[1] -1.570797e+00 -2.235577e-07  1.637460e-14

[[2]]
[1] -1

Compare to optimx. You may see warnings.

optimx::optimx(
  par = c(0, 0, 0),
  fn = f,
  method = "Nelder-Mead",
  control = list(
    alpha  = 1,
    gamma  = 2,
    beta   = 0.5,
    maxit  = 1000,
    reltol = 1e-12
  )
)
                   p1           p2           p3 value fevals gevals niter convcode kkt1 kkt2 xtime
Nelder-Mead -1.570796 1.394018e-08 1.088215e-16    -1    861     NA    NA        0 TRUE TRUE     0

A Regression Model

I find a regression model to be more applicable/intuitive for my needs, so I provide an example for that case.

Data Setup

library(tidyverse)

set.seed(8675309)

N = 500
n_preds = 5  # number of predictors
X = cbind(1, matrix(rnorm(N * n_preds), ncol = n_preds))
beta = runif(ncol(X), -1, 1)
y = X %*% beta + rnorm(nrow(X))

Least squares loss function to minimize.

f = function(b) {
  crossprod(y - X %*% b)[, 1]  # [, 1] drops the 1x1 matrix to the scalar optimx needs
}
fit_nm = nelder_mead(
  f, 
  runif(ncol(X)), 
  max_iter = 2000,
  no_improve_thr = 1e-12,
  verbose = FALSE
)
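
Since this is ordinary least squares, the closed-form solution gives a direct check on both the loss function and the NM result; a small sketch (b_ols is just an illustrative name):

b_ols = solve(crossprod(X), crossprod(X, y))[, 1]  # closed-form least squares estimate

f(b_ols)     # loss at the exact solution
fit_nm[[2]]  # loss reached by nelder_mead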

Comparison

Compare to optimx.

fit_nm_optimx = optimx::optimx(
  runif(ncol(X)),
  fn = f,  # model function
  method  = 'Nelder-Mead',
  control = list(
    alpha = 1,
    gamma = 2,
    beta  = 0.5,
    #rho
    maxit  = 2000,
    reltol = 1e-12
  )
)

We’ll add base R lm estimates.
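
A sketch of how those might be obtained, matching the lm.fit approach used with the second version later on:

fit_lm = lm.fit(X, y)$coef        # least squares coefficients
crossprod(y - X %*% fit_lm)[, 1]  # the corresponding loss value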

               p1     p2     p3     p4     p5      p6    value
nm_func    -0.962  0.594  0.049  0.276  0.975  -0.075  501.315
nm_optimx  -0.962  0.594  0.049  0.276  0.975  -0.075  501.315
lm         -0.962  0.594  0.049  0.276  0.975  -0.075  501.315

Second Version

This is, in my opinion, a more natural R approach: the simplex is stored as a matrix, with one row per vertex and the score in the last column, rather than as a list of lists.

nelder_mead2 = function(
  f,
  x_start,
  step = 0.1,
  no_improve_thr  = 1e-12,
  no_improv_break = 10,
  max_iter = 0,
  alpha    = 1,
  gamma    = 2,
  rho      = 0.5,
  sigma    = 0.5,
  verbose  = FALSE
) {
  
  # init
  npar = length(x_start)
  nc = npar + 1
  prev_best = f(x_start)
  no_improv = 0
  res = matrix(c(x_start, prev_best), ncol = nc)
  colnames(res) = c(paste('par', 1:npar, sep = '_'), 'score')
  
  for (i in 1:npar) {
    x     = x_start
    x[i]  = x[i] + step
    score = f(x)
    res   = rbind(res, c(x, score))
  }
  
  # simplex iter
  iters = 0
  
  while (TRUE) {
    # order
    res  = res[order(res[, nc]), ]   # ascending order
    best = res[1, nc]
    
    # break after max_iter
    if (max_iter > 0 && iters >= max_iter) return(res[1, ])
    iters = iters + 1
    
    # break after no_improv_break iterations with no improvement
    if (verbose) message(paste('...best so far:', best))
    
    if (best < (prev_best - no_improve_thr)) {
      no_improv = 0
      prev_best = best
    } else {
      no_improv = no_improv + 1
    }
    
    if (no_improv >= no_improv_break)
      return(res[1, ])
    
    nr = nrow(res)
    
    # centroid: more efficient than previous double loop
    x0 = colMeans(res[(1:npar), -nc])
    
    # reflection
    xr = x0 + alpha * (x0 - res[nr, -nc])
    
    rscore = f(xr)
    
    if (res[1, 'score'] <= rscore && rscore < res[npar, 'score']) {
      res[nr,] = c(xr, rscore)
      next
    }
    
    # expansion
    if (rscore < res[1, 'score']) {
      xe = x0 + gamma * (xr - x0)
      escore = f(xe)
      if (escore < rscore) {
        res[nr, ] = c(xe, escore)
        next
      } else {
        res[nr, ] = c(xr, rscore)
        next
      }
    }
    
    # contraction
    xc = x0 + rho * (res[nr, -nc] - x0)
    
    cscore = f(xc)
    
    if (cscore < res[nr, 'score']) {
      res[nr,] = c(xc, cscore)
      next
    }
    
    # reduction
    x1 = res[1, -nc]
    
    nres = res
    
    for (i in 1:nr) {
      redx  = x1 + sigma * (res[i, -nc] - x1)
      score = f(redx)
      nres[i, ] = c(redx, score)
    }
    
    res = nres
  }
}
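
Since the result is a named vector (the par_* values plus the score), the estimates and objective value can be pulled out directly. A small illustrative check on a quadratic with a known minimum at (1, 2, 3):

fit_quad = nelder_mead2(function(x) sum((x - 1:3)^2), x_start = rep(0, 3))

fit_quad[paste0('par_', 1:3)]  # parameter estimates, approaching 1, 2, 3
fit_quad['score']              # objective value at the solution, near 0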

Example

The function to minimize.

f = function(x) {
  sin(x[1]) * cos(x[2]) * (1 / (abs(x[3]) + 1))
}
nelder_mead2(
  f, 
  c(0, 0, 0), 
  max_iter = 1000, 
  no_improve_thr = 1e-12
)
        par_1         par_2         par_3         score 
-1.570797e+00 -2.235577e-07  1.622809e-14 -1.000000e+00 
optimx::optimx(
  par = c(0, 0, 0), 
  fn = f, 
  method   = "Nelder-Mead",
  control  = list(
    alpha  = 1,
    gamma  = 2,
    beta   = 0.5,
    maxit  = 1000,
    reltol = 1e-12
  )
)
                   p1           p2           p3 value fevals gevals niter convcode kkt1 kkt2 xtime
Nelder-Mead -1.570796 1.394018e-08 1.088215e-16    -1    861     NA    NA        0 TRUE TRUE 0.001

A Regression Model

set.seed(8675309)

N = 500
n_preds = 5
X = cbind(1, matrix(rnorm(N * n_preds), ncol = n_preds))
beta = runif(ncol(X), -1, 1)
y = X %*% beta + rnorm(nrow(X))

Least squares loss function to minimize.

f = function(b) {
  crossprod(y - X %*% b)[, 1]  # [, 1] drops the 1x1 matrix to the scalar optimx needs
}
fit_nm = nelder_mead2(
  f, 
  runif(ncol(X)),
  max_iter = 2000,
  no_improve_thr = 1e-12
)

Comparison

Compare to optimx and lm as before.

fit_nm_optimx = optimx::optimx(
  runif(ncol(X)),
  fn = f,
  method   = 'Nelder-Mead',
  control  = list(
    alpha  = 1,
    gamma  = 2,
    beta   = 0.5,
    maxit  = 2000,
    reltol = 1e-12
  )
)[1:(n_preds + 1)]

fit_lm = lm.fit(X, y)$coef
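
The comparison table can then be assembled along these lines (a sketch; the rendered row and column labels may differ slightly):

round(
  cbind(
    nm_func   = fit_nm[1:(n_preds + 1)],
    nm_optimx = unlist(fit_nm_optimx),
    lm        = fit_lm,
    truth     = beta
  ),
  3
)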
   nm_func  nm_optimx      lm   truth
p1  -0.962     -0.962  -0.962  -0.909
p2   0.594      0.594   0.594   0.620
p3   0.049      0.049   0.049   0.074
p4   0.276      0.276   0.276   0.320
p5   0.975      0.975   0.975   0.956
p6  -0.075     -0.075  -0.075  -0.080