This is based on the pure Python implementation by François Chollet, also found in the supplemental section. This is mostly just an academic exercise on my part. I’m not sure how much one would use the basic NM for many situations. In my experience BFGS and other approaches would be faster, more accurate, and less sensitive to starting values for the types of problems I’ve played around with. Others who actually spend their time researching such things seem to agree.

There were two issues regarding the original code on GitHub, and I’ve implemented the suggested corrections with notes. The initial R function code is not very R-like, as the goal was to keep it more similar to the original Python for comparison, which used a list approach. I also provide a more R-like/cleaner version that uses matrices instead of lists, but which still sticks to the same approach for the most part. For both functions, comparisons are made using the optimx package, but feel free to use base R’s optim instead.

## Functions

### First Version

• f function to optimize, must return a scalar score and operate over an array of the same dimensions as x_start
• x_start initial position
• step look-around radius in initial step
• no_improve_thr See no_improv_break
• no_improv_break break after no_improv_break iterations with an improvement lower than no_improve_thr
• max_iter always break after this number of iterations. Set it to 0 to loop indefinitely.
• alpha parameters of the algorithm (see Wikipedia page for reference)
• gamma parameters of the algorithm (see Wikipedia page for reference)
• rho parameters of the algorithm (see Wikipedia page for reference)
• sigma parameters of the algorithm (see Wikipedia page for reference)
• verbose Print iterations?

This function returns the best parameter array and best score.

nelder_mead <- function(
f,
x_start,
step = 0.1,
no_improve_thr  = 1e-12,
no_improv_break = 10,
max_iter = 0,
alpha    = 1,
gamma    = 2,
rho      = 0.5,
sigma    = 0.5,
verbose  = FALSE
) {
# List-based Nelder-Mead minimizer (mirrors the original Python version).
# Returns a list: [[1]] best parameter vector, [[2]] best score.

# init: simplex of length(x_start) + 1 vertices, each list(params, score)
npar      <- length(x_start)   # renamed from `dim` to avoid masking base::dim
prev_best <- f(x_start)
no_improv <- 0
res       <- list(list(x_start, prev_best))

for (i in seq_len(npar)) {
x    <- x_start
x[i] <- x[i] + step            # perturb one coordinate by `step`
res  <- append(res, list(list(x, f(x))))
}

# simplex iter
iters <- 0

while (TRUE) {
# order vertices by score, ascending (best first)
# fix: `[[` must be backquoted when passed as a function
scores <- vapply(res, `[[`, numeric(1), 2)
res    <- res[order(scores)]
best   <- res[[1]][[2]]

# break after max_iter iterations (0 means no cap); `&&` for scalar logic
if (max_iter > 0 && iters >= max_iter) return(res[[1]])
iters <- iters + 1

if (verbose) message('...best so far: ', best)

# break after no_improv_break iterations with no improvement
if (best < (prev_best - no_improve_thr)) {
no_improv <- 0
prev_best <- best
} else {
no_improv <- no_improv + 1
}

if (no_improv >= no_improv_break) return(res[[1]])

n <- length(res)

# centroid of every vertex except the worst
x0 <- Reduce(`+`, lapply(res[-n], `[[`, 1)) / (n - 1)

# reflection
xr     <- x0 + alpha * (x0 - res[[n]][[1]])
rscore <- f(xr)
if (res[[1]][[2]] <= rscore && rscore < res[[n - 1]][[2]]) {
res[[n]] <- list(xr, rscore)
next
}

# expansion (Wikipedia-consistent form: expand past the reflected point)
if (rscore < res[[1]][[2]]) {
xe     <- x0 + gamma * (xr - x0)
escore <- f(xe)
if (escore < rscore) {
res[[n]] <- list(xe, escore)
} else {
res[[n]] <- list(xr, rscore)
}
next
}

# contraction (toward the worst vertex, consistent with optim's rho)
xc     <- x0 + rho * (res[[n]][[1]] - x0)
cscore <- f(xc)
if (cscore < res[[n]][[2]]) {
res[[n]] <- list(xc, cscore)
next
}

# reduction: shrink every vertex toward the best one
x1  <- res[[1]][[1]]
res <- lapply(res, function(v) {
redx <- x1 + sigma * (v[[1]] - x1)
list(redx, f(redx))
})
}
# note: loop only exits via return(); the old trailing `res` was unreachable
}

### Example

The function to minimize.

f = function(x) {
  # Test surface: sin * cos damped by 1 / (|x3| + 1); global minimum is -1
  # (e.g. at x = (-pi/2, 0, 0)).
  s = sin(x[1])
  k = cos(x[2])
  s * k * (1 / (abs(x[3]) + 1))
}

Estimate.

# Minimize the example function starting from the origin; returns the best
# parameter vector and best score (should approach -1 near (-pi/2, 0, 0)).
nelder_mead(
f,
c(0, 0, 0),
max_iter = 1000,
no_improve_thr = 1e-12
)
[[1]]
[1] -1.570797e+00 -2.235577e-07  1.637460e-14

[[2]]
[1] -1

Compare to optimx. You may see warnings.

# Reference fit via optimx's Nelder-Mead with matching tuning constants.
# Note optimx calls the contraction parameter 'beta' (our 'rho').
optimx::optimx(
par = c(0, 0, 0),
fn = f,
control = list(
alpha  = 1,
gamma  = 2,
beta   = 0.5,
maxit  = 1000,
reltol = 1e-12
)
)
                   p1           p2           p3 value fevals gevals niter convcode kkt1 kkt2 xtime
Nelder-Mead -1.570796 1.394018e-08 1.088215e-16    -1    861     NA    NA        0 TRUE TRUE     0

### A Regression Model

I find a regression model to be more applicable/intuitive for my needs, so provide an example for that case.

#### Data Setup

library(tidyverse)

set.seed(8675309)  # fixed seed so the simulated data is reproducible

N = 500
n_preds = 5  # number of predictors
X = cbind(1, matrix(rnorm(N * n_preds), ncol = n_preds))  # design matrix with intercept column
beta = runif(ncol(X), -1, 1)  # true coefficients drawn uniformly from [-1, 1]
y = X %*% beta + rnorm(nrow(X))  # response with standard normal noise

Least squares loss function to minimize.

f = function(b) {
  # Residual sum of squares for the linear model y ~ X b.
  # [, 1] collapses the 1x1 crossprod matrix to a scalar (optimx needs one).
  resid = y - X %*% b
  crossprod(resid)[, 1]
}
# Fit the regression by minimizing the least-squares loss with our NM.
fit_nm = nelder_mead(
f,
runif(ncol(X)),  # random starting values
max_iter = 2000,
no_improve_thr = 1e-12,
verbose = FALSE
)

#### Comparison

Compare to optimx.

# Same fit via optimx for comparison.
fit_nm_optimx = optimx::optimx(
runif(ncol(X)),
fn = f,  # model function
control = list(
alpha = 1,
gamma = 2,
beta  = 0.5,
# optimx's 'beta' corresponds to 'rho' in nelder_mead()
maxit  = 2000,
reltol = 1e-12
)
)

We’ll add base R lm estimates.

p1 p2 p3 p4 p5 p6 value
nm_func -0.962 0.594 0.049 0.276 0.975 -0.075 501.315
nm_optimx -0.962 0.594 0.049 0.276 0.975 -0.075 501.315
lm -0.962 0.594 0.049 0.276 0.975 -0.075 501.315

## Second Version

This is a more natural R approach in my opinion.

nelder_mead2 = function(
f,
x_start,
step = 0.1,
no_improve_thr  = 1e-12,
no_improv_break = 10,
max_iter = 0,
alpha    = 1,
gamma    = 2,
rho      = 0.5,
sigma    = 0.5,
verbose  = FALSE
) {
# Matrix-based Nelder-Mead minimizer. Each row of `res` is one simplex
# vertex: npar parameter columns plus a final 'score' column. Returns the
# best row as a named vector (par_1, ..., par_npar, score).

# init
npar = length(x_start)
nc   = npar + 1                       # index of the score column
prev_best = f(x_start)
no_improv = 0
res = matrix(c(x_start, prev_best), ncol = nc)
colnames(res) = c(paste('par', seq_len(npar), sep = '_'), 'score')

for (i in seq_len(npar)) {
x     = x_start
x[i]  = x[i] + step                   # perturb one coordinate by `step`
score = f(x)
res   = rbind(res, c(x, score))
}

# simplex iter
iters = 0

while (TRUE) {
# order vertices by score, ascending (best first); drop = FALSE keeps a
# matrix even in the single-parameter case
res  = res[order(res[, nc]), , drop = FALSE]
best = res[1, nc]

# break after max_iter iterations (0 means no cap); `&&` for scalar logic
if (max_iter > 0 && iters >= max_iter) return(res[1, ])
iters = iters + 1

# break after no_improv_break iterations with no improvement
if (verbose) message(paste('...best so far:', best))

if (best < (prev_best - no_improve_thr)) {
no_improv = 0
prev_best = best
} else {
no_improv = no_improv + 1
}

if (no_improv >= no_improv_break)
return(res[1, ])

nr = nrow(res)

# centroid of all vertices except the worst; drop = FALSE so colMeans
# still receives a matrix when npar == 1
x0 = colMeans(res[seq_len(npar), -nc, drop = FALSE])

# reflection
xr = x0 + alpha * (x0 - res[nr, -nc])

rscore = f(xr)

# accept the reflected point if it is neither best nor worst; row npar
# (= nr - 1) holds the second-worst vertex
if (res[1, 'score'] <= rscore && rscore < res[npar, 'score']) {
res[nr, ] = c(xr, rscore)
next
}

# expansion: push further past the reflected point
if (rscore < res[1, 'score']) {
xe = x0 + gamma * (xr - x0)
escore = f(xe)
if (escore < rscore) {
res[nr, ] = c(xe, escore)
} else {
res[nr, ] = c(xr, rscore)
}
next
}

# contraction toward the worst vertex
xc = x0 + rho * (res[nr, -nc] - x0)

cscore = f(xc)

if (cscore < res[nr, 'score']) {
res[nr, ] = c(xc, cscore)
next
}

# reduction: shrink toward the best vertex. Row 1 maps to itself
# (x1 + sigma * 0 = x1), so start at row 2 and skip a redundant f() call.
x1 = res[1, -nc]

for (i in 2:nr) {
redx = x1 + sigma * (res[i, -nc] - x1)
res[i, ] = c(redx, f(redx))
}
}
}

### Example

The function to minimize.

f = function(x) {
  # Same test surface as the first example: sin * cos shrunk by a factor
  # of 1 / (|x3| + 1); minimum value -1 at (-pi/2, 0, 0).
  damp = 1 / (abs(x[3]) + 1)
  sin(x[1]) * cos(x[2]) * damp
}
# Run the matrix-based version on the same test function; returns a named
# vector of parameter estimates plus the best score.
nelder_mead2(
f,
c(0, 0, 0),
max_iter = 1000,
no_improve_thr = 1e-12
)
        par_1         par_2         par_3         score
-1.570797e+00 -2.235577e-07  1.622809e-14 -1.000000e+00 
# Reference fit via optimx's Nelder-Mead ('beta' is our 'rho').
optimx::optimx(
par = c(0, 0, 0),
fn = f,
control  = list(
alpha  = 1,
gamma  = 2,
beta   = 0.5,
maxit  = 1000,
reltol = 1e-12
)
)
                   p1           p2           p3 value fevals gevals niter convcode kkt1 kkt2 xtime
Nelder-Mead -1.570796 1.394018e-08 1.088215e-16    -1    861     NA    NA        0 TRUE TRUE 0.001

### A Regression Model

# Recreate the simulated regression data (same seed as the first example).
set.seed(8675309)

N = 500
n_preds = 5  # number of predictors
X = cbind(1, matrix(rnorm(N * n_preds), ncol = n_preds))  # design matrix with intercept
beta = runif(ncol(X), -1, 1)  # true coefficients
y = X %*% beta + rnorm(nrow(X))  # response with standard normal noise

Least squares loss function to minimize.

f = function(b) {
  # Sum of squared residuals for the regression; [, 1] drops the 1x1
  # matrix produced by crossprod to the scalar optimx requires.
  err = y - X %*% b
  crossprod(err)[, 1]
}
# Fit the regression with the matrix-based implementation.
fit_nm = nelder_mead2(
f,
runif(ncol(X)),  # random starting values
max_iter = 2000,
no_improve_thr = 1e-12
)

#### Comparison

Compare to optimx and lm as before.

# optimx comparison; trailing [1:(n_preds + 1)] keeps just the parameter
# estimate columns of the result.
fit_nm_optimx = optimx::optimx(
runif(ncol(X)),
fn = f,
control  = list(
alpha  = 1,
gamma  = 2,
beta   = 0.5,
maxit  = 2000,
reltol = 1e-12
)
)[1:(n_preds + 1)]

# Base-R least-squares coefficients for comparison.
# Fix: the original '\$' was a markdown-escaping artifact; R uses plain '$'.
fit_lm = lm.fit(X, y)$coef
nm_func nm_optimx lm truth
p1 -0.962 -0.962 -0.962 -0.909
p2 0.594 0.594 0.594 0.620
p3 0.049 0.049 0.049 0.074
p4 0.276 0.276 0.276 0.320
p5 0.975 0.975 0.975 0.956
p6 -0.075 -0.075 -0.075 -0.080