# Reproducing Kernel Hilbert Space Regression

This R code is based on Reproducing Kernel Hilbert Spaces for Penalized Regression: A tutorial, Nosedal-Sanchez et al. (2010), specifically, their code in the supplemental section. The original code had several issues as far as general R programming practices, and eventually appears to have been replaced in publication at some point, as did most of the corresponding supplemental text. I can no longer locate the original, so now follow the published code. The original data I was following was also replaced by the longley and mcycle data sets.

To start, we will use an example for ridge regression, followed by a spline example.

## Data Setup

library(tidyverse)

data(longley)    # avaiable in base R

y = longley[,1]
X = as.matrix(longley[,2:7])
X = apply(X, 2, scales::rescale, to = c(0, 1))

## Functions

### Function to find the inverse of a matrix

We can use base::solve, but this function avoids a computationally singular result.

inverse <- function(X, eps = 1e-12) {
eig.X = eigen(X, symmetric = TRUE)
P = eig.X[]
lambda = eig.X[]

ind = lambda > eps

lambda[ind]  = 1/lambda[ind]
lambda[!ind] = 0

P %*% diag(lambda) %*% t(P)
}

### Reproducing Kernel

rk <- function(s, t) {
init_len = length(s)
rk = 0

for (i in 1:init_len)
rk = s[i]*t[i] + rk

rk
} 

### Gram matrix

For the first example involving ridge regression, the gram function just produces tcrossprod(X). I generalize it in case a different kernel is desired, and add that as an additional argument. This will avoid having to redo the function later.

gram <- function(X, rkfunc = rk) {
apply(X, 1, function(Row)
apply(X, 1, function(tRow) rkfunc(Row, tRow))
)
}

### Ridge regression

ridge <- function(X, y, lambda) {
Gramm = gram(X)                                 # Gramm matrix (nxn)

n = length(y)
Q = cbind(1, Gramm)                             # design matrix
S = rbind(0, cbind(0, Gramm))

M = crossprod(Q) + lambda*S
M_inv = inverse(M)                              # inverse of M

gamma_hat = crossprod(M_inv, crossprod(Q, y))
f_hat = Q %*% gamma_hat

A = Q %*% M_inv %*% t(Q)
tr_A = sum(diag(A))                             # trace of hat matrix

rss  = crossprod(y - f_hat)                     # residual sum of squares
gcv  = n*rss / (n - tr_A)^2                     # obtain GCV score

list(
f_hat = f_hat,
gamma_hat = gamma_hat,
beta_hat = c(gamma_hat, crossprod(gamma_hat[-1], X)),
gcv = gcv
)
}

## Estimation

A simple direct search for the GCV optimal smoothing parameter can be made as follows:

lambda = 10^seq(-6, 0, by = .1)

gcv_search = map(lambda, function(lam) ridge(X, y, lam))

V = map_dbl(gcv_search, function(x) x$gcv) ridge_coefs = map_df( gcv_search, function(x) data.frame( value = x$beta_hat[-1],
coef  = colnames(X)
),
.id = 'iter'
) %>%
mutate(lambda = lambda[as.integer(iter)])

Compare with Figure 3 in the article.

gcv_plot = qplot(
lambda,
V,
geom = 'line',
main = 'GCV score',
ylab = 'GCV'
) +
scale_x_log10()

beta_plot = ridge_coefs %>%
ggplot(aes(x = lambda, y = value, color = coef)) +
geom_line() +
scale_x_log10() +
scico::scale_color_scico_d(end = .8) +
labs(title = 'Betas Across Lambda') Pick the best model and obtain the estimates.

fit_ridge = ridge(X, y, lambda[which.min(V)])  # fit optimal model

gamma_hat = fit_ridge$gamma_hat beta_0 = fit_ridge$gamma_hat             # intercept
beta_hat  = crossprod(gamma_hat[-1,], X)         # slope and noise term coefficients

## Comparison

I add a comparison to glmnet, where setting alpha = 0 is equivalent to ridge regression.

c(beta_0, beta_hat)
  82.7840043  54.1683427   5.3640251   1.3781910 -28.7948627   5.3956341  -0.6095799
fit_glmnet = glmnet::glmnet(
X,
y,
alpha  = 0,
lambda = lambda,
standardize = FALSE
)
(Intercept) GNP Unemployed Armed.Forces Population Year Employed
fit_ridge 82.784 54.168 5.364 1.378 -28.795 5.396 -0.610
fit_glmnet 82.329 69.783 6.963 1.245 -37.323 -1.497 -1.606

## Example: Cubic Spline

### Data Setup

For this example we’ll use the MASS::mcycle data.

x = as.matrix(MASS::mcycle$times) x = scales::rescale(x, to = c(0, 1)) # rescale predictor to [0,1] y = MASS::mcycle$accel

### Functions

#### Reproducing Kernel

rk_spline <- function(s, t) {
return(.5 * min(s, t)^2 * max(s, t) - (1/6) * min(s, t)^3)
}

No need to redo the gram function do to previous change that accepts the kernel as an argument

#### Smoothing Spline

smoothing_spline <- function(X, y, lambda) {
Gramm = gram(X, rkfunc = rk_spline)   # Gramm matrix (nxn)

n = length(y)
J = cbind(1, X)                       # matrix with a basis for the null space of the penalty
Q = cbind(J, Gramm)                   # design matrix
m = ncol(J)                           # dimension of the null space of the penalty

S = matrix(0, n + m, n + m)                        # initialize S
S[(m + 1):(n + m), (m + 1):(n + m)] = Gramm        # non-zero part of S

M = crossprod(Q) + lambda*S
M_inv = inverse(M)                                 # inverse of M

gamma_hat = crossprod(M_inv, crossprod(Q, y))
f_hat = Q %*% gamma_hat

A    = Q %*% M_inv %*% t(Q)
tr_A = sum(diag(A))                                # trace of hat matrix

rss = crossprod(y - f_hat)                         # residual sum of squares
gcv = n * rss/(n - tr_A)^2                         # obtain GCV score

list(
f_hat = f_hat,
gamma_hat = gamma_hat,
gcv = gcv
)
}

### Estimation

We’ll now get fits for multiple lambda values.

lambda = 10^seq(-6, 0, by = .1)

gcv_search = map(lambda, function(lam) smoothing_spline(x, y, lam))

V = map_dbl(gcv_search, function(x) x$gcv) Plot of GCV across different lambdas. ### Comparison I’ve added comparison to an additive model using mgcv. Compare the result to Figure 2 of the Supplementary Material. fit_rk = smoothing_spline(x, y, lambda[which.min(V)]) # fit optimal model fit_gam = mgcv::gam(y ~ s(x)) ## Source Original code available at: https://github.com/m-clark/Miscellaneous-R-Code/tree/master/ModelFitting/RKHSReg ### Current Supplemental Code You can peruse the supplemental section that shows the R code here. ### Original Supplemental Code This was the old original code from the supplemental section for the article, but was changed at some point (I can’t remember, it may have been at one of the author’s website). The R code on the repo follows these examples, while this document follows the currently accessible supplementary material. I used RStudio’s default cleanup to make the code a little easier to read, and maybe added a little spacing, but otherwise it is identical to what’s in the linked paper. #### A.1 ###### Data ######## set.seed(3) n <- 20 x1 <- runif(n) x2 <- runif(n) X <- matrix(c(x1, x2), ncol = 2) # design matrix y <- 2 + 3 * x1 + rnorm(n, sd = 0.25) ##### function to find the inverse of a matrix #### my.inv <- function(X, eps = 1e-12) { eig.X <- eigen(X, symmetric = T) P <- eig.X[] lambda <- eig.X[] ind <- lambda > eps lambda[ind] <- 1 / lambda[ind] lambda[!ind] <- 0 ans <- P %*% diag(lambda, nrow = length(lambda)) %*% t(P) return(ans) } ###### Reproducing Kernel ######### rk <- function(s, t) { p <- length(s) rk <- 0 for (i in 1:p) { rk <- s[i] * t[i] + rk } return((rk)) } ##### Gram matrix ####### get.gramm <- function(X) { n <- dim(X) Gramm <- matrix(0, n, n) #initializes Gramm array #i=index for rows #j=index for columns Gramm<-as.matrix(Gramm) # Gramm matrix for (i in 1:n) { for (j in 1:n) { Gramm[i, j] <- rk(X[i,], X[j,]) } } return(Gramm) } ridge.regression <- function(X, y, lambda) { Gramm <- get.gramm(X) #Gramm matrix (nxn) n <- dim(X) # n=length of y J <- matrix(1, n, 1) # vector of ones dim Q <- cbind(J, Gramm) # design matrix m <- 1 # dimension of the null space of the penalty S <- matrix(0, n + m, n + m) #initialize S S[(m + 1):(n + m), (m + 1):(n + m)] <- Gramm #non-zero part of S M <- (t(Q) %*% Q + lambda * S) M.inv <- my.inv(M) # inverse of M gamma.hat <- crossprod(M.inv, crossprod(Q, y)) f.hat <- Q %*% gamma.hat A <- Q %*% M.inv %*% t(Q) tr.A <- sum(diag(A)) #trace of hat matrix rss <- t(y - f.hat) %*% (y - f.hat) #residual sum of squares gcv <- n * rss / (n - tr.A) ^ 2 #obtain GCV score return(list( f.hat = f.hat, gamma.hat = gamma.hat, gcv = gcv )) } # Plot of GCV lambda <- 1e-8 V <- rep(0, 40) for (i in 1:40) { V[i] <- ridge.regression(X, y, lambda)$gcv #obtain GCV score
lambda <- lambda * 1.5 #increase lambda
}

index <- (1:40)
plot(
1.5 ^ (index - 1) * 1e-8,
V,
type = "l",
main = "GCV
score",
lwd = 2,
xlab = "lambda",
ylab = "GCV"
) # plot score

i <- (1:60)[V == min(V)] # extract index of min(V)
opt.mod <- ridge.regression(X, y, 1.5 ^ (i - 1) * 1e-8) #fit optimal model

### finding beta.0, beta.1 and beta.2 ##########
gamma.hat <- opt.mod$gamma.hat beta.hat.0 <- opt.mod$gamma.hat#intercept
beta.hat <- gamma.hat[2:21,] %*% X #slope and noise term coefficients

#### Fitted Line Plot for Cubic Smoothing Spline ####
plot(x[,1],y,xlab="x",ylab="response",main="Cubic Smoothing Spline") ;
lines(x[,1],opt.mod$f.hat,type="l",lty=1,lwd=2,col="blue") ; #### A.2 A.2 RKHS solution applied to Cubic Smoothing Spline We consider a sample of size n = 50, ($$y_1, y_2, y_3, ..., y_{50}$$), from the model $$y_i = sin(2πx_i) + ϵ_i$$ where ϵi has a N(0, 0.22) . The following code generates x and y… A simple direct search for the GCV optimal smoothing parameter can be made as follows: Now we have to find an optimal lambda using GCV… Below we give a function to find the cubic smoothing spline using the RKHS framework we discussed in Section 4.3. We also provide a graph with our estimation along with the true function and data. ###### Data ######## set.seed(3) n <- 50 x <- matrix(runif(n), nrow, ncol = 1) x.star <- matrix(sort(x), nrow, ncol = 1) # sorted x, used by plot y <- sin(2 * pi * x.star) + rnorm(n, sd = 0.2) #### Reproducing Kernel for <f,g>=int_0^1 f’’(x)g’’(x)dx ##### rk.1 <- function(s, t) { return((1 / 2) * min(s, t) ^ 2) * (max(s, t) + (1 / 6) * (min(s, t)) ^ 3) } get.gramm.1 <- function(X) { n <- dim(X) Gramm <- matrix(0, n, n) #initializes Gramm array #i=index for rows #j=index for columns Gramm <- as.matrix(Gramm) # Gramm matrix for (i in 1:n) { for (j in 1:n) { Gramm[i, j] <- rk.1(X[i, ], X[j, ]) } } return(Gramm) } smoothing.spline <- function(X, y, lambda) { Gramm <- get.gramm.1(X) #Gramm matrix (nxn) n <- dim(X) # n=length of y J <- matrix(1, n, 1) # vector of ones dim T <- cbind(J, X) # matrix with a basis for the null space of the penalty Q <- cbind(T, Gramm) # design matrix m <- dim(T) # dimension of the null space of the penalty S <- matrix(0, n + m, n + m) #initialize S S[(m + 1):(n + m), (m + 1):(n + m)] <- Gramm #non-zero part of S M <- (t(Q) %*% Q + lambda * S) M.inv <- my.inv(M) # inverse of M gamma.hat <- crossprod(M.inv, crossprod(Q, y)) f.hat <- Q %*% gamma.hat A <- Q %*% M.inv %*% t(Q) tr.A <- sum(diag(A)) #trace of hat matrix rss <- t(y - f.hat) %*% (y - f.hat) #residual sum of squares gcv <- n * rss / (n - tr.A) ^ 2 #obtain GCV score return(list( f.hat = f.hat, gamma.hat = gamma.hat, gcv = gcv )) } ### Now we have to find an optimal lambda using GCV... ### Plot of GCV lambda <- 1e-8 V <- rep(0, 60) for (i in 1:60) { V[i] <- smoothing.spline(x.star, y, lambda)$gcv #obtain GCV score
lambda <- lambda * 1.5 #increase lambda
}
plot(1:60,
V,
type = "l",
main = "GCV score",
xlab = "i") # plot score
i <- (1:60)[V == min(V)] # extract index of min(V)
fit_rk <- smoothing.spline(x.star, y, 1.5 ^ (i - 1) * 1e-8) #fit optimal
model

#Graph (Cubic Spline)
plot(
x.star,
fit_rk\$f.hat,
type = "l",
lty = 2,
lwd = 2,
col = "blue",
xlab = "x",
ylim = c(-2.5, 1.5),
xlim = c(-0.1, 1.1),
ylab = "response",
main = "Cubic Spline"
)
#predictions
lines(x.star, sin(2 * pi * x.star), lty = 1, lwd = 2) #true
legend(
-0.1,
-1.5,
c("predictions", "true"),
lty = c(2, 1),
bty = "n",
lwd = c(2, 2),
col = c("blue", "black")
)
points(x.star, y)