Supplemental
Other Languages
For some of these models and algorithms, I either had code in another language to work with or, at the time, simply wanted to try it in that language. There is not a whole lot here, but it may still be useful to some. Refer to the corresponding chapter of R code for context.
Python Demos
Linear Regression
Data Setup
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.optimize import minimize
np.random.seed(123) # ensures replication
# predictors and response
# increasing N will get estimated values closer to the known parameters
N = 1000 # sample size
k = 2 # number of desired predictors
X = np.matrix(np.random.normal(size = N * k)).reshape(N, k)
y = -.5 + .2*X[:, 0] + .1*X[:, 1] + np.random.normal(scale = .5, size = N).reshape(N, 1)
dfXy = pd.DataFrame(np.column_stack([X, y]), columns = ['X1', 'X2', 'y'])
Functions
A maximum likelihood approach.
def lm_ml(par, X, y):
    # par: parameters to be estimated
    # X: predictor matrix with intercept column
    # y: response
    # setup
    beta  = par[1:].reshape(X.shape[1], 1)   # coefficients
    sigma = par[0]                           # error sd
    # N = X.shape[0]
    # linear predictor
    LP = X * beta   # linear predictor
    mu = LP         # identity link in the glm sense
    # calculate likelihood
    L = norm.logpdf(y, loc = mu, scale = sigma)   # log likelihood per observation
    # L = -.5*N*np.log(sigma**2) - .5*(1/sigma**2)*np.sum((y - mu)**2)   # alternate log likelihood form (up to a constant)
    L = -np.sum(L)   # minimize performs minimization, so negate to maximize the likelihood
    return(L)
An approach via least squares loss function.
def lm_ls(par, X, y):
    # par: parameters to be estimated
    # X: predictor matrix with intercept column
    # y: response
    # setup
    beta = par.reshape(3, 1)   # coefficients
    N = X.shape[0]
    p = X.shape[1]
    # linear predictor
    LP = X * beta   # linear predictor
    mu = LP         # identity link in the glm sense
    # squared error loss
    return(np.sum(np.square(y - mu)))
Estimation
X_mm = np.column_stack([np.repeat(1, N).reshape(N, 1), X])
# you may get warnings, they can be ignored
fit_ml = minimize(
fun = lm_ml,
x0 = [1, 0, 0, 0],
args = (X_mm, y),
bounds = ((0, None), (None, None), (None, None), (None, None)),
method = 'L-BFGS-B',
tol = 1e-12,
options = {'maxiter': 500}
)
# can use least_squares directly
# from scipy.optimize import least_squares
fit_ls = minimize(
lm_ls,
x0 = np.array([0, 0, 0]),
args = (X_mm, y),
tol = 1e-12,
options = {'maxiter': 500}
)
Comparison
import statsmodels.formula.api as smf
model_sm_ols = smf.ols('y ~ X1 + X2', data = dfXy)
fit_sm_ols = model_sm_ols.fit()
pd.DataFrame(
[
np.append(fit_ml.x[0]**2, fit_ml.x[1:]),
np.append(fit_ls.fun/(N - X_mm.shape[1] - 1), fit_ls.x),
np.append(fit_sm_ols.scale, fit_sm_ols.params)
],
columns = ['sigma','Int', 'b_X1', 'b_X2'],
index = ['ML', 'OLS', 'SM']
)
sigma Int b_X1 b_X2
ML 0.240332 -0.494647 0.213036 0.0815
OLS 0.241297 -0.494647 0.213036 0.0815
SM 0.241055 -0.494647 0.213036 0.0815
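As a quick additional check (an addition here, not part of the original demo), the least squares coefficients can also be recovered in closed form with numpy's lstsq, which should reproduce the Int, b_X1, and b_X2 columns above.
# closed-form least squares solution as a cross-check (uses X_mm and y from above)
coefs_lstsq = np.linalg.lstsq(X_mm, y, rcond = None)[0]
np.asarray(coefs_lstsq).ravel()   # compare to Int, b_X1, b_X2 above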
Logistic Regression
Data Setup
import numpy as np
import pandas as pd
from scipy.stats import logistic, binom
from scipy.optimize import minimize
np.random.seed(123) # ensures replication
# predictors and response
# increasing N will get estimated values closer to the known parameters
N = 2500 # sample size
k = 2 # number of desired predictors
X = np.matrix(np.random.normal(size = N * k)).reshape(N, k)
eta = -.5 + .2*X[:, 0] + .1*X[:, 1]
pi = 1/(1 + np.exp(-eta))
y = np.random.binomial(1, p = pi, size = (N, 1))
dfXy = pd.DataFrame(np.column_stack([X, y]), columns = ['X1', 'X2', 'y'])
Functions
A maximum likelihood approach.
def logreg_ml(par, X, y):
    # par: parameters to be estimated
    # X: predictor matrix with intercept column
    # y: response
    # setup
    beta = par.reshape(X.shape[1], 1)   # coefficients
    # N = X.shape[0]
    # linear predictor
    LP = np.dot(X, beta)       # linear predictor
    mu = 1/(1 + np.exp(-LP))   # logit link
    # calculate likelihood
    L = binom.logpmf(y, 1, mu)   # log likelihood
    # L = np.multiply(y, np.log(mu)) + np.multiply(1 - y, np.log(1 - mu))   # alternate log likelihood form
    L = -np.sum(L)   # minimize performs minimization, so negate to maximize the likelihood
    return(L)
Another approach, via an exponential loss function.
def logreg_exp(par, X, y):
    # Arguments
    # par: parameters to be estimated
    # X: predictor matrix with intercept column
    # y: target
    # setup
    beta = par.reshape(X.shape[1], 1)   # coefficients
    # linear predictor
    LP = np.dot(X, beta)   # linear predictor
    # calculate exponential loss function (convert y to -1:1 from 0:1)
    L = np.sum(np.exp(-np.multiply(np.where(y, 1, -1), LP) * .5))
    return(L)
Estimation
Setup for use with scipy's minimize.
X_mm = np.column_stack([np.repeat(1, N).reshape(N, 1), X])
fit_ml = minimize(
fun = logreg_ml,
x0 = [0, 0, 0],
args = (X_mm, y),
method = 'BFGS',
tol = 1e-12,
options = {'maxiter': 500}
)
fit_exp = minimize(
fun = logreg_exp,
x0 = [0, 0, 0],
args = (X_mm, y),
method = 'BFGS',
tol = 1e-12,
options = {'maxiter': 500}
)
pars_ml = fit_ml.x
pars_exp = fit_exp.x
Comparison
import statsmodels.formula.api as smf
model_sm_glm = smf.logit('y ~ X1 + X2', data = dfXy)
fit_sm_glm = model_sm_glm.fit()
pd.DataFrame(
[
pars_ml,
pars_exp,
fit_sm_glm.params
],
columns = ['Int', 'b_X1', 'b_X2'],
index = ['fit_ml', 'fit_exp', 'SM']
)
Int b_X1 b_X2
fit_ml -0.505637 0.234567 0.054700
fit_exp -0.504802 0.231859 0.053134
SM -0.505637 0.234567 0.054700
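As a small added check (not in the original), the fitted probabilities from the hand-coded ML estimates should essentially match those from statsmodels.
# compare fitted probabilities from the ML estimates with statsmodels' predictions
probs_ml = 1/(1 + np.exp(-np.dot(X_mm, pars_ml.reshape(3, 1))))
probs_sm = fit_sm_glm.predict()   # predicted probabilities from the statsmodels fit
np.max(np.abs(np.asarray(probs_ml).ravel() - probs_sm))   # should be near zero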
Quantile Regression
See the R chapter for details.
Data Setup
import pandas as pd
import numpy as np
# originally from quantreg package
engel = pd.read_csv('data/engel.csv')
X = np.column_stack([
np.ones((engel.shape[0],1)),
np.asarray(engel['income'])
])
y = np.asarray(engel['foodexp'])
def qreg(par, X, y, tau):
    lp = np.dot(X, par)
    res = y - lp
    loss = np.where(res < 0, -(1 - tau)*res, tau*res)
    return(np.sum(loss))
Estimation
We’ll estimate the median to start.
# from scipy.stats import logistic, binom
from scipy.optimize import minimize
minimize(
fun = qreg,
x0 = [0, 0],
args = (X, y, .5),
method = 'BFGS',
tol = 1e-12,
options = {'maxiter': 500}
)
Other quantiles
Now we will add additional quantiles to estimate.
import pandas as pd
# quantiles
qs = [.05, .1, .25, .5, .75, .9, .95]
fit = []
for tau in qs:
    init = minimize(
        fun = qreg,
        x0 = [0, 0],
        args = (X, y, tau),
        method = 'BFGS',
        tol = 1e-12,
        options = {'maxiter': 500}
    )
    fit.append(
        pd.DataFrame(init.x.reshape(1, 2), columns = ['int', 'slope'])
    )
fit_qr = pd.concat(fit)
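For comparison, the same quantile fits can be obtained from statsmodels; this is a brief added check (not part of the original), assuming the engel data and qs from above.
# statsmodels quantile regression fits for the same quantiles
import statsmodels.formula.api as smf
fit_sm_qr = pd.concat(
    [smf.quantreg('foodexp ~ income', data = engel).fit(q = tau).params for tau in qs],
    axis = 1
).T
fit_sm_qr   # columns: Intercept, income; compare with fit_qr above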
from plotnine import *
(
ggplot(aes(x = 'income', y = 'foodexp'), data = engel) +
geom_point() +
geom_abline(aes(intercept = 'int', slope = 'slope', color = 'int'),
data = fit_qr)
)
Stochastic Gradient Descent
Here we use the adagrad approach as in the R demo. The dataset shift and other variants are not demonstrated.
Data Setup
Create some data for a standard linear regression.
import numpy as np
np.random.seed(1234)
n = 10000
x1 = np.random.normal(size = n)
x2 = np.random.normal(size = n)
y = 1 + .5*x1 + .2*x2 + np.random.normal(size = n)
X = np.column_stack([np.ones(n), x1, x2])
Function
The estimating function using the adagrad approach.
def sgd(par, X, y, stepsize, stepsize_tau = 0, average = False):
    """
    Estimate a linear regression via stochastic gradient descent

    par: parameter estimates
    X: model matrix
    y: target variable
    stepsize: the learning rate
    stepsize_tau: if > 0, a check on the LR at early iterations
    average: an alternative method

    Returns a dictionary of the parameter estimates, the estimates for each iteration,
    the loss for each iteration, the root mean square error, and the fitted values
    using the final parameter estimates
    """
    beta = par.copy()            # copy so the initial values are not modified in place
    betamat = np.zeros_like(X)   # parameter estimates at each iteration
    fits = np.zeros_like(y)      # if you want these across iterations
    loss = np.zeros_like(y)
    s = 0                        # adagrad running sum of squared gradients

    for i in np.arange(X.shape[0]):
        Xi = X[i]
        yi = y[i]

        LP = np.dot(Xi, beta)
        grad = np.dot(Xi.T, LP - yi)
        s = s + grad**2
        beta -= stepsize * grad/(stepsize_tau + np.sqrt(s))

        if average and i > 1:
            beta -= 1/i * (betamat[i - 1] - beta)

        betamat[i] = beta
        fits[i] = LP
        loss[i] = (LP - yi)**2

    LP = np.dot(X, beta)
    lastloss = np.dot(LP - y, LP - y)

    return({
        'par': beta,            # final estimates
        'par_chain': betamat,   # all estimates
        'loss': loss,           # observation level loss
        'RMSE': np.sqrt(np.sum(lastloss)/X.shape[0]),
        'fitted': LP
    })
Set starting values and estimate. For any particular data you might have to fiddle with the stepsize, perhaps choosing one based on cross-validation with old data; a minimal version of that idea is sketched after the output below.
init = np.zeros(3)
fit_sgd = sgd(
init,
X,
y,
stepsize = .1,
stepsize_tau = .5
)
fit_sgd['par'].round(4)
fit_sgd['RMSE'].round(4)
array([1.0172, 0.4942, 0.1937])
0.9978
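The stepsize selection mentioned above can be done with a simple holdout; here is a minimal sketch (an addition, not in the original) that fits on the first half of the data and evaluates candidate stepsizes on the second half. The candidate grid is arbitrary.
# crude stepsize selection via a holdout split, using the sgd function above
X_train, X_test = X[:n//2], X[n//2:]
y_train, y_test = y[:n//2], y[n//2:]

for candidate in [.01, .05, .1, .5]:
    fit = sgd(np.zeros(3), X_train, y_train, stepsize = candidate, stepsize_tau = .5)
    test_rmse = np.sqrt(np.mean((np.dot(X_test, fit['par']) - y_test)**2))
    print(candidate, test_rmse.round(4))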
Comparison
We’ll compare our results to standard linear regression.
import pandas as pd
import statsmodels.formula.api as smf
df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
lm_fit = smf.ols('y ~ x1 + x2', df).fit()
lm_fit.params.round(4)
lm_fit.scale.round(4)
Intercept 1.0037
x1 0.5032
x2 0.2000
dtype: float64
0.9955
Nelder-Mead
import copy
'''
Francois Chollet's Nelder-Mead in Python.
https://github.com/fchollet/nelder-mead/blob/master/nelder_mead.py
Pure Python/Numpy implementation of the Nelder-Mead algorithm.
Reference: https://en.wikipedia.org/wiki/Nelder%E2%80%93Mead_method
'''
def nelder_mead(
f,
x_start,
step = 0.1,
no_improve_thr = 10e-6,
no_improv_break = 10,
max_iter = 0,
alpha = 1.,
gamma = 2.,
rho = 0.5,
sigma = 0.5
):
    '''
    @param f (function): function to optimize, must return a scalar score
        and operate over a numpy array of the same dimensions as x_start
    @param x_start (numpy array): initial position
    @param step (float): look-around radius in initial step
    @no_improv_thr, no_improv_break (float, int): break after no_improv_break iterations with
        an improvement lower than no_improv_thr
    @max_iter (int): always break after this number of iterations.
        Set it to 0 to loop indefinitely.
    @alpha, gamma, rho, sigma (floats): parameters of the algorithm
        (see Wikipedia page for reference)
    return: tuple (best parameter array, best score)
    '''
    # init
    dim = len(x_start)
    prev_best = f(x_start)
    no_improv = 0
    res = [[x_start, prev_best]]

    for i in range(dim):
        x = copy.copy(x_start)
        x[i] = x[i] + step
        score = f(x)
        res.append([x, score])

    # simplex iter
    iters = 0
    while 1:
        # order
        res.sort(key=lambda x: x[1])
        best = res[0][1]

        # break after max_iter
        if max_iter and iters >= max_iter:
            return res[0]
        iters += 1

        # break after no_improv_break iterations with no improvement
        if iters//10 == 0:
            print('...best so far:', best)

        if best < prev_best - no_improve_thr:
            no_improv = 0
            prev_best = best
        else:
            no_improv += 1

        if no_improv >= no_improv_break:
            return res[0]

        # centroid
        x0 = [0.] * dim
        for tup in res[:-1]:
            for i, c in enumerate(tup[0]):
                x0[i] += c / (len(res)-1)

        # reflection
        xr = x0 + alpha*(x0 - res[-1][0])
        rscore = f(xr)
        if res[0][1] <= rscore < res[-2][1]:
            del res[-1]
            res.append([xr, rscore])
            continue

        # expansion
        if rscore < res[0][1]:
            xe = x0 + gamma*(x0 - res[-1][0])
            escore = f(xe)
            if escore < rscore:
                del res[-1]
                res.append([xe, escore])
                continue
            else:
                del res[-1]
                res.append([xr, rscore])
                continue

        # contraction
        xc = x0 + rho*(x0 - res[-1][0])
        cscore = f(xc)
        if cscore < res[-1][1]:
            del res[-1]
            res.append([xc, cscore])
            continue

        # reduction
        x1 = res[0][0]
        nres = []
        for tup in res:
            redx = x1 + sigma*(tup[0] - x1)
            score = f(redx)
            nres.append([redx, score])
        res = nres
if __name__ == "__main__":
    # test
    import math
    import numpy as np

    def f(x):
        return math.sin(x[0]) * math.cos(x[1]) * (1. / (abs(x[2]) + 1))

    nelder_mead(f, np.array([0., 0., 0.]))
HMM
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
From the wikipedia page with slight modification
https://en.wikipedia.org/wiki/Viterbi_algorithm#Example
"""
def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]

    for st in states:
        V[0][st] = {"prob": start_p[st] * emit_p[st][obs[0]], "prev": None}

    # Run Viterbi when t > 0
    for t in range(1, len(obs)):
        V.append({})

        for st in states:
            max_tr_prob = max(V[t-1][prev_st]["prob"]*trans_p[prev_st][st] for prev_st in states)

            for prev_st in states:
                if V[t-1][prev_st]["prob"] * trans_p[prev_st][st] == max_tr_prob:
                    max_prob = max_tr_prob * emit_p[st][obs[t]]
                    V[t][st] = {"prob": max_prob, "prev": prev_st}
                    break

    for line in dptable(V):
        print(line)

    opt = []

    # The highest probability
    max_prob = max(value["prob"] for value in V[-1].values())
    previous = None

    # Get most probable state and its backtrack
    for st, data in V[-1].items():
        if data["prob"] == max_prob:
            opt.append(st)
            previous = st
            break

    # Follow the backtrack till the first observation
    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1][previous]["prev"])
        previous = V[t + 1][previous]["prev"]

    print('The steps of states are ' + ' '.join(opt) + ' with highest probability of %s' % max_prob)

def dptable(V):
    # Print a table of steps from dictionary
    yield " ".join(("%12d" % i) for i in range(len(V)))

    for state in V[0]:
        yield "%.7s: " % state + " ".join("%.7s" % ("%f" % v[state]["prob"]) for v in V)
# The function viterbi takes the following arguments: obs is the sequence of
# observations, e.g. ['normal', 'cold', 'dizzy']; states is the set of hidden
# states; start_p is the start probability; trans_p are the transition
# probabilities; and emit_p are the emission probabilities. For simplicity of
# code, we assume that the observation sequence obs is non-empty and that
# trans_p[i][j] and emit_p[i][j] is defined for all states i,j.
# In the running example, the forward/Viterbi algorithm is used as follows:
obs = ('normal', 'cold', 'dizzy')
states = ('Healthy', 'Fever')
start_p = {'Healthy': 0.6, 'Fever': 0.4}
trans_p = {
'Healthy' : {'Healthy': 0.7, 'Fever': 0.3},
'Fever' : {'Healthy': 0.4, 'Fever': 0.6}
}
emit_p = {
'Healthy' : {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
'Fever' : {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}
}
viterbi(obs, states, start_p, trans_p, emit_p)
Julia Demos
I haven’t played with Julia in a very long time, but I briefly hacked the old code to get something that worked. As Julia has gone through notable changes, it’s doubtful these are very good as far as Julia programming standards go, though conceptually they may still provide some utility. Perhaps at some point I’ll reteach myself the basics and come back to these. In any case, the code did run in a Jupyter notebook.
Mixed Models
One-factor
#####################
### Main function ###
#####################
using LinearAlgebra
using Statistics
function one_factor_re_loglike(par::Vector)
    d, ni = size(y)
    mu = par[1]
    sigma2_mu = par[2]
    sigma2 = par[3]

    Sigmai = sigma2*I(ni) + sigma2_mu*ones(ni, ni)
    l = -(ni*d)/2*log(2*pi) - d/2*log(det(Sigmai))

    for i in 1:d
        yi = y[i,:]
        l = l - .5(yi .- mu)' * (Sigmai\(yi .- mu))
    end

    l = -l[1]
    return l
end
###################
### Data set up ###
###################
y = [22.6 20.5 20.8
22.6 21.2 20.5
17.3 16.2 16.6
21.4 23.7 23.2
20.9 22.2 22.6
14.5 10.5 12.3
20.8 19.1 21.3
17.4 18.6 18.6
25.1 24.8 24.9
14.9 16.3 16.6]
################################
### Starting values and test ###
################################
using Statistics
mu0 = mean(y)
sigma2_mu0 = var(mean(y, dims = 2))
sigma20 = mean(var(y, dims = 2))
theta0 = [mu0, sigma2_mu0, sigma20]
### test
one_factor_re_loglike(theta0)
###########
### Run ###
###########
using Optim
res = optimize(one_factor_re_loglike, theta0, LBFGS())
res
* Status: success
* Candidate solution
Final objective value: 6.226441e+01
* Found with
Algorithm: L-BFGS
* Convergence measures
|x - x'| = 2.93e-08 ≰ 0.0e+00
|x - x'|/|x'| = 1.49e-09 ≰ 0.0e+00
|f(x) - f(x')| = 1.42e-14 ≰ 0.0e+00
|f(x) - f(x')|/|f(x')| = 2.28e-16 ≰ 0.0e+00
|g(x)| = 1.17e-09 ≤ 1.0e-08
* Work counters
Seconds run: 1 (vs limit Inf)
Iterations: 7
f(x) calls: 22
∇f(x) calls: 22
Optim.minimizer(res)
3-element Array{Float64,1}:
19.599999980440952
12.193999992338886
1.1666666662195693
Optim.minimum(res)
62.30661224610756
Two-factor
using LinearAlgebra
using Statistics
function sfran2_loglike(par::Vector)
    n = length(y)
    mu = par[1]
    sigma2_alpha = exp(par[2])
    sigma2_gamma = exp(par[3])
    sigma2 = exp(par[4])

    Sigma = sigma2*I(n) + sigma2_alpha*(Xalpha * Xalpha') + sigma2_gamma * (Xgamma * Xgamma')
    l = -n/2*log(2*pi) - sum(log.(diag(cholesky(Sigma).L))) - .5*(y .- mu)' * (Sigma\(y .- mu))
    l = -l[1]
    return l
end
##################
### Data setup ###
##################
y = [1.39,1.29,1.12,1.16,1.52,1.62,1.88,1.87,1.24,1.18,
     .95,.96,.82,.92,1.18,1.20,1.47,1.41,1.57,1.65]

# random effect design matrices (not defined in the original Julia code; these mirror the Matlab version below)
Xalpha = kron(I(5), ones(4, 1))
Xgamma = kron(I(10), ones(2, 1))
################################
### Starting values and test ###
################################
yhat = mean(reshape(y, 4, 5), dims = 1)
theta0 = [mean(y), log(var(yhat)), log(var(y)/3), log(var(y)/3)]
sfran2_loglike(theta0)
###########
### Run ###
###########
using Optim
res = optimize(sfran2_loglike, theta0, LBFGS())
res
* Status: success
* Candidate solution
Final objective value: -1.199315e+01
* Found with
Algorithm: L-BFGS
* Convergence measures
|x - x'| = 6.60e-09 ≰ 0.0e+00
|x - x'|/|x'| = 1.08e-09 ≰ 0.0e+00
|f(x) - f(x')| = 5.33e-15 ≰ 0.0e+00
|f(x) - f(x')|/|f(x')| = 4.44e-16 ≰ 0.0e+00
|g(x)| = 7.02e-10 ≤ 1.0e-08
* Work counters
Seconds run: 0 (vs limit Inf)
Iterations: 9
f(x) calls: 23
∇f(x) calls: 23
exp.(Optim.minimizer(res))
4-element Array{Float64,1}:
3.7434213772629223
0.053720000000540405
0.031790000003692476
0.002290000000530042
-2*Optim.minimum(res)
23.98630759443859
Matlab Demos
I don’t code in Matlab, nor have any particular desire to, so this is provided here just for reference because I happened to have the code.
Mixed Models
One-factor
% matlab from Statistical Modeling and Computation (2014 p 311). See the
% associated twofactorRE.R file for details.
function l = one_factor_re_loglike(mu, sigma2_mu, sigma2, y)
    [d, ni] = size(y);
    Sigmai = sigma2*eye(ni) + sigma2_mu*ones(ni, ni);
    l = -(ni*d) / 2*log(2*pi) - d / 2*log(det(Sigmai));
    for i = 1:d
        yi = y(i, :)';
        l = l - .5*(yi - mu)' * (Sigmai\(yi - mu));
    end
end
y = [22.6 20.5 20.8;
22.6 21.2 20.5;
17.3 16.2 16.6;
21.4 23.7 23.2;
20.9 22.2 22.6;
14.5 10.5 12.3;
20.8 19.1 21.3;
17.4 18.6 18.6;
25.1 24.8 24.9;
14.9 16.3 16.6];
f = @(theta) -one_factor_re_loglike(theta(1), theta(2), theta(3), y);
ybar = mean(y, 2);
theta0 = [mean(ybar) var(ybar) mean(var(y, 0, 2))];
thetahat = fminsearch(f, theta0);
Two-factor
% matlab from Statistical Modeling and Computation (2014 p 314). See the
% associated twofactorRE.R file for details.
function l = sfran2_loglike(mu, eta_alpha, eta_gamma, eta, y, Xalpha, Xgamma)
    sigma2_alpha = exp(eta_alpha);
    sigma2_gamma = exp(eta_gamma);
    sigma2 = exp(eta);
    n = length(y);
    Sigma = sigma2*speye(n) + sigma2_alpha * (Xalpha * Xalpha') + sigma2_gamma * (Xgamma*Xgamma');
    l = -n/2 * log(2*pi) - sum(log(diag(chol(Sigma)))) - .5*(y - mu)' * (Sigma\(y - mu));
end
y = [1.39 1.29 1.12 1.16 1.52 1.62 1.88 1.87 1.24 1.18 .95 .96 .82 .92 1.18 1.20 1.47 1.41 1.57 1.65]';  % transpose to a column vector so the quadratic form conforms
Xalpha = kron(speye(5), ones(4,1));
Xgamma = kron(speye(10), ones(2,1));
f = @(theta) -sfran2_loglike(theta(1), theta(2), theta(3), theta(4), y, Xalpha, Xgamma);
yhat = mean(reshape(y, 4, 5));
theta0 = [mean(y) log(var(yhat)) log(var(y)/3) log(var(y)/3)];
thetahat = fminsearch(f, theta0)
Gaussian Processes
Any updates on the following can be found at the repo.
function S = gaussSample(arg1, arg2, arg3)
% Returns n samples (in the rows) from a multivariate Gaussian distribution
%
% Examples:
% S = gaussSample(mu, Sigma, 10)
% S = gaussSample(model, 100)
% S = gaussSample(struct('mu',[0], 'Sigma', eye(1)), 3)
% This file is from pmtk3.googlecode.com
switch nargin
case 3, mu = arg1; Sigma = arg2; n = arg3;
case 2, model = arg1; mu = model.mu; Sigma = model.Sigma; n = arg2;
case 1, model = arg1; mu = model.mu; Sigma = model.Sigma; n = 1;
otherwise
error('bad num args')
end
A = chol(Sigma, 'lower');
Z = randn(length(mu), n);
S = bsxfun(@plus, mu(:), A*Z)';
end
%% Visualize the effect of changing the hyper-params for a 1d GP regression
% based on demo_gpr by Carl Rasmussen
%
%% Generate data
% This file is from pmtk3.googlecode.com
n = 20;
rand('state',18);
randn('state',20);
covfunc = {'covSum', {'covSEiso','covNoise'}};
loghyper = [log(1.0); log(1.0); log(0.1)];
x = 15*(rand(n,1)-0.5);
y = chol(feval(covfunc{:}, loghyper, x))'*randn(n,1); % Cholesky decomp.
xstar = linspace(-7.5, 7.5, 201)';
hyps = [log(1), log(1), log(0.1);...
log(0.3),log(1.08),log(0.00005);...
log(3),log(1.16),log(0.89)];
%% compute post pred and plot marginals
for i=1:size(hyps,1)
loghyper = hyps(i,:)';
[mu, S2] = gpr(loghyper, covfunc, x, y, xstar);
S2 = S2 - exp(2*loghyper(3)); % remove observation noise
figure;
f = [mu+2*sqrt(S2);flipdim(mu-2*sqrt(S2),1)];
fill([xstar; flipdim(xstar,1)], f, [7 7 7]/8, 'EdgeColor', [7 7 7]/8);
hold on
plot(xstar,mu,'k-','LineWidth',2);
plot(x, y, 'k+', 'MarkerSize', 17);
axis([-8 8 -3 3])
printPmtkFigure(sprintf('gprDemoChangeHparams%d', i));
end
%% Reproduce figure 2.2 from GP book
%
%%
% This file is from pmtk3.googlecode.com
setSeed(0);
L = 1;
xs = (-5:0.2:5)';
ns = length(xs);
keps = 1e-8;
muFn = @(x) 0*x(:).^2;
Kfn = @(x,z) 1*exp(-sq_dist(x'/L,z'/L)/2);
% plot sampled functions from the prior
figure; hold on
for i=1:3
model = struct('mu', muFn(xs), 'Sigma', Kfn(xs, xs) + 1e-15*eye(size(xs, 1)));
fs = gaussSample(model, 1);
plot(xs, fs, 'k-', 'linewidth', 2)
end
printPmtkFigure('gprDemoNoiseFreePrior')
% generate noise-less training data
Xtrain = [-4, -3, -2, -1, 1]';
ftrain = sin(Xtrain);
% compute posterior predictive
K = Kfn(Xtrain, Xtrain); % K
Ks = Kfn(Xtrain, xs); %K_*
Kss = Kfn(xs, xs) + keps*eye(length(xs)); % K_** (keps is essential!)
Ki = inv(K);
postMu = muFn(xs) + Ks'*Ki*(ftrain - muFn(Xtrain));
postCov = Kss - Ks'*Ki*Ks;
figure; hold on
% plot marginal posterior variance as gray band
mu = postMu(:);
S2 = diag(postCov);
f = [mu+2*sqrt(S2);flipdim(mu-2*sqrt(S2),1)];
fill([xs; flipdim(xs,1)], f, [7 7 7]/8, 'EdgeColor', [7 7 7]/8);
% plot samples from posterior predictive
for i=1:3
model = struct('mu', postMu(:)', 'Sigma', postCov);
fs = gaussSample(model, 1);
plot(xs, fs, 'k-', 'linewidth', 2)
h=plot(Xtrain, ftrain, 'kx', 'markersize', 12, 'linewidth', 3);
end
printPmtkFigure('gprDemoNoiseFreePost')