# Supplemental

## Other Languages

When doing some of these models and algorithms, I had some other code to work with in another language, or, at the time, just wanted to try it in that language. There is not a whole lot here, but it still may be useful to some. Refer to the corresponding chapter of R code for context.

## Python Demos

### Linear Regression

#### Data Setup

import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.optimize import minimize

np.random.seed(123) # ensures replication

# predictors and response
# increasing N will get estimated values closer to the known parameters

N = 1000 # sample size
k = 2   # number of desired predictors
X = np.matrix(np.random.normal(size = N * k)).reshape(N, k)
y = -.5 + .2*X[:, 0] + .1*X[:, 1] + np.random.normal(scale = .5, size = N).reshape(N, 1)

dfXy = pd.DataFrame(np.column_stack([X, y]), columns = ['X1', 'X2', 'y'])

#### Functions

A maximum likelihood approach.

def lm_ml(par, X, y):
# par: parameters to be estimated
# X: predictor matrix with intercept column
# y: response

# setup
beta   = par[1:].reshape(X.shape, 1)    # coefficients
sigma  = par                            # error sd
# N = X.shape

# linear predictor
LP = X * beta                              # linear predictor
mu = LP                                    # identity link in the glm sense

# calculate likelihood

L = norm.logpdf(y, loc = mu, scale = sigma) # log likelihood; or use norm.logpdf

# L =  -.5*N*log(sigma2) - .5*(1/sigma2)*crossprod(y-mu)    # alternate log likelihood form

L = -np.sum(L)       # optim by default is minimization, and we want to maximize the likelihood

return(L)

An approach via least squares loss function.

def lm_ls(par, X, y):
# par: parameters to be estimated
# X: predictor matrix with intercept column
# y: response

# setup
beta = par.reshape(3, 1)              # coefficients
N = X.shape
p = X.shape

# linear predictor
LP = X * beta                              # linear predictor
mu = LP                                    # identity link in the glm sense
#
# # squared error loss
#
return(np.sum(np.square(y - mu)))

#### Estimation

X_mm = np.column_stack([np.repeat(1, N).reshape(N, 1), X])

# you may get warnings, they can be ignored
fit_ml = minimize(
fun = lm_ml,
x0  = [1, 0, 0, 0],
args    = (X_mm, y),
bounds  = ((0, None), (None, None), (None, None), (None, None)),
method  = 'L-BFGS-B',
tol     = 1e-12,
options = {'maxiter': 500}
)

# can use least_squares directly
# from scipy.optimize import least_squares

fit_ls = minimize(
lm_ls,
x0   = np.array([0, 0, 0]),
args = (X_mm, y),
tol     = 1e-12,
options = {'maxiter': 500}
)

#### Comparison

import statsmodels.formula.api as smf

model_sm_ols = smf.ols('y ~ X1 + X2', data = dfXy)
fit_sm_ols   = model_sm_ols.fit()

pd.DataFrame(
[
np.append(fit_ml.x**2, fit_ml.x[1:]),
np.append(fit_ls.fun/(N - X_mm.shape - 1), fit_ls.x),
np.append(fit_sm_ols.scale, fit_sm_ols.params)
],
columns = ['sigma','Int', 'b_X1', 'b_X2'],
index   = ['ML', 'OLS', 'SM']
)

sigma       Int      b_X1    b_X2
ML   0.240332 -0.494647  0.213036  0.0815
OLS  0.241297 -0.494647  0.213036  0.0815
SM   0.241055 -0.494647  0.213036  0.0815


### Logistic Regression

#### Data Setup

import numpy as np
import pandas as pd
from scipy.stats import logistic, binom
from scipy.optimize import minimize

np.random.seed(123) # ensures replication

# predictors and response
# increasing N will get estimated values closer to the known parameters

N = 2500 # sample size
k = 2   # number of desired predictors
X = np.matrix(np.random.normal(size = N * k)).reshape(N, k)
eta = -.5 + .2*X[:, 0] + .1*X[:, 1]

pi = 1/(1 + np.exp(-eta))

y = np.random.binomial(1, p = pi, size = (N, 1))

dfXy = pd.DataFrame(np.column_stack([X, y]), columns = ['X1', 'X2', 'y'])

#### Functions

A maximum likelihood approach.

def logreg_ml(par, X, y):
# par: parameters to be estimated
# X: predictor matrix with intercept column
# y: response

# setup
beta   = par.reshape(X.shape, 1)             # coefficients
# N = X.shape

# linear predictor
LP = np.dot(X, beta)                              # linear predictor
mu = 1/(1 + np.exp(-LP))                   # logit link

# calculate likelihood
L = binom.logpmf(y, 1, mu) # log likelihood

# L =  np.multiply(y, np.log(mu)) + np.multiply(1 - y, np.log(1 - mu))    # alternate log likelihood form

L = -np.sum(L)       # optim by default is minimization, and we want to maximize the likelihood

return(L)

Another approach via exponential loss function.

def logreg_exp(par, X, y):
# Arguments
# par: parameters to be estimated
# X: predictor matrix with intercept column
# y: target

# setup
beta = par.reshape(X.shape, 1)                                    # coefficients

# linear predictor
LP = np.dot(X, beta)                              # linear predictor

# calculate exponential loss function (convert y to -1:1 from 0:1)
L = np.sum(np.exp(-np.multiply(np.where(y, 1, -1), LP) * .5))

return(L)

#### Estimation

Setup for use with optim.

X_mm = np.column_stack([np.repeat(1, N).reshape(N, 1), X])

fit_ml = minimize(
fun = logreg_ml,
x0  = [0, 0, 0],
args    = (X_mm, y),
method  = 'BFGS',
tol     = 1e-12,
options = {'maxiter': 500}
)

fit_exp = minimize(
fun = logreg_exp,
x0  = [0, 0, 0],
args    = (X_mm, y),
method  = 'BFGS',
tol     = 1e-12,
options = {'maxiter': 500}
)

pars_ml  = fit_ml.x
pars_exp = fit_exp.x

#### Comparison

import statsmodels.formula.api as smf

model_sm_glm = smf.logit('y ~ X1 + X2', data = dfXy)
fit_sm_glm   = model_sm_glm.fit()

pd.DataFrame(
[
pars_ml,
pars_exp,
fit_sm_glm.params
],
columns = ['Int', 'b_X1', 'b_X2'],
index   = ['fit_ml', 'fit_exp', 'SM']
)

Int      b_X1      b_X2
fit_ml  -0.505637  0.234567  0.054700
fit_exp -0.504802  0.231859  0.053134
SM      -0.505637  0.234567  0.054700


### Quantile Regression

See the R chapter for details.

#### Data Setup

import pandas as pd
import numpy as np

# originally from quantreg package

X = np.column_stack([
np.ones((engel.shape,1)),
np.asarray(engel['income'])
])

y = np.asarray(engel['foodexp'])
def qreg(par, X, y, tau):
lp = np.dot(X, par)
res = y - lp
loss = np.where(res < 0 , -(1 - tau)*res, tau*res)

return(np.sum(loss))

#### Estimation

We’ll estimate the median to start.

# from scipy.stats import logistic, binom
from scipy.optimize import minimize

minimize(
fun = qreg,
x0  = [0, 0],
args    = (X, y, .5),
method  = 'BFGS',
tol     = 1e-12,
options = {'maxiter': 500}
)

#### Other quantiles

pandas as pd

# quantiles
qs = [.05, .1, .25, .5, .75, .9, .95]
fit = []

for tau in qs:
init = minimize(
fun = qreg,
x0  = [0, 0],
args    = (X, y, tau),
method  = 'BFGS',
tol     = 1e-12,
options = {'maxiter': 500}
)

fit.append(
pd.DataFrame(init.x.reshape(1,2),  columns = ['int', 'slope'])
)

fit_qr = pd.concat(fit)

from plotnine import *

(
ggplot(aes(x = 'income', y = 'foodexp'), data = engel) +
geom_point() +
geom_abline(aes(intercept = 'int', slope = 'slope', color = 'int'),
data = fit_qr)
)

Here we use the adagrad approach as in the R demo. The dataset shift and other variants are not demonstrated.

#### Data Setup

Create some data for a standard linear regression.

import numpy as np

np.random.seed(1234)

n = 10000
x1 = np.random.normal(size = n)
x2 = np.random.normal(size = n)
y = 1 + .5*x1 + .2*x2 + np.random.normal(size = n)

X = np.column_stack([np.ones(n), x1, x2])

#### Function

def sgd(par, X, y, stepsize, stepsize_tau = 0, average = False):
"""
Estimate a linear regression via stochastic gradient descent

par: parameter estimates
X: model matrix
y: target variable
stepsize: the learning rate
stepsize_tau: if > 0, a check on the LR at early iterations
average: an alternative method

Returns a dictionary of the parameter estimates, the estimates for each iteration,
the loss for each iteration, the root mean square error, and the fitted values
using the final parameter estimates

"""

beta = par
betamat = np.zeros_like(X)
fits = np.zeros_like(y)              # if you want these across iterations
loss = np.zeros_like(y)
s = 0

for i in np.arange(X.shape):
Xi = X[i]
yi = y[i]

LP   = np.dot(Xi, beta)
grad = np.dot(Xi.T, LP - yi)
beta -= stepsize * grad/(stepsize_tau + np.sqrt(s))

if average and i > 1:
beta -= 1/i * (betamat[i - 1] - beta)

betamat[i] = beta
fits[i]    = LP
loss[i]    = (LP - yi)**2

LP = np.dot(X, beta)
lastloss = np.dot(LP - y, LP - y)

return({
'par': beta,                                 # final estimates
'par_chain': betamat,                        # all estimates
'loss': loss,                                # observation level loss
'RMSE': np.sqrt(np.sum(lastloss)/X.shape),
'fitted': LP
})

Set starting values and estimate. For any particular data you might have to fiddle with the stepsize, perhaps choosing one based on cross-validation with old data.

init = np.zeros(3)

fit_sgd = sgd(
init,
X,
y,
stepsize = .1,
stepsize_tau = .5
)

fit_sgd['par'].round(4)
fit_sgd['RMSE'].round(4)

array([1.0172, 0.4942, 0.1937])

0.9978


#### Comparison

We’ll compare our results to standard linear regression.

import pandas as pd
import statsmodels.formula.api as smf

df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})

lm_fit = smf.ols('y ~ x1 + x2', df).fit()

lm_fit.params.round(4)
lm_fit.scale.round(4)

Intercept    1.0037
x1           0.5032
x2           0.2000
dtype: float64

0.9955


import copy

'''
Pure Python/Numpy implementation of the Nelder-Mead algorithm.
'''

f,
x_start,
step = 0.1,
no_improve_thr  = 10e-6,
no_improv_break = 10,
max_iter = 0,
alpha = 1.,
gamma = 2.,
rho   = 0.5,
sigma = 0.5
):
'''
@param f (function): function to optimize, must return a scalar score
and operate over a numpy array of the same dimensions as x_start
@param x_start (numpy array): initial position
@param step (float): look-around radius in initial step
@no_improv_thr,  no_improv_break (float, int): break after no_improv_break iterations with
an improvement lower than no_improv_thr
@max_iter (int): always break after this number of iterations.
Set it to 0 to loop indefinitely.
@alpha, gamma, rho, sigma (floats): parameters of the algorithm
return: tuple (best parameter array, best score)
'''

# init
dim = len(x_start)
prev_best = f(x_start)
no_improv = 0
res = [[x_start, prev_best]]

for i in range(dim):
x = copy.copy(x_start)
x[i] = x[i] + step
score = f(x)
res.append([x, score])

# simplex iter
iters = 0
while 1:
# order
res.sort(key=lambda x: x)
best = res

# break after max_iter
if max_iter and iters >= max_iter:
return res
iters += 1

# break after no_improv_break iterations with no improvement
if iters//10 == 0:
print('...best so far:', best)

if best < prev_best - no_improve_thr:
no_improv = 0
prev_best = best
else:
no_improv += 1

if no_improv >= no_improv_break:
return res

# centroid
x0 = [0.] * dim
for tup in res[:-1]:
for i, c in enumerate(tup):
x0[i] += c / (len(res)-1)

# reflection
xr = x0 + alpha*(x0 - res[-1])
rscore = f(xr)
if res <= rscore < res[-2]:
del res[-1]
res.append([xr, rscore])
continue

# expansion
if rscore < res:
xe = x0 + gamma*(x0 - res[-1])
escore = f(xe)
if escore < rscore:
del res[-1]
res.append([xe, escore])
continue
else:
del res[-1]
res.append([xr, rscore])
continue

# contraction
xc = x0 + rho*(x0 - res[-1])
cscore = f(xc)
if cscore < res[-1]:
del res[-1]
res.append([xc, cscore])
continue

# reduction
x1 = res
nres = []
for tup in res:
redx = x1 + sigma*(tup - x1)
score = f(redx)
nres.append([redx, score])
res = nres

if __name__ == "__main__":
# test
import math
import numpy as np

def f(x):
return math.sin(x) * math.cos(x) * (1. / (abs(x) + 1))

nelder_mead(f, np.array([0., 0., 0.]))

### HMM

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
https://en.wikipedia.org/wiki/Viterbi_algorithm#Example
"""

def viterbi(obs, states, start_p, trans_p, emit_p):
V = [{}]

for st in states:
V[st] = {"prob": start_p[st] * emit_p[st][obs], "prev": None}

# Run Viterbi when t > 0

for t in range(1, len(obs)):
V.append({})

for st in states:
max_tr_prob = max(V[t-1][prev_st]["prob"]*trans_p[prev_st][st] for prev_st in states)

for prev_st in states:
if V[t-1][prev_st]["prob"] * trans_p[prev_st][st] == max_tr_prob:
max_prob = max_tr_prob * emit_p[st][obs[t]]
V[t][st] = {"prob": max_prob, "prev": prev_st}
break

for line in dptable(V):
print(line)

opt = []

# The highest probability
max_prob = max(value["prob"] for value in V[-1].values())
previous = None

# Get most probable state and its backtrack
for st, data in V[-1].items():
if data["prob"] == max_prob:
opt.append(st)
previous = st
break

# Follow the backtrack till the first observation
for t in range(len(V) - 2, -1, -1):
opt.insert(0, V[t + 1][previous]["prev"])
previous = V[t + 1][previous]["prev"]

print('The steps of states are ' + ' '.join(opt) + ' with highest probability of %s' % max_prob)

def dptable(V):
# Print a table of steps from dictionary
yield " ".join(("%12d" % i) for i in range(len(V)))

for state in V:
yield "%.7s: " % state + " ".join("%.7s" % ("%f" % v[state]["prob"]) for v in V)

# The function viterbi takes the following arguments: obs is the sequence of
# observations, e.g. ['normal', 'cold', 'dizzy']; states is the set of hidden
# states; start_p is the start probability; trans_p are the transition
# probabilities; and emit_p are the emission probabilities. For simplicity of
# code, we assume that the observation sequence obs is non-empty and that
# trans_p[i][j] and emit_p[i][j] is defined for all states i,j.

# In the running example, the forward/Viterbi algorithm is used as follows:

obs = ('normal', 'cold', 'dizzy')
states = ('Healthy', 'Fever')

start_p = {'Healthy': 0.6, 'Fever': 0.4}

trans_p = {
'Healthy' : {'Healthy': 0.7, 'Fever': 0.3},
'Fever' : {'Healthy': 0.4, 'Fever': 0.6}
}

emit_p = {
'Healthy' : {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
'Fever' : {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}
}

viterbi(obs,states,start_p,trans_p,emit_p)

## Julia Demos

I haven’t played with Julia in a very long time, but briefly hacked the old code to get something that worked. As Julia has gone though notable changes, it’s doubtful these are very good as far as Julia programming standards go, though conceptually they may still provide some utility. Perhaps at some point I’ll reteach myself the basics and come back to these. In any case the code did run on a Jupyter notebook.

### Mixed Models

#### One-factor

#####################
### Main function ###
#####################

using LinearAlgebra
using Statistics

function one_factor_re_loglike(par::Vector)
d, ni = size(y)
mu = par
sigma2_mu = par
sigma2 = par

Sigmai = sigma2*I(ni) + sigma2_mu*ones(ni, ni)
l = -(ni*d)/2*log(2*pi) - d/2*log(det(Sigmai))

for i in 1:d
yi = y[i,:]
l = l - .5(yi .- mu)' * (Sigmai\(yi .- mu))
end

l = -l

return l
end
###################
### Data set up ###
###################

y = [22.6 20.5 20.8
22.6 21.2 20.5
17.3 16.2 16.6
21.4 23.7 23.2
20.9 22.2 22.6
14.5 10.5 12.3
20.8 19.1 21.3
17.4 18.6 18.6
25.1 24.8 24.9
14.9 16.3 16.6]

################################
### Starting values and test ###
################################
using Statistics

mu0 = mean(y)
sigma2_mu0 = var(mean(y, dims = 2))
sigma20 = mean(var(y, dims = 2))
theta0  = [mu0, sigma2_mu0, sigma20]

### test
one_factor_re_loglike(theta0)
###########
### Run ###
###########

using Optim
res = optimize(one_factor_re_loglike, theta0, LBFGS())
res

* Status: success

* Candidate solution
Final objective value:     6.226441e+01

* Found with
Algorithm:     L-BFGS

* Convergence measures
|x - x'|               = 2.93e-08 ≰ 0.0e+00
|x - x'|/|x'|          = 1.49e-09 ≰ 0.0e+00
|f(x) - f(x')|         = 1.42e-14 ≰ 0.0e+00
|f(x) - f(x')|/|f(x')| = 2.28e-16 ≰ 0.0e+00
|g(x)|                 = 1.17e-09 ≤ 1.0e-08

* Work counters
Seconds run:   1  (vs limit Inf)
Iterations:    7
f(x) calls:    22
∇f(x) calls:   22


Optim.minimizer(res)

3-element Array{Float64,1}:
19.599999980440952
12.193999992338886
1.1666666662195693
Optim.minimum(res)
62.30661224610756

#### Two-factor

using LinearAlgebra
using Statistics

function sfran2_loglike(par::Vector)
n = length(y)
mu = par

sigma2_alpha = exp(par)
sigma2_gamma = exp(par)
sigma2 = exp(par)

Sigma = sigma2*I(n) + sigma2_alpha*(Xalpha * Xalpha') + sigma2_gamma * (Xgamma * Xgamma')

l = -n/2*log(2*pi) - sum(log.(diag(cholesky(Sigma).L))) - .5*(y .- mu)' * (Sigma\(y .- mu))

l = -l
return l
end
##################
### Data setup ###
##################
y = [1.39,1.29,1.12,1.16,1.52,1.62,1.88,1.87,1.24,1.18,
.95,.96,.82,.92,1.18,1.20,1.47,1.41,1.57,1.65]

################################
### Starting values and test ###
################################
yhat = mean(reshape(y, 4, 5), 1)

theta0 = [mean(y), log(var(yhat)), log(var(y)/3), log(var(y)/3)]

sfran2_loglike(theta0)
###########
### Run ###
###########
using Optim
res = optimize(sfran2_loglike, theta0, method = :l_bfgs)
res

* Status: success

* Candidate solution
Final objective value:     -1.199315e+01

* Found with
Algorithm:     L-BFGS

* Convergence measures
|x - x'|               = 6.60e-09 ≰ 0.0e+00
|x - x'|/|x'|          = 1.08e-09 ≰ 0.0e+00
|f(x) - f(x')|         = 5.33e-15 ≰ 0.0e+00
|f(x) - f(x')|/|f(x')| = 4.44e-16 ≰ 0.0e+00
|g(x)|                 = 7.02e-10 ≤ 1.0e-08

* Work counters
Seconds run:   0  (vs limit Inf)
Iterations:    9
f(x) calls:    23
∇f(x) calls:   23
exp.(Optim.minimizer(res))

4-element Array{Float64,1}:
3.7434213772629223
0.053720000000540405
0.031790000003692476
0.002290000000530042
-2*Optim.minimum(res)
23.98630759443859

## Matlab Demos

I don’t code in Matlab, nor have any particular desire to, so this is provided here just for reference because I happened to have the code.

### Mixed Models

#### One-factor

% matlab from Statistical Modeling and Computation (2014 p 311).  See the
% associated twofactorRE.R file for details.

function one_factor_re_loglike(mu, sigma2_mu, sigma2, y)
[d ni] = size(y);
Sigmai = sigma2*eye(ni) + sigma2_mu*ones(ni,ni);
l = -(ni*d) / 2*log(2*pi) - d / 2*log(det(Sigmai));
for i=1:d
yi = y(i, :)';
l = l - .5*(yi - mu)' * (Sigmai\(yi - mu));
end
end

y = [22.6 20.5 20.8;
22.6 21.2 20.5;
17.3 16.2 16.6;
21.4 23.7 23.2;
20.9 22.2 22.6;
14.5 10.5 12.3;
20.8 19.1 21.3;
17.4 18.6 18.6;
25.1 24.8 24.9;
14.9 16.3 16.6];

f = @(theta) -one_factor_re_loglike(theta(1), theta(2), theta(3), y);
ybar = mean(y, 2);
theta0 = [mean(ybar) var(ybar) mean(var(y, 0, 2))];
thetahat = fminsearch(f, theta0);

#### Two-factor

% matlab from Statistical Modeling and Computation (2014 p 314). See the
% associated twofactorRE.R file for details.

function sfran2_loglike(mu, eta_alpha, eta_gamma, eta, y, Xalpha, Xgamma)
sigma2_alpha = exp(eta_alpha);
sigma2_gamma = exp(eta_gamma);
sigma2 = exp(eta);

n = length(y);

Sigma = sigma2*speye(n) + sigma2_alpha * (Xalpha * Xalpha') + sigma2_gamma * (Xgamma*Xgamma');

l = -n/2 * log(2*pi) - sum(log(diag(chol(Sigma)))) - .5*(y - mu)' * (Sigma\(y - mu));

end

y = [1.39 1.29 1.12 1.16 1.52 1.62 1.88 1.87 1.24 1.18 .95 .96 .82 .92 1.18 1.20 1.47 1.41 1.57 1.65];

Xalpha = kron(speye(5),  ones(4,1));

Xgamma = kron(speye(10), ones(2,1));

f = @(theta) -sfran_loglike(theta(1), theta(2), theta(3), theta(4), y, Xalpha, Xgamma);
yhat     = mean(reshape(y, 4, 5));
theta0   = [mean(y) log(var(yhat)) log(var(y)/3) log(var(y)/3)];
thetahat = fminsearch(f, theta0)

### Gaussian Processes

Any updates on the following can be found at the repo.

function S = gaussSample(arg1, arg2, arg3)
% Returns n samples (in the rows) from a multivariate Gaussian distribution
%
% Examples:
% S = gaussSample(mu, Sigma, 10)
% S = gaussSample(model, 100)
% S = gaussSample(struct('mu',, 'Sigma', eye(1)), 3)

% This file is from pmtk3.googlecode.com

switch nargin
case 3,  mu = arg1; Sigma = arg2; n = arg3;
case 2, model = arg1; mu = model.mu; Sigma = model.Sigma; n = arg2;
case 1, model = arg1; mu = model.mu; Sigma = model.Sigma; n = 1;
otherwise
end

A = chol(Sigma, 'lower');
Z = randn(length(mu), n);
S = bsxfun(@plus, mu(:), A*Z)';

end
%% Visualize the effect of change the hyper-params for a 1d GP regression
% based on demo_gpr by Carl Rasmussen
%
%% Generate data

% This file is from pmtk3.googlecode.com

n = 20;
rand('state',18);
randn('state',20);
covfunc = {'covSum', {'covSEiso','covNoise'}};
loghyper = [log(1.0); log(1.0); log(0.1)];
x = 15*(rand(n,1)-0.5);
y = chol(feval(covfunc{:}, loghyper, x))'*randn(n,1);        % Cholesky decomp.

xstar = linspace(-7.5, 7.5, 201)';

hyps = [log(1), log(1), log(0.1);...
log(0.3),log(1.08),log(0.00005);...
log(3),log(1.16),log(0.89)];

%% compute post pred and plot marginals
for i=1:size(hyps,1)
loghyper = hyps(i,:)';
[mu, S2] = gpr(loghyper, covfunc, x, y, xstar);
S2 = S2 - exp(2*loghyper(3)); % remove observation noise

figure;
f = [mu+2*sqrt(S2);flipdim(mu-2*sqrt(S2),1)];
fill([xstar; flipdim(xstar,1)], f, [7 7 7]/8, 'EdgeColor', [7 7 7]/8);
hold on
plot(xstar,mu,'k-','LineWidth',2);
plot(x, y, 'k+', 'MarkerSize', 17);
axis([-8 8 -3 3])
printPmtkFigure(sprintf('gprDemoChangeHparams%d', i));
end
%% Reproduce figure 2.2 from GP book
%
%%

% This file is from pmtk3.googlecode.com

setSeed(0);
L = 1;
xs = (-5:0.2:5)';
ns = length(xs);
keps = 1e-8;
muFn = @(x) 0*x(:).^2;
Kfn = @(x,z) 1*exp(-sq_dist(x'/L,z'/L)/2);

% plot sampled functions from the prior
figure; hold on
for i=1:3
model = struct('mu', muFn(xs), 'Sigma',  Kfn(xs, xs) + 1e-15*eye(size(xs, 1)));
fs = gaussSample(model, 1);
plot(xs, fs, 'k-', 'linewidth', 2)
end
printPmtkFigure('gprDemoNoiseFreePrior')

% generate noise-less training data
Xtrain = [-4, -3, -2, -1, 1]';
ftrain = sin(Xtrain);

% compute posterior predictive
K = Kfn(Xtrain, Xtrain); % K
Ks = Kfn(Xtrain, xs); %K_*
Kss = Kfn(xs, xs) + keps*eye(length(xs)); % K_** (keps is essential!)
Ki = inv(K);
postMu = muFn(xs) + Ks'*Ki*(ftrain - muFn(Xtrain));
postCov = Kss - Ks'*Ki*Ks;

figure; hold on
% plot marginal posterior variance as gray band
mu = postMu(:);
S2 = diag(postCov);
f = [mu+2*sqrt(S2);flipdim(mu-2*sqrt(S2),1)];
fill([xs; flipdim(xs,1)], f, [7 7 7]/8, 'EdgeColor', [7 7 7]/8);

% plot samples from posterior predictive
for i=1:3
model = struct('mu', postMu(:)', 'Sigma', postCov);
fs = gaussSample(model, 1);
plot(xs, fs, 'k-', 'linewidth', 2)
h=plot(Xtrain, ftrain, 'kx', 'markersize', 12, 'linewidth', 3);
end
printPmtkFigure('gprDemoNoiseFreePost')