gridcv - tecator - All models

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/tecator.jld2") 
@load db dat
@names dat
X = dat.X
Y = dat.Y 
ntot, p = size(X)
typ = Y.typ
tab(typ)
wlst = names(X)
wl = parse.(Float64, wlst) 
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

Preprocessing

model1 = snv()
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
@head Xp = transf(model, X) 
#plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

Split Tot to Train/Test

s = typ .== "train"
Xtrain = Xp[s, :] 
Ytrain = Y[s, :]
Xtest = rmrow(Xp, s)
Ytest = rmrow(Y, s)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
(ntot = ntot, ntrain, ntest)

Working response y

namy = names(Y)[1:3]
j = 2  
nam = namy[j]    # work on the j-th y-variable
ytrain = Ytrain[:, nam]
ytest = Ytest[:, nam]

CV-Segments for model tuning

Replicated K-fold CV

K = 3     # nb. folds (segments)
rep = 10   # nb. replications
segm = segmkf(ntrain, K; rep = rep)

Grid-search

Plsr

nlv = 0:20
model = plskern()  
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, nlv).res 
plotgrid(res.nlv, res.y1; step = 2, xlabel ="Nb. LVs", ylabel = "RMSEP").f
u = findall(res.y1 .== minimum(res.y1))[1]
res[u, :] 
model = plskern(nlv = res.nlv[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)
pars = mpar(scal = [false; true])
nlv = 0:20
model = plskern()  
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, nlv).res 
plotgrid(res.nlv, res.y1, res.scal; step = 2, xlabel ="Nb. LVs", ylabel = "RMSEP").f
u = findall(res.y1 .== minimum(res.y1))[1]
res[u, :] 
model = plskern(nlv = res.nlv[u], scal = res.scal[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

Plsr-Avg

nlv = [0:10, 0:20]
pars = mpar(nlv = nlv)
length(pars[1])
model = plsravg()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars).res 
u = findall(res.y1 .== minimum(res.y1))[1]
res[u, :] 
model = plsravg(nlv = res.nlv[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

Rr

lb = 10.0.^(-15:.1:3)
model = rr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, lb).res 
loglb = round.(log.(10, res.lb), digits = 3)
plotgrid(loglb, res.y1; step = 2, xlabel ="Lambda", ylabel = "RMSEP").f
u = findall(res.y1 .== minimum(res.y1))[1] 
res[u, :]
model = rr(lb = res.lb[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)
pars = mpar(scal = [false; true])
lb = 10.0.^(-15:.1:3)
model = rr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, lb).res
loglb = round.(log.(10, res.lb), digits = 3)
plotgrid(loglb, res.y1, res.scal; step = 2, xlabel ="lambda (log scale)", ylabel = "RMSEP-Val").f
u = findall(res.y1 .== minimum(res.y1))[1] 
res[u, :]
model = rr(lb = res.lb[u], scal = res.scal[u]) 
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

Covselr

nlv = 0:20
pars = mpar(meth = [:hard], nvar = [1])
model = splsr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, nlv).res
plotgrid(res.nlv, res.y1; step = 2, xlabel ="Lambda", ylabel = "RMSEP").f
u = findall(res.y1 .== minimum(res.y1))[1] 
res[u, :]
model = splsr(meth = res.meth[u], nvar = res.nvar[u], nlv = res.nlv[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

Krr

lb = 10.0.^(-15:5) 
gamma = 10.0.^(-3:5) 
pars = mpar(gamma = gamma) 
length(pars[1])
model = krr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, lb).res
u = findall(res.y1 .== minimum(res.y1))[1] 
res[u, :]
model = krr(lb = res.lb[u], gamma = res.gamma[u]) 
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

Kplsr

nlv = 0:30 
gamma = 10.0.^(-3:5)
pars = mpar(gamma = gamma)
length(pars[1])
model = kplsr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, nlv).res
u = findall(res.y1 .== minimum(res.y1))[1]
res[u, :]
model = kplsr(nlv = res.nlv[u], gamma = res.gamma[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

Dkplsr

nlv = 0:30 
gamma = 10.0.^(-3:5)
pars = mpar(gamma = gamma)
length(pars[1])
model = dkplsr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, nlv).res
u = findall(res.y1 .== minimum(res.y1))[1]
res[u, :]
model = dkplsr(nlv = res.nlv[u], gamma = res.gamma[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

kNN-Lwplsr

nlvdis = [10; 15]; metric = [:mah] 
h = [1; 2; 4; 6; Inf]
k = [30; 50; 100]  
nlv = 0:15 
pars = mpar(nlvdis = nlvdis, metric = metric, h = h, k = k)
length(pars[1])
model = lwplsr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, nlv).res 
group = string.(res.nlvdis, "-", res.h, "-", res.k) 
plotgrid(res.nlv, res.y1, group; xlabel ="Nb. LVs", ylabel = "RMSEP").f
u = findall(res.y1 .== minimum(res.y1))[1] 
res[u, :]
model = lwplsr(nlvdis = res.nlvdis[u], metric = res.metric[u], 
    h = res.h[u], k = res.k[u], nlv = res.nlv[u])
fit!(model, Xtrain, ytrain)
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

kNN-Lwplsr-Avg

nlvdis = [10; 15]; metric = [:mah] 
h = [1; 2; 5; Inf]; k = [30; 50; 100]  
nlv = [0:5, 0:10, 1:5, 1:10]
pars = mpar(nlv = nlv, nlvdis = nlvdis, metric = metric, h = h, k = k) 
length(pars[1])
model = lwplsravg() 
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars).res
u = findall(res.y1 .== minimum(res.y1))[1] 
res[u, :]
model = lwplsravg(nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], 
    k = res.k[u], nlv = res.nlv[u])
fit!(model, Xtrain, ytrain)
pred = predict(model, Xtest).pred
rmsep(pred, ytest)

Knnr

nlvdis = [15; 20]; metric = [:eucl, :mah] 
h = [1; 2; 4; 6; Inf] 
k = [1; collect(5:10:50)] 
pars = mpar(nlvdis = nlvdis, metric = metric, h = h, k = k) 
length(pars[1])
model = knnr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars).res
u = findall(res.y1 .== minimum(res.y1))[1]
res[u, :]
model = knnr(nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], 
    k = res.k[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

Rfr

n_trees = [200]
n_subfeatures = LinRange(10, p, 10)
max_depth = [6; 10; 20; 50; 2000]
model = rfr()
pars = mpar(n_trees = n_trees, n_subfeatures = n_subfeatures, max_depth = max_depth)
length(pars[1])
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars).res 
u = findall(res.y1 .== minimum(res.y1))[1]
res[u, :]
model = rfr(n_trees = res.n_trees[u], n_subfeatures = res.n_subfeatures[u], 
    max_depth = res.max_depth[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)

Svmr

kern = [:krbf]  # Gaussian kernel
cost = 10.0.^(-4:4)
epsilon = (.1, .25)
gamma = 10.0.^(-3:3)
pars = mpar(kern = kern, cost = cost, epsilon = epsilon, gamma = gamma)
length(pars[1])
model = svmr()
res = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars).res 
u = findall(res.y1 .== minimum(res.y1))[1]
res[u, :]
model = svmr(kern = res.kern[u], cost = res.cost[u], epsilon = res.epsilon[u], gamma = res.gamma[u])
fit!(model, Xtrain, ytrain) 
pred = predict(model, Xtest).pred 
rmsep(pred, ytest)