gridcv - tecator - kNN-Lwplsr

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/tecator.jld2") 
@load db dat
@names dat
(:X, :Y)
X = dat.X
@head X
... (178, 100)
3×100 DataFrame
Row8508528548568588608628648668688708728748768788808828848868888908928948968989009029049069089109129149169189209229249269289309329349369389409429449469489509529549569589609629649669689709729749769789809829849869889909929949969981000100210041006100810101012101410161018102010221024102610281030103210341036103810401042104410461048
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
12.617762.618142.618592.619122.619812.620712.621862.623342.625112.627222.629642.632452.635652.639332.643532.648252.65352.659372.665852.672812.680082.687332.694272.700732.706842.712812.719142.726282.734622.744162.754662.765682.776792.78792.799492.812252.827062.843562.861062.878572.894972.909242.920852.930152.938462.947712.960192.978313.003063.035063.074283.119633.168683.217713.262543.299883.328473.348993.363423.373793.381523.387413.391643.394183.39493.393663.390453.385413.378693.370413.360733.349793.337693.324433.310133.294873.278913.262323.245423.228283.21083.192873.174333.155033.134753.113393.091163.06853.045963.023933.002472.981452.960722.940132.919782.899662.879642.85962.83942.8192
22.834542.838712.842832.847052.851382.855872.86062.865662.870932.876612.882642.888982.895772.903082.910972.919532.928732.938632.949292.960722.972722.984932.99693.008333.01923.02993.041013.053453.067773.084163.102213.121063.139833.15813.176233.195193.215843.237473.258893.278353.293843.303623.306813.303933.2973.289253.284093.285053.293263.309233.332673.362513.396613.431883.464923.492953.514583.530043.540673.547973.553063.556753.559213.560453.560343.558763.555713.551323.545853.53953.532353.524423.515833.506683.4973.486833.476263.465523.455013.444813.434773.424653.414193.403033.390823.377313.362653.347453.332453.318183.304733.291863.279213.266553.253693.240453.226593.211813.1963.17942
32.582842.584582.586292.588082.589962.591922.594012.596272.598732.601312.604142.607142.610292.613612.617142.620892.624862.629092.633612.638352.64332.648382.653542.65872.663752.66882.673832.678922.684112.689372.69472.700122.705632.711412.717752.72492.733442.743272.754332.766422.779312.792722.806492.820642.835412.851212.868722.889052.912892.940882.973253.009463.04783.085543.119473.146963.166773.179383.186313.189243.18953.188013.184983.180393.174113.166113.156413.145123.132413.118433.103293.087143.070143.052373.033933.015042.995692.976122.956422.93662.916672.896552.876222.855632.834742.813612.792352.771132.750152.729562.709342.689512.670092.651122.632622.614612.597182.580342.564042.54816
Y = dat.Y 
@head Y
... (178, 4)
3×4 DataFrame
Rowwaterfatproteintyp
Float64Float64Float64String
160.522.516.7train
246.040.113.5train
371.08.420.5train
typ = Y.typ
tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32
wlst = names(X)
wl = parse.(Float64, wlst) 
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
100-element Vector{Float64}:
  850.0
  852.0
  854.0
  856.0
  858.0
  860.0
  862.0
  864.0
  866.0
  868.0
    ⋮
 1032.0
 1034.0
 1036.0
 1038.0
 1040.0
 1042.0
 1044.0
 1046.0
 1048.0

Preprocessing

model1 = snv()
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
@head Xp = transf(model, X) 
#plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)

Split Tot to Train/Test

s = typ .== "train"
Xtrain = Xp[s, :] 
Ytrain = Y[s, :]
Xtest = rmrow(Xp, s)
Ytest = rmrow(Y, s)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
ntot = ntrain + ntest
(ntot = ntot, ntrain, ntest)
(ntot = 178, ntrain = 115, ntest = 63)

Working response y

namy = names(Y)[1:3]
j = 2  
nam = namy[j]    # work on the j-th y-variable
ytrain = Ytrain[:, nam]
ytest = Ytest[:, nam]
63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5

CV-Segments for model tuning

Replicated K-fold CV

K = 3     # nb. folds (segments)
rep = 10   # nb. replications
segm = segmkf(ntrain, K; rep = rep)
10-element Vector{Vector{Vector{Int64}}}:
 [[1, 2, 3, 4, 5, 6, 13, 19, 20, 21  …  90, 91, 93, 99, 102, 104, 107, 109, 110, 115], [7, 8, 11, 15, 16, 32, 36, 37, 39, 40  …  84, 86, 92, 94, 95, 96, 100, 103, 108, 113], [9, 10, 12, 14, 17, 18, 22, 24, 26, 27  …  85, 87, 97, 98, 101, 105, 106, 111, 112, 114]]
 [[1, 4, 8, 15, 17, 19, 21, 23, 25, 26  …  87, 92, 93, 94, 99, 105, 107, 110, 112, 115], [2, 9, 10, 12, 13, 16, 20, 27, 29, 33  …  82, 83, 84, 96, 100, 102, 109, 111, 113, 114], [3, 5, 6, 7, 11, 14, 18, 22, 24, 28  …  90, 91, 95, 97, 98, 101, 103, 104, 106, 108]]
 [[2, 4, 9, 10, 11, 17, 21, 23, 25, 26  …  83, 84, 86, 89, 91, 102, 104, 106, 110, 115], [5, 6, 7, 13, 14, 18, 19, 22, 24, 30  …  96, 98, 99, 101, 103, 105, 107, 108, 112, 114], [1, 3, 8, 12, 15, 16, 20, 27, 28, 31  …  85, 90, 93, 94, 95, 97, 100, 109, 111, 113]]
 [[2, 4, 5, 7, 8, 10, 13, 15, 16, 17  …  95, 96, 97, 98, 100, 101, 103, 110, 111, 114], [11, 14, 18, 21, 22, 23, 25, 26, 31, 32  …  72, 77, 80, 82, 86, 99, 105, 108, 113, 115], [1, 3, 6, 9, 12, 19, 27, 30, 35, 44  …  89, 91, 92, 94, 102, 104, 106, 107, 109, 112]]
 [[5, 10, 11, 12, 13, 16, 21, 22, 24, 27  …  89, 93, 94, 98, 102, 103, 104, 106, 111, 114], [2, 7, 9, 18, 29, 39, 40, 42, 43, 46  …  84, 91, 92, 99, 105, 107, 109, 110, 113, 115], [1, 3, 4, 6, 8, 14, 15, 17, 19, 20  …  85, 88, 90, 95, 96, 97, 100, 101, 108, 112]]
 [[1, 6, 7, 9, 16, 17, 18, 22, 27, 30  …  91, 92, 95, 96, 100, 101, 103, 109, 111, 115], [2, 4, 5, 10, 11, 12, 13, 19, 21, 23  …  75, 87, 93, 97, 99, 104, 106, 107, 108, 110], [3, 8, 14, 15, 20, 25, 28, 29, 34, 36  …  85, 89, 90, 94, 98, 102, 105, 112, 113, 114]]
 [[2, 4, 11, 12, 17, 20, 21, 23, 25, 29  …  92, 93, 94, 95, 97, 98, 101, 102, 105, 115], [1, 7, 8, 14, 15, 16, 24, 28, 31, 34  …  86, 87, 91, 100, 103, 106, 107, 108, 110, 114], [3, 5, 6, 9, 10, 13, 18, 19, 22, 26  …  84, 85, 88, 96, 99, 104, 109, 111, 112, 113]]
 [[2, 8, 10, 12, 16, 17, 18, 21, 26, 29  …  87, 91, 93, 95, 98, 99, 101, 104, 111, 112], [4, 5, 9, 11, 13, 20, 27, 32, 35, 40  …  80, 83, 88, 92, 94, 96, 100, 102, 103, 110], [1, 3, 6, 7, 14, 15, 19, 22, 23, 24  …  90, 97, 105, 106, 107, 108, 109, 113, 114, 115]]
 [[2, 12, 14, 18, 19, 23, 25, 26, 27, 28  …  89, 92, 93, 104, 106, 107, 108, 109, 111, 114], [1, 3, 4, 8, 9, 11, 13, 15, 17, 24  …  90, 91, 95, 96, 99, 101, 102, 105, 112, 113], [5, 6, 7, 10, 16, 20, 21, 22, 30, 31  …  76, 79, 84, 94, 97, 98, 100, 103, 110, 115]]
 [[2, 15, 19, 22, 24, 25, 26, 27, 29, 33  …  91, 92, 97, 98, 99, 103, 105, 107, 113, 115], [3, 8, 9, 11, 12, 13, 14, 17, 20, 23  …  78, 84, 89, 93, 95, 100, 102, 104, 106, 110], [1, 4, 5, 6, 7, 10, 16, 18, 21, 32  …  87, 88, 94, 96, 101, 108, 109, 111, 112, 114]]

Grid-search

nlvdis = [5; 10; 15]; metric = [:mah]   # here only Mahalanobis distance is considered
h = [1; 2; 4; 6; Inf]; k = [30; 50; 100]  
nlv = 0:15
pars = mpar(nlvdis = nlvdis, metric = metric, h = h, k = k) 
length(pars[1])
45
model = lwplsr()
rescv = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, nlv)
@names rescv 
res = rescv.res
res_rep = rescv.res_rep
21600×8 DataFrame
21575 rows omitted
Rowrepsegmnlvdismetrichknlvy1
Int64Int64AnyAnyAnyAnyInt64Float64
1115mah1.03004.60253
2115mah1.03011.16489
3115mah1.03021.01512
4115mah1.03030.901077
5115mah1.03040.750975
6115mah1.03050.947597
7115mah1.03061.05298
8115mah1.03071.11293
9115mah1.03081.3138
10115mah1.03091.34155
11115mah1.030101.22738
12115mah1.030111.27187
13115mah1.030121.8322
2158910315mahInf10042.12622
2159010315mahInf10052.07438
2159110315mahInf10062.10981
2159210315mahInf10072.40788
2159310315mahInf10082.43489
2159410315mahInf10092.32062
2159510315mahInf100102.59221
2159610315mahInf100112.56291
2159710315mahInf100122.71224
2159810315mahInf100133.1393
2159910315mahInf100143.08589
2160010315mahInf100153.09054
group = string.("nvldis=", res.nlvdis, " h=", res.h, " k=", res.k)
plotgrid(res.nlv, res.y1, group; step = 2, xlabel ="Nb. LVs", ylabel = "RMSEP-Val").f

Selection of the best parameter combination

u = findall(res.y1 .== minimum(res.y1))[1] 
res[u, :]
DataFrameRow (6 columns)
Rownlvnlvdismetrichky1
Int64AnyAnyAnyAnyFloat64
325mah1.0301.18309

Final prediction (Test) using the optimal model

model = lwplsr(nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u])
fit!(model, Xtrain, ytrain)
pred = predict(model, Xtest).pred
63×1 Matrix{Float64}:
 29.233919470654392
  1.3973931780971185
  4.794708197493841
 11.214014955076081
 15.40594497215125
 20.431938204221815
 27.011452750213174
 47.98443636636166
  5.256365240224742
  1.869392428285745
  ⋮
 17.70770463351476
 19.115522708954746
 21.597608245182037
 27.614157967846275
 27.667973750722332
 29.66398089648304
 33.922975891981565
 36.54196651755939
 41.810372978453216

Generalization error

rmsep(pred, ytest)
1×1 Matrix{Float64}:
 1.0303218875423956

Plotting predictions vs. observed data

plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = "Test set - variable $nam", 
    xlabel = "Prediction", ylabel = "Observed").f