gridcv - tecator - kNN-Lwplsr

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/tecator.jld2") 
@load db dat
@names dat
(:X, :Y)
X = dat.X
@head X
... (178, 100)
3×100 DataFrame
Row8508528548568588608628648668688708728748768788808828848868888908928948968989009029049069089109129149169189209229249269289309329349369389409429449469489509529549569589609629649669689709729749769789809829849869889909929949969981000100210041006100810101012101410161018102010221024102610281030103210341036103810401042104410461048
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
12.617762.618142.618592.619122.619812.620712.621862.623342.625112.627222.629642.632452.635652.639332.643532.648252.65352.659372.665852.672812.680082.687332.694272.700732.706842.712812.719142.726282.734622.744162.754662.765682.776792.78792.799492.812252.827062.843562.861062.878572.894972.909242.920852.930152.938462.947712.960192.978313.003063.035063.074283.119633.168683.217713.262543.299883.328473.348993.363423.373793.381523.387413.391643.394183.39493.393663.390453.385413.378693.370413.360733.349793.337693.324433.310133.294873.278913.262323.245423.228283.21083.192873.174333.155033.134753.113393.091163.06853.045963.023933.002472.981452.960722.940132.919782.899662.879642.85962.83942.8192
22.834542.838712.842832.847052.851382.855872.86062.865662.870932.876612.882642.888982.895772.903082.910972.919532.928732.938632.949292.960722.972722.984932.99693.008333.01923.02993.041013.053453.067773.084163.102213.121063.139833.15813.176233.195193.215843.237473.258893.278353.293843.303623.306813.303933.2973.289253.284093.285053.293263.309233.332673.362513.396613.431883.464923.492953.514583.530043.540673.547973.553063.556753.559213.560453.560343.558763.555713.551323.545853.53953.532353.524423.515833.506683.4973.486833.476263.465523.455013.444813.434773.424653.414193.403033.390823.377313.362653.347453.332453.318183.304733.291863.279213.266553.253693.240453.226593.211813.1963.17942
32.582842.584582.586292.588082.589962.591922.594012.596272.598732.601312.604142.607142.610292.613612.617142.620892.624862.629092.633612.638352.64332.648382.653542.65872.663752.66882.673832.678922.684112.689372.69472.700122.705632.711412.717752.72492.733442.743272.754332.766422.779312.792722.806492.820642.835412.851212.868722.889052.912892.940882.973253.009463.04783.085543.119473.146963.166773.179383.186313.189243.18953.188013.184983.180393.174113.166113.156413.145123.132413.118433.103293.087143.070143.052373.033933.015042.995692.976122.956422.93662.916672.896552.876222.855632.834742.813612.792352.771132.750152.729562.709342.689512.670092.651122.632622.614612.597182.580342.564042.54816
Y = dat.Y 
@head Y
... (178, 4)
3×4 DataFrame
Rowwaterfatproteintyp
Float64Float64Float64String
160.522.516.7train
246.040.113.5train
371.08.420.5train
typ = Y.typ
tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32
wlst = names(X)
wl = parse.(Float64, wlst) 
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
100-element Vector{Float64}:
  850.0
  852.0
  854.0
  856.0
  858.0
  860.0
  862.0
  864.0
  866.0
  868.0
    ⋮
 1032.0
 1034.0
 1036.0
 1038.0
 1040.0
 1042.0
 1044.0
 1046.0
 1048.0

Preprocessing

model1 = snv()
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
@head Xp = transf(model, X) 
#plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)

Split Tot to Train/Test

s = typ .== "train"
Xtrain = Xp[s, :] 
Ytrain = Y[s, :]
Xtest = rmrow(Xp, s)
Ytest = rmrow(Y, s)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
ntot = ntrain + ntest
(ntot = ntot, ntrain, ntest)
(ntot = 178, ntrain = 115, ntest = 63)

Working response y

namy = names(Y)[1:3]
j = 2  
nam = namy[j]    # work on the j-th y-variable
ytrain = Ytrain[:, nam]
ytest = Ytest[:, nam]
63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5

CV-Segments for model tuning

Replicated K-fold CV

K = 3     # nb. folds (segments)
rep = 10   # nb. replications
segm = segmkf(ntrain, K; rep = rep)
10-element Vector{Vector{Vector{Int64}}}:
 [[2, 6, 7, 9, 16, 18, 19, 21, 30, 32  …  94, 96, 98, 99, 100, 107, 110, 111, 113, 114], [1, 4, 11, 17, 26, 28, 29, 37, 40, 41  …  93, 95, 101, 102, 104, 105, 106, 109, 112, 115], [3, 5, 8, 10, 12, 13, 14, 15, 20, 22  …  61, 71, 74, 76, 80, 81, 92, 97, 103, 108]]
 [[2, 3, 9, 10, 15, 18, 21, 23, 24, 25  …  88, 93, 94, 96, 98, 102, 104, 112, 114, 115], [7, 8, 12, 13, 14, 16, 26, 27, 29, 30  …  99, 100, 101, 103, 105, 106, 107, 109, 110, 113], [1, 4, 5, 6, 11, 17, 19, 20, 22, 32  …  79, 81, 83, 84, 85, 90, 91, 95, 108, 111]]
 [[2, 6, 11, 16, 18, 20, 22, 24, 25, 29  …  90, 97, 101, 103, 106, 108, 109, 110, 111, 113], [5, 8, 12, 13, 15, 21, 23, 26, 27, 35  …  82, 91, 94, 95, 98, 99, 102, 104, 114, 115], [1, 3, 4, 7, 9, 10, 14, 17, 19, 28  …  87, 88, 89, 92, 93, 96, 100, 105, 107, 112]]
 [[1, 5, 7, 10, 18, 21, 22, 23, 26, 29  …  86, 87, 94, 95, 97, 103, 106, 109, 111, 112], [2, 4, 8, 11, 12, 13, 15, 17, 19, 24  …  82, 89, 91, 92, 98, 99, 100, 107, 113, 114], [3, 6, 9, 14, 16, 20, 25, 28, 33, 34  …  90, 93, 96, 101, 102, 104, 105, 108, 110, 115]]
 [[3, 7, 13, 16, 21, 23, 26, 28, 31, 34  …  85, 86, 93, 98, 101, 104, 105, 106, 107, 114], [1, 2, 4, 5, 6, 8, 9, 11, 12, 14  …  83, 88, 91, 92, 96, 108, 109, 110, 111, 113], [10, 15, 17, 19, 20, 24, 29, 30, 33, 36  …  90, 94, 95, 97, 99, 100, 102, 103, 112, 115]]
 [[4, 5, 7, 14, 15, 16, 18, 19, 22, 23  …  83, 84, 89, 99, 100, 104, 105, 106, 107, 109], [1, 6, 9, 10, 11, 17, 28, 31, 33, 35  …  88, 90, 93, 94, 96, 97, 101, 102, 110, 112], [2, 3, 8, 12, 13, 20, 21, 24, 25, 26  …  91, 92, 95, 98, 103, 108, 111, 113, 114, 115]]
 [[2, 7, 15, 16, 20, 22, 23, 27, 29, 30  …  91, 92, 94, 95, 98, 101, 102, 106, 108, 115], [4, 11, 13, 14, 18, 19, 21, 24, 32, 33  …  84, 86, 96, 97, 103, 107, 110, 111, 112, 114], [1, 3, 5, 6, 8, 9, 10, 12, 17, 25  …  87, 89, 90, 93, 99, 100, 104, 105, 109, 113]]
 [[2, 7, 10, 12, 14, 16, 22, 24, 29, 30  …  96, 99, 100, 101, 104, 105, 106, 111, 112, 115], [1, 3, 4, 5, 11, 13, 15, 18, 21, 26  …  91, 92, 93, 94, 95, 97, 102, 103, 110, 113], [6, 8, 9, 17, 19, 20, 23, 25, 27, 28  …  77, 78, 84, 88, 89, 98, 107, 108, 109, 114]]
 [[2, 3, 6, 8, 13, 15, 16, 20, 24, 25  …  81, 90, 91, 92, 94, 104, 106, 109, 110, 115], [1, 4, 9, 10, 11, 12, 14, 17, 28, 30  …  95, 98, 99, 100, 102, 105, 111, 112, 113, 114], [5, 7, 18, 19, 21, 22, 23, 26, 31, 32  …  80, 85, 86, 87, 96, 97, 101, 103, 107, 108]]
 [[4, 8, 10, 14, 22, 24, 30, 33, 36, 37  …  95, 96, 98, 100, 104, 107, 108, 110, 112, 113], [3, 5, 6, 9, 12, 13, 17, 18, 19, 20  …  73, 78, 85, 88, 92, 93, 97, 103, 114, 115], [1, 2, 7, 11, 15, 16, 21, 23, 28, 29  …  86, 87, 91, 99, 101, 102, 105, 106, 109, 111]]

Grid-search

nlvdis = [5; 10; 15]; metric = [:mah]   # here only Mahalanobis distance is considered
h = [1; 2; 4; 6; Inf]; k = [30; 50; 100]  
nlv = 0:15
pars = mpar(nlvdis = nlvdis, metric = metric, h = h, k = k) 
length(pars[1])
45
model = lwplsr()
rescv = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, nlv)
@names rescv 
res = rescv.res
res_rep = rescv.res_rep
21600×8 DataFrame
21575 rows omitted
Rowrepsegmnlvdismetrichknlvy1
Int64Int64AnyAnyAnyAnyInt64Float64
1115mah1.03004.55673
2115mah1.03011.1839
3115mah1.03021.29055
4115mah1.03031.1621
5115mah1.03041.5221
6115mah1.03051.63031
7115mah1.03061.60902
8115mah1.03073.21498
9115mah1.03083.7293
10115mah1.03094.2155
11115mah1.030104.30623
12115mah1.030114.048
13115mah1.030123.60532
2158910315mahInf10042.23052
2159010315mahInf10052.16522
2159110315mahInf10062.11402
2159210315mahInf10072.05778
2159310315mahInf10082.01392
2159410315mahInf10091.81874
2159510315mahInf100101.80617
2159610315mahInf100111.83318
2159710315mahInf100121.90184
2159810315mahInf100132.08162
2159910315mahInf100142.05006
2160010315mahInf100152.00492
group = string.("nvldis=", res.nlvdis, " h=", res.h, " k=", res.k)
plotgrid(res.nlv, res.y1, group; step = 2, xlabel ="Nb. LVs", ylabel = "RMSEP-Val").f

Selection of the best parameter combination

u = findall(res.y1 .== minimum(res.y1))[1] 
res[u, :]
DataFrameRow (6 columns)
Rownlvnlvdismetrichky1
Int64AnyAnyAnyAnyFloat64
325mah1.0301.15901

Final prediction (Test) using the optimal model

model = lwplsr(nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], 
    k = res.k[u], nlv = res.nlv[u])
fit!(model, Xtrain, ytrain)
pred = predict(model, Xtest).pred
63×1 Matrix{Float64}:
 29.233919470654392
  1.3973931780971185
  4.794708197493841
 11.214014955076081
 15.40594497215125
 20.431938204221815
 27.011452750213174
 47.98443636636166
  5.256365240224742
  1.869392428285745
  ⋮
 17.70770463351476
 19.115522708954746
 21.597608245182037
 27.614157967846275
 27.667973750722332
 29.66398089648304
 33.922975891981565
 36.54196651755939
 41.810372978453216

Generalization error

rmsep(pred, ytest)
1×1 Matrix{Float64}:
 1.0303218875423956

Plotting predictions vs. observed data

plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Prediction", ylabel = "Observed").f