gridcv - forages2 - Pls-Qda

using Jchemo, JchemoData
using JLD2, CairoMakie
using FreqTables

Data importation

# Root directory of the installed JchemoData package (two levels above the
# package's main source file), then the path to the forages2 dataset inside it.
path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/forages2.jld2") 
# Load the object `dat` stored in the JLD2 file into the current scope.
@load db dat
# List the components of `dat`.
@names dat
(:X, :Y)
# Predictor table: one row per observation, one column per wavelength.
X = dat.X 
# Preview of the first rows.
@head X
... (485, 700)
3×700 DataFrame
600 columns omitted
Row1100110211041106110811101112111411161118112011221124112611281130113211341136113811401142114411461148115011521154115611581160116211641166116811701172117411761178118011821184118611881190119211941196119812001202120412061208121012121214121612181220122212241226122812301232123412361238124012421244124612481250125212541256125812601262126412661268127012721274127612781280128212841286128812901292129412961298
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
1-0.000231591-0.000175945-8.48176e-52.05217e-50.0001100940.0001617570.0001549530.0001637540.0001876020.000214990.0002424790.0002654980.0002821410.0002814420.0002710250.0002610750.0002572840.0002521770.000242930.0002282950.0002190970.0002141360.0002156120.0002189820.0002280040.0002360810.0002360170.0002203270.0001870960.0001371387.68593e-51.13679e-5-5.00951e-5-9.54664e-5-0.000119199-0.000131897-0.000142349-0.000161489-0.00019387-0.000244808-0.000303259-0.000366904-0.000416738-0.000451535-0.00046995-0.000478637-0.000477348-0.000478142-0.000476719-0.000479701-0.000482037-0.000496769-0.000511959-0.000532094-0.000542661-0.000540188-0.000512715-0.00045798-0.000370395-0.000256256-0.0001269071.13716e-60.0001190470.0002127450.0002756850.0003078630.0003135470.0002969770.0002696610.0002478180.0002339440.0002287730.0002245670.0002212560.0002188930.0002177410.0002101440.000196640.0001819490.0001697740.0001516910.000123859.23378e-55.9959e-52.58352e-5-4.77314e-6-3.21835e-5-5.53154e-5-6.71707e-5-6.54166e-5-5.16448e-5-2.43366e-51.12255e-54.68917e-57.773e-50.0001067850.0001331730.0001536070.0001685180.000182591
2-9.66352e-5-3.30928e-55.64966e-50.0001541350.0002377250.0002957890.0003195870.0003574050.0004046110.0004479960.0004797860.0004883390.0004659290.0004023010.0003136480.0002202260.0001384837.35084e-53.50018e-52.83293e-56.05478e-50.0001182720.0001877260.0002498420.000296970.0003150620.0002988280.0002516430.0001870550.0001182435.60849e-53.8727e-6-3.28778e-5-4.84688e-5-4.38912e-5-3.34954e-5-2.72637e-5-3.65483e-5-6.62949e-5-0.000121833-0.000193587-0.000280244-0.000362132-0.000434981-0.000494461-0.000546531-0.000590606-0.000638514-0.000684688-0.000734688-0.000783664-0.000842714-0.000892596-0.000930301-0.000938118-0.000913585-0.000846217-0.000737781-0.000588122-0.000410395-0.000220611-3.69382e-50.0001310720.0002660780.0003583770.0004086840.0004245280.0004121470.0003838960.0003579570.0003383850.0003267490.0003155720.000305420.0002936710.0002800050.0002594820.0002336970.00020440.0001771990.0001479890.0001123257.33317e-53.48779e-5-2.5229e-6-3.27922e-5-5.52233e-5-7.06412e-5-7.49675e-5-6.44041e-5-4.04393e-5-6.50489e-63.09196e-56.87358e-50.0001052020.0001423130.0001771820.0002066520.0002307880.000253703
3-0.000131769-7.8398e-57.92223e-78.90044e-50.0001600220.0001984350.0001965980.0002122250.0002411090.0002712350.0003010450.0003249210.0003376190.0003258570.000299790.0002771670.000270180.000271650.0002776060.0002877220.0003082030.0003248470.0003285730.0003108060.000277280.0002268980.0001604748.30948e-57.98825e-6-5.32827e-5-9.57157e-5-0.000123438-0.0001371-0.000134382-0.00011527-9.07963e-5-6.97458e-5-6.29138e-5-7.14491e-5-9.85941e-5-0.000137562-0.000192678-0.000248177-0.000303993-0.000356125-0.000407616-0.0004553-0.000507819-0.000555473-0.000603436-0.000647099-0.000701763-0.000754429-0.000806879-0.000838493-0.000842167-0.000803445-0.000720829-0.000592138-0.000428566-0.000245567-6.43964e-50.0001011930.0002322420.0003221330.0003736050.0003918170.0003793320.0003478290.0003164950.0002922360.0002784310.0002646210.0002503050.0002393870.0002345040.0002246330.0002056840.0001804080.0001576150.0001351080.0001068717.3258e-53.90321e-57.34127e-6-1.78231e-5-3.94282e-5-5.6427e-5-6.15935e-5-5.19038e-5-2.96367e-53.09722e-63.98752e-57.62892e-50.0001082710.0001376320.0001656240.0001911820.0002115860.000229586
# Companion table holding the columns dm, ndf, typ (class) and test (0/1 flag).
Y = dat.Y
# Preview of the first rows.
@head Y
... (485, 4)
3×4 DataFrame
Rowdmndftyptest
Float64?Float64?StringInt64
192.2337.58Legume forages1
293.2649.6462Legume forages0
392.963.2939Forage trees0
y = Y.typ   # response variable (class membership)
# 0/1 flag: observations with test = 1 are used as the Test set further below.
test = Y.test
# Number of observations per class.
tab(y)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "Cereal and grass forages" => 160
  "Forage trees"             => 101
  "Legume forages"           => 224
# Cross-tabulation of class (rows) by test flag (columns).
freqtable(y, test)
3×2 Named Matrix{Int64}
             Dim1 ╲ Dim2 │   0    1
─────────────────────────┼─────────
Cereal and grass forages │ 100   60
Forage trees             │  56   45
Legume forages           │ 167   57
# The column names of X are the wavelengths stored as strings; convert them
# to integers so that they can be used as a numeric axis (e.g. for plotting).
wlst = names(X)
wl = parse.(Int, wlst)
# Uncomment to plot the spectra:
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
700-element Vector{Int64}:
 1100
 1102
 1104
 1106
 1108
 1110
 1112
 1114
 1116
 1118
    ⋮
 2482
 2484
 2486
 2488
 2490
 2492
 2494
 2496
 2498

Note: X-data are already preprocessed (SNV + Savitzky-Golay 2nd derivative).

Split Tot to Train/Test

The model is fitted on Train, and the generalization error is estimated on Test. In this example, Test is already defined by variable test of the dataset (test = 1), and Train is defined by the remaining samples. But Tot could also be split a posteriori, for instance by sampling (random, systematic or any other design). See for instance functions samprand, sampsys, etc.

# Boolean mask: true for the observations flagged as Test (test == 1).
s = Bool.(test)
# Train = all rows except those selected by the mask.
Xtrain = rmrow(X, s)
ytrain = rmrow(y, s)
# Test = rows selected by the mask.
Xtest = X[s, :]
ytest = y[s]
# Sample sizes, displayed as a named tuple to check the split.
ntot = nro(X)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
(ntot = ntot, ntrain, ntest)
(ntot = 485, ntrain = 323, ntest = 162)
# Class counts in Train.
tab(ytrain)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "Cereal and grass forages" => 100
  "Forage trees"             => 56
  "Legume forages"           => 167
# Class counts in Test.
tab(ytest)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "Cereal and grass forages" => 60
  "Forage trees"             => 45
  "Legume forages"           => 57

Replicated K-fold CV

K = 3     # nb. folds (segments)
rep = 25  # nb. replications
# Build the CV segments over the ntrain training observations: the result has
# one element per replication, each holding K vectors of row indices
# (one vector per fold).
segm = segmkf(ntrain, K; rep = rep)
25-element Vector{Vector{Vector{Int64}}}:
 [[1, 11, 16, 17, 24, 27, 28, 32, 34, 36  …  303, 308, 309, 311, 313, 315, 316, 318, 319, 323], [5, 7, 8, 9, 13, 15, 18, 19, 20, 21  …  280, 281, 282, 290, 291, 304, 305, 306, 307, 321], [2, 3, 4, 6, 10, 12, 14, 25, 30, 31  …  297, 299, 300, 301, 310, 312, 314, 317, 320, 322]]
 [[1, 3, 4, 5, 6, 10, 11, 13, 15, 23  …  286, 288, 290, 293, 294, 295, 296, 303, 313, 314], [7, 16, 17, 18, 19, 20, 21, 24, 30, 31  …  298, 299, 300, 304, 312, 315, 317, 319, 322, 323], [2, 8, 9, 12, 14, 22, 25, 26, 27, 29  …  306, 307, 308, 309, 310, 311, 316, 318, 320, 321]]
 [[1, 2, 4, 5, 7, 9, 10, 13, 22, 25  …  298, 299, 302, 303, 306, 314, 317, 318, 322, 323], [3, 11, 16, 17, 18, 19, 20, 21, 23, 27  …  293, 300, 301, 304, 307, 309, 312, 315, 320, 321], [6, 8, 12, 14, 15, 24, 28, 29, 34, 36  …  295, 296, 297, 305, 308, 310, 311, 313, 316, 319]]
 [[5, 6, 7, 8, 14, 15, 24, 27, 28, 30  …  283, 284, 293, 306, 308, 311, 317, 319, 322, 323], [2, 3, 9, 13, 20, 21, 26, 29, 33, 34  …  298, 302, 303, 307, 309, 313, 315, 316, 318, 320], [1, 4, 10, 11, 12, 16, 17, 18, 19, 22  …  295, 299, 300, 301, 304, 305, 310, 312, 314, 321]]
 [[3, 5, 6, 9, 12, 13, 18, 20, 24, 27  …  294, 300, 304, 306, 312, 313, 314, 315, 317, 321], [1, 2, 4, 7, 8, 11, 15, 22, 23, 29  …  295, 298, 301, 303, 310, 311, 318, 319, 320, 322], [10, 14, 16, 17, 19, 21, 25, 26, 28, 30  …  296, 297, 299, 302, 305, 307, 308, 309, 316, 323]]
 [[7, 14, 15, 16, 18, 20, 25, 28, 32, 35  …  297, 300, 301, 308, 311, 312, 314, 315, 317, 322], [5, 6, 8, 10, 13, 17, 19, 22, 23, 24  …  295, 296, 298, 299, 305, 306, 309, 310, 313, 323], [1, 2, 3, 4, 9, 11, 12, 21, 27, 29  …  293, 302, 303, 304, 307, 316, 318, 319, 320, 321]]
 [[1, 4, 6, 11, 15, 18, 26, 29, 39, 42  …  295, 297, 301, 306, 307, 310, 311, 318, 320, 322], [2, 7, 12, 13, 17, 23, 25, 30, 31, 33  …  284, 288, 289, 293, 300, 302, 305, 312, 319, 321], [3, 5, 8, 9, 10, 14, 16, 19, 20, 21  …  303, 304, 308, 309, 313, 314, 315, 316, 317, 323]]
 [[1, 15, 23, 25, 26, 27, 30, 31, 34, 36  …  301, 303, 307, 309, 313, 314, 318, 320, 321, 322], [2, 5, 7, 9, 12, 13, 17, 18, 19, 21  …  292, 294, 298, 299, 310, 311, 312, 315, 319, 323], [3, 4, 6, 8, 10, 11, 14, 16, 20, 24  …  291, 293, 295, 302, 304, 305, 306, 308, 316, 317]]
 [[2, 7, 11, 14, 17, 21, 26, 31, 35, 37  …  300, 303, 304, 308, 309, 311, 312, 314, 315, 318], [4, 6, 9, 10, 15, 16, 20, 25, 28, 29  …  295, 299, 302, 310, 313, 316, 319, 321, 322, 323], [1, 3, 5, 8, 12, 13, 18, 19, 22, 23  …  275, 287, 296, 297, 301, 305, 306, 307, 317, 320]]
 [[4, 6, 13, 16, 18, 26, 33, 34, 35, 36  …  273, 275, 279, 282, 293, 298, 299, 301, 319, 323], [3, 5, 7, 15, 19, 20, 24, 27, 29, 30  …  304, 306, 307, 309, 310, 313, 314, 315, 318, 321], [1, 2, 8, 9, 10, 11, 12, 14, 17, 21  …  302, 303, 305, 308, 311, 312, 316, 317, 320, 322]]
 ⋮
 [[1, 2, 5, 6, 7, 9, 11, 12, 13, 15  …  297, 298, 301, 305, 308, 315, 316, 319, 321, 322], [3, 8, 14, 16, 17, 18, 19, 22, 23, 24  …  295, 300, 306, 310, 311, 312, 313, 318, 320, 323], [4, 10, 25, 26, 27, 32, 33, 35, 41, 43  …  294, 296, 299, 302, 303, 304, 307, 309, 314, 317]]
 [[2, 3, 6, 8, 12, 18, 19, 27, 44, 46  …  290, 297, 299, 302, 305, 306, 307, 313, 315, 320], [4, 5, 7, 10, 11, 13, 15, 17, 21, 24  …  300, 301, 304, 308, 312, 314, 317, 318, 321, 323], [1, 9, 14, 16, 20, 22, 23, 25, 26, 28  …  292, 293, 298, 303, 309, 310, 311, 316, 319, 322]]
 [[1, 2, 5, 14, 17, 19, 21, 22, 24, 27  …  290, 292, 295, 297, 305, 309, 311, 314, 316, 317], [3, 4, 10, 13, 16, 18, 20, 23, 25, 31  …  303, 304, 306, 310, 312, 313, 315, 318, 320, 322], [6, 7, 8, 9, 11, 12, 15, 26, 28, 29  …  289, 291, 294, 300, 302, 307, 308, 319, 321, 323]]
 [[1, 13, 19, 21, 24, 25, 27, 30, 38, 39  …  292, 294, 296, 297, 299, 312, 314, 315, 321, 322], [9, 10, 12, 15, 18, 20, 28, 32, 36, 37  …  300, 301, 302, 306, 307, 309, 310, 318, 319, 320], [2, 3, 4, 5, 6, 7, 8, 11, 14, 16  …  295, 303, 304, 305, 308, 311, 313, 316, 317, 323]]
 [[6, 7, 11, 13, 14, 15, 16, 18, 22, 23  …  310, 312, 314, 317, 318, 319, 320, 321, 322, 323], [2, 3, 4, 8, 9, 10, 17, 19, 21, 30  …  293, 295, 296, 297, 300, 305, 308, 313, 315, 316], [1, 5, 12, 20, 24, 27, 29, 33, 43, 45  …  280, 281, 282, 287, 289, 290, 301, 302, 304, 311]]
 [[3, 6, 10, 12, 18, 19, 21, 26, 28, 30  …  289, 290, 295, 296, 299, 301, 310, 314, 316, 321], [1, 2, 5, 9, 13, 25, 32, 39, 42, 48  …  303, 304, 305, 306, 308, 309, 311, 312, 313, 323], [4, 7, 8, 11, 14, 15, 16, 17, 20, 22  …  283, 287, 298, 307, 315, 317, 318, 319, 320, 322]]
 [[2, 3, 8, 10, 20, 23, 27, 31, 32, 36  …  292, 300, 304, 306, 309, 310, 313, 316, 321, 322], [5, 7, 9, 11, 14, 18, 28, 30, 33, 38  …  298, 301, 305, 307, 308, 312, 314, 317, 319, 320], [1, 4, 6, 12, 13, 15, 16, 17, 19, 21  …  281, 285, 297, 299, 302, 303, 311, 315, 318, 323]]
 [[2, 3, 5, 6, 9, 11, 12, 17, 19, 20  …  297, 298, 299, 302, 311, 312, 317, 319, 320, 321], [1, 8, 14, 18, 21, 25, 29, 30, 31, 32  …  301, 303, 305, 306, 307, 308, 313, 316, 322, 323], [4, 7, 10, 13, 15, 16, 22, 24, 28, 34  …  286, 288, 294, 295, 304, 309, 310, 314, 315, 318]]
 [[3, 4, 7, 11, 14, 15, 16, 19, 25, 28  …  288, 290, 292, 294, 296, 313, 315, 317, 319, 321], [6, 8, 9, 12, 13, 21, 22, 27, 31, 35  …  303, 304, 306, 307, 308, 312, 316, 320, 322, 323], [1, 2, 5, 10, 17, 18, 20, 23, 24, 26  …  286, 299, 301, 302, 305, 309, 310, 311, 314, 318]]
# Candidate values of the hyper-parameters explored by the grid search.
prior = [:unif]
alpha = [0, .25, .5, .75, 1]  # continuum parameter: from alpha = 0 (PLS-QDA) to alpha = 1 (PLS-LDA)
nlv = 1:20    # here must be > 0
# Combinations of `prior` and `alpha`; `nlv` is passed to gridcv separately.
pars = mpar(prior = prior, alpha = alpha)
model = plsqda()
# Evaluate every (nlv, prior, alpha) combination on Train with the CV segments,
# scoring with `merrp`. `res` has one row per combination and the CV score
# in column y1.
res = gridcv(model, Xtrain, ytrain; segm, score = merrp, pars, nlv).res
100×4 DataFrame
75 rows omitted
Rownlvprioralphay1
Int64AnyAnyFloat64
11unif0.00.230875
22unif0.00.20929
33unif0.00.201239
44unif0.00.161668
55unif0.00.145473
66unif0.00.120697
77unif0.00.107131
88unif0.00.0973886
99unif0.00.097578
1010unif0.00.10514
1111unif0.00.108698
1212unif0.00.11286
1313unif0.00.11944
899unif1.00.120126
9010unif1.00.122039
9111unif1.00.118398
9212unif1.00.119923
9313unif1.00.12054
9414unif1.00.121795
9515unif1.00.121102
9616unif1.00.123126
9717unif1.00.12237
9818unif1.00.122119
9919unif1.00.120711
10020unif1.00.120156
# CV error against the number of LVs, grouped by the value of alpha
# (legend title "Continuum"); `.f` extracts the figure for display.
plotgrid(res.nlv, res.y1, res.alpha; step = 2, xlabel = "Nb. LVs", ylabel = "ERRP-CV", leg_title = "Continuum").f

Selection of the best parameter combination

# Row index of the smallest CV score. `argmin` returns the first minimizer, so
# ties are resolved in favour of the earliest row of the grid; it does this in
# a single pass, without building an intermediate mask and index vector.
u = argmin(res.y1)
# Parameter combination (and its CV score) selected for the final model.
res[u, :]
DataFrameRow (4 columns)
Rownlvprioralphay1
Int64AnyAnyFloat64
288unif0.250.096919

Final prediction (Test) using the optimal model

# Define the model with the parameter combination stored in row u of `res`,
# fit it on the whole Train set, then predict the class of the Test observations.
model = plsqda(prior = res.prior[u], alpha = res.alpha[u], nlv = res.nlv[u])
fit!(model, Xtrain, ytrain)
pred = predict(model, Xtest).pred
162×1 Matrix{String}:
 "Legume forages"
 "Cereal and grass forages"
 "Cereal and grass forages"
 "Legume forages"
 "Cereal and grass forages"
 "Cereal and grass forages"
 "Legume forages"
 "Forage trees"
 "Forage trees"
 "Legume forages"
 ⋮
 "Cereal and grass forages"
 "Cereal and grass forages"
 "Forage trees"
 "Forage trees"
 "Legume forages"
 "Legume forages"
 "Legume forages"
 "Cereal and grass forages"
 "Legume forages"

Generalization error

# Overall classification error rate on Test: proportion of misclassified
# observations, all classes pooled.
errp(pred, ytest)
1×1 Matrix{Float64}:
 0.11728395061728394
# Mean of the per-class error rates on Test: each class has the same weight,
# whatever its size (this is the score that was used in the grid search).
merrp(pred, ytest)
1×1 Matrix{Float64}:
 0.11744639376218323
# Confusion-matrix results comparing the predictions with the observed Test classes.
cf = conf(pred, ytest)
# List the components of the result.
@names cf
(:cnt, :pct, :A, :Apct, :diagpct, :accpct, :lev)
# Counts: rows = observed class, columns = predicted class.
cf.cnt
3×4 DataFrame
Rowypred_Cereal and grass foragespred_Forage treespred_Legume forages
StringInt64Int64Int64
1Cereal and grass forages5523
2Forage trees0405
3Legume forages3648
# Same table expressed as row percentages (each row sums to 100).
cf.pct
3×4 DataFrame
Rowlevelspred_Cereal and grass foragespred_Forage treespred_Legume forages
StringFloat64Float64Float64
1Cereal and grass forages91.73.35.0
2Forage trees0.088.911.1
3Legume forages5.310.584.2