Fda - forages2

using Jchemo, JchemoData
using JLD2, CairoMakie
using FreqTables

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/forages2.jld2") 
@load db dat
@names dat
(:X, :Y)
X = dat.X
@head X
... (485, 700)
3×700 DataFrame
600 columns omitted
Row1100110211041106110811101112111411161118112011221124112611281130113211341136113811401142114411461148115011521154115611581160116211641166116811701172117411761178118011821184118611881190119211941196119812001202120412061208121012121214121612181220122212241226122812301232123412361238124012421244124612481250125212541256125812601262126412661268127012721274127612781280128212841286128812901292129412961298
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
1-0.000231591-0.000175945-8.48176e-52.05217e-50.0001100940.0001617570.0001549530.0001637540.0001876020.000214990.0002424790.0002654980.0002821410.0002814420.0002710250.0002610750.0002572840.0002521770.000242930.0002282950.0002190970.0002141360.0002156120.0002189820.0002280040.0002360810.0002360170.0002203270.0001870960.0001371387.68593e-51.13679e-5-5.00951e-5-9.54664e-5-0.000119199-0.000131897-0.000142349-0.000161489-0.00019387-0.000244808-0.000303259-0.000366904-0.000416738-0.000451535-0.00046995-0.000478637-0.000477348-0.000478142-0.000476719-0.000479701-0.000482037-0.000496769-0.000511959-0.000532094-0.000542661-0.000540188-0.000512715-0.00045798-0.000370395-0.000256256-0.0001269071.13716e-60.0001190470.0002127450.0002756850.0003078630.0003135470.0002969770.0002696610.0002478180.0002339440.0002287730.0002245670.0002212560.0002188930.0002177410.0002101440.000196640.0001819490.0001697740.0001516910.000123859.23378e-55.9959e-52.58352e-5-4.77314e-6-3.21835e-5-5.53154e-5-6.71707e-5-6.54166e-5-5.16448e-5-2.43366e-51.12255e-54.68917e-57.773e-50.0001067850.0001331730.0001536070.0001685180.000182591
2-9.66352e-5-3.30928e-55.64966e-50.0001541350.0002377250.0002957890.0003195870.0003574050.0004046110.0004479960.0004797860.0004883390.0004659290.0004023010.0003136480.0002202260.0001384837.35084e-53.50018e-52.83293e-56.05478e-50.0001182720.0001877260.0002498420.000296970.0003150620.0002988280.0002516430.0001870550.0001182435.60849e-53.8727e-6-3.28778e-5-4.84688e-5-4.38912e-5-3.34954e-5-2.72637e-5-3.65483e-5-6.62949e-5-0.000121833-0.000193587-0.000280244-0.000362132-0.000434981-0.000494461-0.000546531-0.000590606-0.000638514-0.000684688-0.000734688-0.000783664-0.000842714-0.000892596-0.000930301-0.000938118-0.000913585-0.000846217-0.000737781-0.000588122-0.000410395-0.000220611-3.69382e-50.0001310720.0002660780.0003583770.0004086840.0004245280.0004121470.0003838960.0003579570.0003383850.0003267490.0003155720.000305420.0002936710.0002800050.0002594820.0002336970.00020440.0001771990.0001479890.0001123257.33317e-53.48779e-5-2.5229e-6-3.27922e-5-5.52233e-5-7.06412e-5-7.49675e-5-6.44041e-5-4.04393e-5-6.50489e-63.09196e-56.87358e-50.0001052020.0001423130.0001771820.0002066520.0002307880.000253703
3-0.000131769-7.8398e-57.92223e-78.90044e-50.0001600220.0001984350.0001965980.0002122250.0002411090.0002712350.0003010450.0003249210.0003376190.0003258570.000299790.0002771670.000270180.000271650.0002776060.0002877220.0003082030.0003248470.0003285730.0003108060.000277280.0002268980.0001604748.30948e-57.98825e-6-5.32827e-5-9.57157e-5-0.000123438-0.0001371-0.000134382-0.00011527-9.07963e-5-6.97458e-5-6.29138e-5-7.14491e-5-9.85941e-5-0.000137562-0.000192678-0.000248177-0.000303993-0.000356125-0.000407616-0.0004553-0.000507819-0.000555473-0.000603436-0.000647099-0.000701763-0.000754429-0.000806879-0.000838493-0.000842167-0.000803445-0.000720829-0.000592138-0.000428566-0.000245567-6.43964e-50.0001011930.0002322420.0003221330.0003736050.0003918170.0003793320.0003478290.0003164950.0002922360.0002784310.0002646210.0002503050.0002393870.0002345040.0002246330.0002056840.0001804080.0001576150.0001351080.0001068717.3258e-53.90321e-57.34127e-6-1.78231e-5-3.94282e-5-5.6427e-5-6.15935e-5-5.19038e-5-2.96367e-53.09722e-63.98752e-57.62892e-50.0001082710.0001376320.0001656240.0001911820.0002115860.000229586
Y = dat.Y
@head Y
... (485, 4)
3×4 DataFrame
Rowdmndftyptest
Float64?Float64?StringInt64
192.2337.58Legume forages1
293.2649.6462Legume forages0
392.963.2939Forage trees0
y = Y.typ  # the classes 
test = Y.test
tab(y)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "Cereal and grass forages" => 160
  "Forage trees"             => 101
  "Legume forages"           => 224
freqtable(y, test)
3×2 Named Matrix{Int64}
             Dim1 ╲ Dim2 │   0    1
─────────────────────────┼─────────
Cereal and grass forages │ 100   60
Forage trees             │  56   45
Legume forages           │ 167   57
wlst = names(X)
wl = parse.(Int, wlst)
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
700-element Vector{Int64}:
 1100
 1102
 1104
 1106
 1108
 1110
 1112
 1114
 1116
 1118
    ⋮
 2482
 2484
 2486
 2488
 2490
 2492
 2494
 2496
 2498

Note:: X-data are already preprocessed (SNV + Savitsky-Golay 2nd deriv).

Split Tot to Train/Test

The model is fitted on Train, and a Test set is projected on the training space. In this example, Train is already defined in variable test of the dataset, and Test is defined by the remaining samples. But Tot could also be split a posteriori, for instance by sampling (random, systematic or any other designs). See for instance functions samprand, sampsys, etc.

s = Bool.(test)
Xtrain = rmrow(X, s)
ytrain = rmrow(y, s)
Xtest = X[s, :]
ytest = y[s]
ntot = nro(X)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
(ntot = ntot, ntrain, ntest)
(ntot = 485, ntrain = 323, ntest = 162)

Since X-columns are highly colinear, the Fda requires a regularization, for instance 1) by preliminary dimension reduction, or 2) by using a ridge regularization.

1) Fda on preliminary computed Pca scores

model0 = pcasvd(nlv = 10)
fit!(model0, Xtrain)
@names model0
@names model0.fitm
(:T, :V, :sv, :xmeans, :xscales, :weights, :niter, :par)
@head Ttrain_pca = model0.fitm.T
3×10 Matrix{Float64}:
 -0.00669306  -0.00650046  -0.00570717  …  -0.000492699   0.000134327
 -0.00387969  -0.0068879   -0.00728328      0.0003361    -0.000581481
  0.0022825    0.00305238   0.00109557     -0.00164726   -0.0014889
... (323, 10)
model = fda(nlv = 2)
#model = fdasvd(nlv = 2)     # alternative algorithm (same result)
fit!(model, Ttrain_pca, ytrain) 
fitm = model.fitm
@names fitm
(:T, :V, :Tcenters, :eig, :sstot, :W, :xmeans, :xscales, :weights, :lev, :ni, :par)
@head Ttrain = model.fitm.T
3×2 Matrix{Float64}:
  2.41503  -1.46642
  1.41666  -1.4342
 -1.54614  -0.344225
... (323, 2)
lev = fitm.lev
nlev = length(lev)
3
ct = fitm.Tcenters
3×2 Matrix{Float64}:
 -2.1618    -0.223723
  1.49851   -1.67024
  0.791993   0.694047
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2], markersize = 10, color = :red)
f
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .5)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f

Projection of Xtest

@head Ttest_pca = transf(model0, Xtest)
3×10 Matrix{Float64}:
 -0.00881891  -0.00355513    0.00143197   …   0.000140379  -0.000265216
  0.00647425   0.00296032   -0.000761146     -0.00113056   -0.00160408
  0.00351213   0.000177464  -0.000182268     -0.00140921   -0.00232458
... (162, 10)
@head Ttest = transf(model, Ttest_pca)
3×2 Matrix{Float64}:
  1.76619  -0.105807
 -2.41847  -0.524465
 -2.57702  -0.907449
... (162, 2)
i = 1  # class "i" in test
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .5)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
s = ytest .== lev[i]
zT = Ttest[s, :]
scatter!(ax, zT[:, 1], zT[:, 2]; markersize = 10, color = :blue)
f

2) Fda directly on X but using a ridge regularization

Note: If the ridge parameter lb is too small (e.g. here from around 1e-10), the model can be overfitted and predictions of new observations is misleading.

lb = 1e-5
model = fda(; nlv = 2, lb)
#model = fdasvd(; nlv = 2, lb)
fit!(model, Xtrain, ytrain)
fitm = model.fitm
Jchemo.Fda([1.7829806394884713 -0.8923344270339282; 1.3203919349887605 -1.087112071286717; … ; -0.7042782276724117 0.523450171819936; -0.4631097344316628 1.0465334079722217], [5.171712666455356 -9.508299170939283; 5.153707863984817 -9.06078711679522; … ; 3.032245770823203 21.347000304208176; -2.8066707021640216 21.53541187066701], [-1.3213215617847378 -0.25815326934600685; 1.2872849511152809 -0.8352132611860519; 0.3595461012935211 0.43465430874861954], [0.8946608889469787, 0.2392545735461798, 8.113420527620786e-17, 7.522381965283366e-17, 1.2357532069459468e-17, 1.2357532069459468e-17, -5.640384069100635e-17, -5.640384069100635e-17, 3.953552944990857e-17, 3.953552944990857e-17  …  1.7581207434812442e-20, 2.101291977425629e-20, -1.8276419310707133e-20, 1.1596325005033801e-20, -3.75017290833722e-21, -3.75017290833722e-21, -9.096083385580036e-21, 8.960398722442791e-21, 5.4950099649394124e-21, 5.4950099649394124e-21], 1.1339154624931584, [1.0069729150340446e-5 6.666992825610627e-8 … 2.0918098869706948e-8 2.1162650478714127e-8; 6.666992825610627e-8 1.0063873539470998e-5 … 2.008643294430349e-8 2.0247774844963594e-8; … ; 2.0918098869706948e-8 2.008643294430349e-8 … 1.009178260196924e-5 9.092935251310007e-8; 2.1162650478714127e-8 2.0247774844963594e-8 … 9.092935251310007e-8 1.0095765380348832e-5], [-0.00019330623941241694, -0.0001415769474021643, -5.770661245116858e-5, 4.723558902506759e-5, 0.00014758462542250662, 0.00021809097323352518, 0.0002341317111996633, 0.000252799982295447, 0.0002796860235471441, 0.0003040280867910269  …  -0.0018442000247695933, -0.0019172866156863932, -0.0019496441042941074, -0.0019473165883640694, -0.0018947013042759696, -0.0016716820169876765, -0.0013441905794144805, -0.0009918653793508437, -0.0006005900823885629, -0.00025528401464108526], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], Jchemo.Weight{Float64}([0.0030959752321981413, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981413, 0.0030959752321981413, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417  …  0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981413, 0.0030959752321981413, 0.0030959752321981413, 0.0030959752321981413, 0.0030959752321981413]), ["Cereal and grass forages", "Forage trees", "Legume forages"], [100, 56, 167], Jchemo.ParFda(2, 1.0e-5, :prop, false))
lev = fitm.lev
nlev = length(lev)
3
ct = fitm.Tcenters
3×2 Matrix{Float64}:
 -1.32132   -0.258153
  1.28728   -0.835213
  0.359546   0.434654
@head Ttrain = fitm.T
3×2 Matrix{Float64}:
  1.78298   -0.892334
  1.32039   -1.08711
 -0.881565  -0.00540248
... (323, 2)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f

Projection of Xtest

@head Ttest = transf(model, Xtest)
3×2 Matrix{Float64}:
  1.47243    0.224802
 -1.39627   -0.2853
 -0.995825  -0.334509
... (162, 2)
i = 1  # class "i" in test
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, 
    title = string("Projection test-class ", lev[i], " (blue points)"), ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
s = ytest .== lev[i]
zT = Ttest[s, :]
scatter!(ax, zT[:, 1], zT[:, 2]; markersize = 10, color = :blue)
f