using Jchemo, JchemoData using JLD2, CairoMakie using FreqTables
path_jdat = dirname(dirname(pathof(JchemoData))) db = joinpath(path_jdat, "data/forages2.jld2") @load db dat @names dat
(:X, :Y)
X = dat.X @head X
... (485, 700)
| Row | 1100 | 1102 | 1104 | 1106 | 1108 | 1110 | 1112 | 1114 | 1116 | 1118 | 1120 | 1122 | 1124 | 1126 | 1128 | 1130 | 1132 | 1134 | 1136 | 1138 | 1140 | 1142 | 1144 | 1146 | 1148 | 1150 | 1152 | 1154 | 1156 | 1158 | 1160 | 1162 | 1164 | 1166 | 1168 | 1170 | 1172 | 1174 | 1176 | 1178 | 1180 | 1182 | 1184 | 1186 | 1188 | 1190 | 1192 | 1194 | 1196 | 1198 | 1200 | 1202 | 1204 | 1206 | 1208 | 1210 | 1212 | 1214 | 1216 | 1218 | 1220 | 1222 | 1224 | 1226 | 1228 | 1230 | 1232 | 1234 | 1236 | 1238 | 1240 | 1242 | 1244 | 1246 | 1248 | 1250 | 1252 | 1254 | 1256 | 1258 | 1260 | 1262 | 1264 | 1266 | 1268 | 1270 | 1272 | 1274 | 1276 | 1278 | 1280 | 1282 | 1284 | 1286 | 1288 | 1290 | 1292 | 1294 | 1296 | 1298 | ⋯ |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | ⋯ | |
| 1 | -0.000231591 | -0.000175945 | -8.48176e-5 | 2.05217e-5 | 0.000110094 | 0.000161757 | 0.000154953 | 0.000163754 | 0.000187602 | 0.00021499 | 0.000242479 | 0.000265498 | 0.000282141 | 0.000281442 | 0.000271025 | 0.000261075 | 0.000257284 | 0.000252177 | 0.00024293 | 0.000228295 | 0.000219097 | 0.000214136 | 0.000215612 | 0.000218982 | 0.000228004 | 0.000236081 | 0.000236017 | 0.000220327 | 0.000187096 | 0.000137138 | 7.68593e-5 | 1.13679e-5 | -5.00951e-5 | -9.54664e-5 | -0.000119199 | -0.000131897 | -0.000142349 | -0.000161489 | -0.00019387 | -0.000244808 | -0.000303259 | -0.000366904 | -0.000416738 | -0.000451535 | -0.00046995 | -0.000478637 | -0.000477348 | -0.000478142 | -0.000476719 | -0.000479701 | -0.000482037 | -0.000496769 | -0.000511959 | -0.000532094 | -0.000542661 | -0.000540188 | -0.000512715 | -0.00045798 | -0.000370395 | -0.000256256 | -0.000126907 | 1.13716e-6 | 0.000119047 | 0.000212745 | 0.000275685 | 0.000307863 | 0.000313547 | 0.000296977 | 0.000269661 | 0.000247818 | 0.000233944 | 0.000228773 | 0.000224567 | 0.000221256 | 0.000218893 | 0.000217741 | 0.000210144 | 0.00019664 | 0.000181949 | 0.000169774 | 0.000151691 | 0.00012385 | 9.23378e-5 | 5.9959e-5 | 2.58352e-5 | -4.77314e-6 | -3.21835e-5 | -5.53154e-5 | -6.71707e-5 | -6.54166e-5 | -5.16448e-5 | -2.43366e-5 | 1.12255e-5 | 4.68917e-5 | 7.773e-5 | 0.000106785 | 0.000133173 | 0.000153607 | 0.000168518 | 0.000182591 | ⋯ |
| 2 | -9.66352e-5 | -3.30928e-5 | 5.64966e-5 | 0.000154135 | 0.000237725 | 0.000295789 | 0.000319587 | 0.000357405 | 0.000404611 | 0.000447996 | 0.000479786 | 0.000488339 | 0.000465929 | 0.000402301 | 0.000313648 | 0.000220226 | 0.000138483 | 7.35084e-5 | 3.50018e-5 | 2.83293e-5 | 6.05478e-5 | 0.000118272 | 0.000187726 | 0.000249842 | 0.00029697 | 0.000315062 | 0.000298828 | 0.000251643 | 0.000187055 | 0.000118243 | 5.60849e-5 | 3.8727e-6 | -3.28778e-5 | -4.84688e-5 | -4.38912e-5 | -3.34954e-5 | -2.72637e-5 | -3.65483e-5 | -6.62949e-5 | -0.000121833 | -0.000193587 | -0.000280244 | -0.000362132 | -0.000434981 | -0.000494461 | -0.000546531 | -0.000590606 | -0.000638514 | -0.000684688 | -0.000734688 | -0.000783664 | -0.000842714 | -0.000892596 | -0.000930301 | -0.000938118 | -0.000913585 | -0.000846217 | -0.000737781 | -0.000588122 | -0.000410395 | -0.000220611 | -3.69382e-5 | 0.000131072 | 0.000266078 | 0.000358377 | 0.000408684 | 0.000424528 | 0.000412147 | 0.000383896 | 0.000357957 | 0.000338385 | 0.000326749 | 0.000315572 | 0.00030542 | 0.000293671 | 0.000280005 | 0.000259482 | 0.000233697 | 0.0002044 | 0.000177199 | 0.000147989 | 0.000112325 | 7.33317e-5 | 3.48779e-5 | -2.5229e-6 | -3.27922e-5 | -5.52233e-5 | -7.06412e-5 | -7.49675e-5 | -6.44041e-5 | -4.04393e-5 | -6.50489e-6 | 3.09196e-5 | 6.87358e-5 | 0.000105202 | 0.000142313 | 0.000177182 | 0.000206652 | 0.000230788 | 0.000253703 | ⋯ |
| 3 | -0.000131769 | -7.8398e-5 | 7.92223e-7 | 8.90044e-5 | 0.000160022 | 0.000198435 | 0.000196598 | 0.000212225 | 0.000241109 | 0.000271235 | 0.000301045 | 0.000324921 | 0.000337619 | 0.000325857 | 0.00029979 | 0.000277167 | 0.00027018 | 0.00027165 | 0.000277606 | 0.000287722 | 0.000308203 | 0.000324847 | 0.000328573 | 0.000310806 | 0.00027728 | 0.000226898 | 0.000160474 | 8.30948e-5 | 7.98825e-6 | -5.32827e-5 | -9.57157e-5 | -0.000123438 | -0.0001371 | -0.000134382 | -0.00011527 | -9.07963e-5 | -6.97458e-5 | -6.29138e-5 | -7.14491e-5 | -9.85941e-5 | -0.000137562 | -0.000192678 | -0.000248177 | -0.000303993 | -0.000356125 | -0.000407616 | -0.0004553 | -0.000507819 | -0.000555473 | -0.000603436 | -0.000647099 | -0.000701763 | -0.000754429 | -0.000806879 | -0.000838493 | -0.000842167 | -0.000803445 | -0.000720829 | -0.000592138 | -0.000428566 | -0.000245567 | -6.43964e-5 | 0.000101193 | 0.000232242 | 0.000322133 | 0.000373605 | 0.000391817 | 0.000379332 | 0.000347829 | 0.000316495 | 0.000292236 | 0.000278431 | 0.000264621 | 0.000250305 | 0.000239387 | 0.000234504 | 0.000224633 | 0.000205684 | 0.000180408 | 0.000157615 | 0.000135108 | 0.000106871 | 7.3258e-5 | 3.90321e-5 | 7.34127e-6 | -1.78231e-5 | -3.94282e-5 | -5.6427e-5 | -6.15935e-5 | -5.19038e-5 | -2.96367e-5 | 3.09722e-6 | 3.98752e-5 | 7.62892e-5 | 0.000108271 | 0.000137632 | 0.000165624 | 0.000191182 | 0.000211586 | 0.000229586 | ⋯ |
Y = dat.Y @head Y
... (485, 4)
| Row | dm | ndf | typ | test |
|---|---|---|---|---|
| Float64? | Float64? | String | Int64 | |
| 1 | 92.23 | 37.58 | Legume forages | 1 |
| 2 | 93.26 | 49.6462 | Legume forages | 0 |
| 3 | 92.9 | 63.2939 | Forage trees | 0 |
y = Y.typ # response variable (class membership) # the classes test = Y.test tab(y)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
"Cereal and grass forages" => 160
"Forage trees" => 101
"Legume forages" => 224
freqtable(y, test)
3×2 Named Matrix{Int64}
Dim1 ╲ Dim2 │ 0 1
─────────────────────────┼─────────
Cereal and grass forages │ 100 60
Forage trees │ 56 45
Legume forages │ 167 57
wlst = names(X) wl = parse.(Int, wlst) #plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
700-element Vector{Int64}:
1100
1102
1104
1106
1108
1110
1112
1114
1116
1118
⋮
2482
2484
2486
2488
2490
2492
2494
2496
2498
Note:: X-data are already preprocessed (SNV + Savitsky-Golay 2nd deriv).
The model is fitted on Train, and the generalization error is estimated on Test. In this example, Train is already defined in variable typ of the dataset, and Test is defined by the remaining samples. But Tot could also be split a posteriori, for instance by sampling (random, systematic or any other designs). See for instance functions samprand, sampsys, etc.
s = Bool.(test) Xtrain = rmrow(X, s) ytrain = rmrow(y, s) Xtest = X[s, :] ytest = y[s] ntot = nro(X) ntrain = nro(Xtrain) ntest = nro(Xtest) (ntot = ntot, ntrain, ntest)
(ntot = 485, ntrain = 323, ntest = 162)
Since X-columns are highly colinear, the Fda requires a regularization. It can be reached for instance 1) by preliminary dimension reduction, or 2) by using a ridge regularization.
model0 = pcasvd(nlv = 10) fit!(model0, Xtrain) @names model0 @names model0.fitm
(:T, :V, :sv, :xmeans, :xscales, :weights, :niter, :par)
@head Ttrain_pca = model0.fitm.T
3×10 Matrix{Float64}:
-0.00669306 -0.00650046 -0.00570717 … -0.000492699 0.000134327
-0.00387969 -0.0068879 -0.00728328 0.0003361 -0.000581481
0.0022825 0.00305238 0.00109557 -0.00164726 -0.0014889
... (323, 10)
model = fda(nlv = 2) #model = fdasvd(nlv = 2) # alternative algorithm (same result) fit!(model, Ttrain_pca, ytrain) fitm = model.fitm @names fitm
(:T, :V, :Tcenters, :eig, :sstot, :W, :xmeans, :xscales, :weights, :lev, :ni, :par)
@head Ttrain = model.fitm.T
3×2 Matrix{Float64}:
2.41503 -1.46642
1.41666 -1.4342
-1.54614 -0.344225
... (323, 2)
lev = fitm.lev nlev = length(lev)
3
ct = fitm.Tcenters
3×2 Matrix{Float64}:
-2.1618 -0.223723
1.49851 -1.67024
0.791993 0.694047
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; title = "Fda", ellipse = true) scatter!(ax, ct[:, 1], ct[:, 2], markersize = 10, color = :red) f
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .5) f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true) scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red) f
Projection of Xtest
@head Ttest_pca = transf(model0, Xtest)
3×10 Matrix{Float64}:
-0.00881891 -0.00355513 0.00143197 … 0.000140379 -0.000265216
0.00647425 0.00296032 -0.000761146 -0.00113056 -0.00160408
0.00351213 0.000177464 -0.000182268 -0.00140921 -0.00232458
... (162, 10)
@head Ttest = transf(model, Ttest_pca)
3×2 Matrix{Float64}:
1.76619 -0.105807
-2.41847 -0.524465
-2.57702 -0.907449
... (162, 2)
i = 1 # class "i" in test color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .5) f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true) scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red) s = ytest .== lev[i] zT = Ttest[s, :] scatter!(ax, zT[:, 1], zT[:, 2]; markersize = 10, color = :blue) f
Note: If the ridge parameter lb is too small (e.g. here from around 1e-10), the model can be overfitted and predictions of new observations are misleading.
lb = 1e-5 model = fda(; nlv = 2, lb) #model = fdasvd(; nlv = 2, lb) fit!(model, Xtrain, ytrain) fitm = model.fitm
Jchemo.Fda([1.7829806394884713 -0.8923344270339282; 1.3203919349887605 -1.087112071286717; … ; -0.7042782276724117 0.523450171819936; -0.4631097344316628 1.0465334079722217], [5.171712666455356 -9.508299170939283; 5.153707863984817 -9.06078711679522; … ; 3.032245770823203 21.347000304208176; -2.8066707021640216 21.53541187066701], [-1.3213215617847378 -0.25815326934600685; 1.2872849511152809 -0.8352132611860519; 0.3595461012935211 0.43465430874861954], [0.8946608889469787, 0.2392545735461798, 8.113420527620786e-17, 7.522381965283366e-17, 1.2357532069459468e-17, 1.2357532069459468e-17, -5.640384069100635e-17, -5.640384069100635e-17, 3.953552944990857e-17, 3.953552944990857e-17 … 1.7581207434812442e-20, 2.101291977425629e-20, -1.8276419310707133e-20, 1.1596325005033801e-20, -3.75017290833722e-21, -3.75017290833722e-21, -9.096083385580036e-21, 8.960398722442791e-21, 5.4950099649394124e-21, 5.4950099649394124e-21], 1.1339154624931584, [1.0069729150340446e-5 6.666992825610627e-8 … 2.0918098869706948e-8 2.1162650478714127e-8; 6.666992825610627e-8 1.0063873539470998e-5 … 2.008643294430349e-8 2.0247774844963594e-8; … ; 2.0918098869706948e-8 2.008643294430349e-8 … 1.009178260196924e-5 9.092935251310007e-8; 2.1162650478714127e-8 2.0247774844963594e-8 … 9.092935251310007e-8 1.0095765380348832e-5], [-0.00019330623941241694, -0.0001415769474021643, -5.770661245116858e-5, 4.723558902506759e-5, 0.00014758462542250662, 0.00021809097323352518, 0.0002341317111996633, 0.000252799982295447, 0.0002796860235471441, 0.0003040280867910269 … -0.0018442000247695933, -0.0019172866156863932, -0.0019496441042941074, -0.0019473165883640694, -0.0018947013042759696, -0.0016716820169876765, -0.0013441905794144805, -0.0009918653793508437, -0.0006005900823885629, -0.00025528401464108526], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 … 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], Jchemo.Weight{Float64}([0.0030959752321981413, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981413, 0.0030959752321981413, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417 … 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981417, 0.0030959752321981413, 0.0030959752321981413, 0.0030959752321981413, 0.0030959752321981413, 0.0030959752321981413]), ["Cereal and grass forages", "Forage trees", "Legume forages"], [100, 56, 167], Jchemo.ParFda(2, 1.0e-5, :prop, false))
lev = fitm.lev nlev = length(lev)
3
ct = fitm.Tcenters
3×2 Matrix{Float64}:
-1.32132 -0.258153
1.28728 -0.835213
0.359546 0.434654
@head Ttrain = fitm.T
3×2 Matrix{Float64}:
1.78298 -0.892334
1.32039 -1.08711
-0.881565 -0.00540248
... (323, 2)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; title = "Fda", ellipse = true) scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red) f
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7) f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true) scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red) f
Projection of Xtest
@head Ttest = transf(model, Xtest)
3×2 Matrix{Float64}:
1.47243 0.224802
-1.39627 -0.2853
-0.995825 -0.334509
... (162, 2)
i = 1 # class "i" in test color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7) f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Projection test-class $(lev[i]) (blue points)", ellipse = true) scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red) s = ytest .== lev[i] zT = Ttest[s, :] scatter!(ax, zT[:, 1], zT[:, 2]; markersize = 10, color = :blue) f