using Jchemo, JchemoData using JLD2, CairoMakie
path_jdat = dirname(dirname(pathof(JchemoData))) db = joinpath(path_jdat, "data/tecator.jld2") @load db dat @names dat
(:X, :Y)
X = dat.X @head X
... (178, 100)
Row | 850 | 852 | 854 | 856 | 858 | 860 | 862 | 864 | 866 | 868 | 870 | 872 | 874 | 876 | 878 | 880 | 882 | 884 | 886 | 888 | 890 | 892 | 894 | 896 | 898 | 900 | 902 | 904 | 906 | 908 | 910 | 912 | 914 | 916 | 918 | 920 | 922 | 924 | 926 | 928 | 930 | 932 | 934 | 936 | 938 | 940 | 942 | 944 | 946 | 948 | 950 | 952 | 954 | 956 | 958 | 960 | 962 | 964 | 966 | 968 | 970 | 972 | 974 | 976 | 978 | 980 | 982 | 984 | 986 | 988 | 990 | 992 | 994 | 996 | 998 | 1000 | 1002 | 1004 | 1006 | 1008 | 1010 | 1012 | 1014 | 1016 | 1018 | 1020 | 1022 | 1024 | 1026 | 1028 | 1030 | 1032 | 1034 | 1036 | 1038 | 1040 | 1042 | 1044 | 1046 | 1048 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | |
1 | 2.61776 | 2.61814 | 2.61859 | 2.61912 | 2.61981 | 2.62071 | 2.62186 | 2.62334 | 2.62511 | 2.62722 | 2.62964 | 2.63245 | 2.63565 | 2.63933 | 2.64353 | 2.64825 | 2.6535 | 2.65937 | 2.66585 | 2.67281 | 2.68008 | 2.68733 | 2.69427 | 2.70073 | 2.70684 | 2.71281 | 2.71914 | 2.72628 | 2.73462 | 2.74416 | 2.75466 | 2.76568 | 2.77679 | 2.7879 | 2.79949 | 2.81225 | 2.82706 | 2.84356 | 2.86106 | 2.87857 | 2.89497 | 2.90924 | 2.92085 | 2.93015 | 2.93846 | 2.94771 | 2.96019 | 2.97831 | 3.00306 | 3.03506 | 3.07428 | 3.11963 | 3.16868 | 3.21771 | 3.26254 | 3.29988 | 3.32847 | 3.34899 | 3.36342 | 3.37379 | 3.38152 | 3.38741 | 3.39164 | 3.39418 | 3.3949 | 3.39366 | 3.39045 | 3.38541 | 3.37869 | 3.37041 | 3.36073 | 3.34979 | 3.33769 | 3.32443 | 3.31013 | 3.29487 | 3.27891 | 3.26232 | 3.24542 | 3.22828 | 3.2108 | 3.19287 | 3.17433 | 3.15503 | 3.13475 | 3.11339 | 3.09116 | 3.0685 | 3.04596 | 3.02393 | 3.00247 | 2.98145 | 2.96072 | 2.94013 | 2.91978 | 2.89966 | 2.87964 | 2.8596 | 2.8394 | 2.8192 |
2 | 2.83454 | 2.83871 | 2.84283 | 2.84705 | 2.85138 | 2.85587 | 2.8606 | 2.86566 | 2.87093 | 2.87661 | 2.88264 | 2.88898 | 2.89577 | 2.90308 | 2.91097 | 2.91953 | 2.92873 | 2.93863 | 2.94929 | 2.96072 | 2.97272 | 2.98493 | 2.9969 | 3.00833 | 3.0192 | 3.0299 | 3.04101 | 3.05345 | 3.06777 | 3.08416 | 3.10221 | 3.12106 | 3.13983 | 3.1581 | 3.17623 | 3.19519 | 3.21584 | 3.23747 | 3.25889 | 3.27835 | 3.29384 | 3.30362 | 3.30681 | 3.30393 | 3.297 | 3.28925 | 3.28409 | 3.28505 | 3.29326 | 3.30923 | 3.33267 | 3.36251 | 3.39661 | 3.43188 | 3.46492 | 3.49295 | 3.51458 | 3.53004 | 3.54067 | 3.54797 | 3.55306 | 3.55675 | 3.55921 | 3.56045 | 3.56034 | 3.55876 | 3.55571 | 3.55132 | 3.54585 | 3.5395 | 3.53235 | 3.52442 | 3.51583 | 3.50668 | 3.497 | 3.48683 | 3.47626 | 3.46552 | 3.45501 | 3.44481 | 3.43477 | 3.42465 | 3.41419 | 3.40303 | 3.39082 | 3.37731 | 3.36265 | 3.34745 | 3.33245 | 3.31818 | 3.30473 | 3.29186 | 3.27921 | 3.26655 | 3.25369 | 3.24045 | 3.22659 | 3.21181 | 3.196 | 3.17942 |
3 | 2.58284 | 2.58458 | 2.58629 | 2.58808 | 2.58996 | 2.59192 | 2.59401 | 2.59627 | 2.59873 | 2.60131 | 2.60414 | 2.60714 | 2.61029 | 2.61361 | 2.61714 | 2.62089 | 2.62486 | 2.62909 | 2.63361 | 2.63835 | 2.6433 | 2.64838 | 2.65354 | 2.6587 | 2.66375 | 2.6688 | 2.67383 | 2.67892 | 2.68411 | 2.68937 | 2.6947 | 2.70012 | 2.70563 | 2.71141 | 2.71775 | 2.7249 | 2.73344 | 2.74327 | 2.75433 | 2.76642 | 2.77931 | 2.79272 | 2.80649 | 2.82064 | 2.83541 | 2.85121 | 2.86872 | 2.88905 | 2.91289 | 2.94088 | 2.97325 | 3.00946 | 3.0478 | 3.08554 | 3.11947 | 3.14696 | 3.16677 | 3.17938 | 3.18631 | 3.18924 | 3.1895 | 3.18801 | 3.18498 | 3.18039 | 3.17411 | 3.16611 | 3.15641 | 3.14512 | 3.13241 | 3.11843 | 3.10329 | 3.08714 | 3.07014 | 3.05237 | 3.03393 | 3.01504 | 2.99569 | 2.97612 | 2.95642 | 2.9366 | 2.91667 | 2.89655 | 2.87622 | 2.85563 | 2.83474 | 2.81361 | 2.79235 | 2.77113 | 2.75015 | 2.72956 | 2.70934 | 2.68951 | 2.67009 | 2.65112 | 2.63262 | 2.61461 | 2.59718 | 2.58034 | 2.56404 | 2.54816 |
Y = dat.Y @head Y
... (178, 4)
Row | water | fat | protein | typ |
---|---|---|---|---|
Float64 | Float64 | Float64 | String | |
1 | 60.5 | 22.5 | 16.7 | train |
2 | 46.0 | 40.1 | 13.5 | train |
3 | 71.0 | 8.4 | 20.5 | train |
typ = Y.typ tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries: "test" => 31 "train" => 115 "val" => 32
wlst = names(X) wl = parse.(Float64, wlst) #plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
100-element Vector{Float64}: 850.0 852.0 854.0 856.0 858.0 860.0 862.0 864.0 866.0 868.0 ⋮ 1032.0 1034.0 1036.0 1038.0 1040.0 1042.0 1044.0 1046.0 1048.0
Preprocessing
model1 = snv() model2 = savgol(npoint = 15, deriv = 2, degree = 3) model = pip(model1, model2) fit!(model, X) @head Xp = transf(model, X) #plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
3×100 Matrix{Float64}: 0.000397076 0.000495203 0.000598623 … 0.00827138 0.00917311 0.00946072 0.00242055 0.00244366 0.00234233 0.00580631 0.00689249 0.00749316 0.0011927 0.00122721 0.00120098 0.0101019 0.0108142 0.0108444 ... (178, 100)
s = typ .== "train" Xtrain = Xp[s, :] Ytrain = Y[s, :] Xtest = rmrow(Xp, s) Ytest = rmrow(Y, s) ntrain = nro(Xtrain) ntest = nro(Xtest) ntot = ntrain + ntest (ntot = ntot, ntrain, ntest)
(ntot = 178, ntrain = 115, ntest = 63)
Working response y
namy = names(Y)[1:3] j = 2 nam = namy[j] # work on the j-th y-variable ytrain = Ytrain[:, nam] ytest = Ytest[:, nam]
63-element Vector{Float64}: 29.8 1.4 4.6 11.0 17.0 22.4 27.9 46.5 6.1 2.0 ⋮ 18.1 19.4 24.8 27.2 28.4 31.3 33.8 35.5 42.5
Replicated K-fold CV
K = 3 # nb. folds (segments) rep = 10 # nb. replications segm = segmkf(ntrain, K; rep = rep)
10-element Vector{Vector{Vector{Int64}}}: [[2, 6, 7, 9, 16, 18, 19, 21, 30, 32 … 94, 96, 98, 99, 100, 107, 110, 111, 113, 114], [1, 4, 11, 17, 26, 28, 29, 37, 40, 41 … 93, 95, 101, 102, 104, 105, 106, 109, 112, 115], [3, 5, 8, 10, 12, 13, 14, 15, 20, 22 … 61, 71, 74, 76, 80, 81, 92, 97, 103, 108]] [[2, 3, 9, 10, 15, 18, 21, 23, 24, 25 … 88, 93, 94, 96, 98, 102, 104, 112, 114, 115], [7, 8, 12, 13, 14, 16, 26, 27, 29, 30 … 99, 100, 101, 103, 105, 106, 107, 109, 110, 113], [1, 4, 5, 6, 11, 17, 19, 20, 22, 32 … 79, 81, 83, 84, 85, 90, 91, 95, 108, 111]] [[2, 6, 11, 16, 18, 20, 22, 24, 25, 29 … 90, 97, 101, 103, 106, 108, 109, 110, 111, 113], [5, 8, 12, 13, 15, 21, 23, 26, 27, 35 … 82, 91, 94, 95, 98, 99, 102, 104, 114, 115], [1, 3, 4, 7, 9, 10, 14, 17, 19, 28 … 87, 88, 89, 92, 93, 96, 100, 105, 107, 112]] [[1, 5, 7, 10, 18, 21, 22, 23, 26, 29 … 86, 87, 94, 95, 97, 103, 106, 109, 111, 112], [2, 4, 8, 11, 12, 13, 15, 17, 19, 24 … 82, 89, 91, 92, 98, 99, 100, 107, 113, 114], [3, 6, 9, 14, 16, 20, 25, 28, 33, 34 … 90, 93, 96, 101, 102, 104, 105, 108, 110, 115]] [[3, 7, 13, 16, 21, 23, 26, 28, 31, 34 … 85, 86, 93, 98, 101, 104, 105, 106, 107, 114], [1, 2, 4, 5, 6, 8, 9, 11, 12, 14 … 83, 88, 91, 92, 96, 108, 109, 110, 111, 113], [10, 15, 17, 19, 20, 24, 29, 30, 33, 36 … 90, 94, 95, 97, 99, 100, 102, 103, 112, 115]] [[4, 5, 7, 14, 15, 16, 18, 19, 22, 23 … 83, 84, 89, 99, 100, 104, 105, 106, 107, 109], [1, 6, 9, 10, 11, 17, 28, 31, 33, 35 … 88, 90, 93, 94, 96, 97, 101, 102, 110, 112], [2, 3, 8, 12, 13, 20, 21, 24, 25, 26 … 91, 92, 95, 98, 103, 108, 111, 113, 114, 115]] [[2, 7, 15, 16, 20, 22, 23, 27, 29, 30 … 91, 92, 94, 95, 98, 101, 102, 106, 108, 115], [4, 11, 13, 14, 18, 19, 21, 24, 32, 33 … 84, 86, 96, 97, 103, 107, 110, 111, 112, 114], [1, 3, 5, 6, 8, 9, 10, 12, 17, 25 … 87, 89, 90, 93, 99, 100, 104, 105, 109, 113]] [[2, 7, 10, 12, 14, 16, 22, 24, 29, 30 … 96, 99, 100, 101, 104, 105, 106, 111, 112, 115], [1, 3, 4, 5, 11, 13, 15, 18, 21, 26 … 91, 92, 93, 94, 95, 97, 102, 103, 110, 113], [6, 8, 9, 17, 19, 20, 23, 25, 27, 28 … 77, 78, 84, 88, 89, 98, 107, 108, 109, 114]] [[2, 3, 6, 8, 13, 15, 16, 20, 24, 25 … 81, 90, 91, 92, 94, 104, 106, 109, 110, 115], [1, 4, 9, 10, 11, 12, 14, 17, 28, 30 … 95, 98, 99, 100, 102, 105, 111, 112, 113, 114], [5, 7, 18, 19, 21, 22, 23, 26, 31, 32 … 80, 85, 86, 87, 96, 97, 101, 103, 107, 108]] [[4, 8, 10, 14, 22, 24, 30, 33, 36, 37 … 95, 96, 98, 100, 104, 107, 108, 110, 112, 113], [3, 5, 6, 9, 12, 13, 17, 18, 19, 20 … 73, 78, 85, 88, 92, 93, 97, 103, 114, 115], [1, 2, 7, 11, 15, 16, 21, 23, 28, 29 … 86, 87, 91, 99, 101, 102, 105, 106, 109, 111]]
nlvdis = [5; 10; 15]; metric = [:mah] # here only Mahalanobis distance is considered h = [1; 2; 4; 6; Inf]; k = [30; 50; 100] nlv = 0:15 pars = mpar(nlvdis = nlvdis, metric = metric, h = h, k = k) length(pars[1])
45
model = lwplsr() rescv = gridcv(model, Xtrain, ytrain; segm, score = rmsep, pars, nlv) @names rescv res = rescv.res res_rep = rescv.res_rep
Row | rep | segm | nlvdis | metric | h | k | nlv | y1 |
---|---|---|---|---|---|---|---|---|
Int64 | Int64 | Any | Any | Any | Any | Int64 | Float64 | |
1 | 1 | 1 | 5 | mah | 1.0 | 30 | 0 | 4.55673 |
2 | 1 | 1 | 5 | mah | 1.0 | 30 | 1 | 1.1839 |
3 | 1 | 1 | 5 | mah | 1.0 | 30 | 2 | 1.29055 |
4 | 1 | 1 | 5 | mah | 1.0 | 30 | 3 | 1.1621 |
5 | 1 | 1 | 5 | mah | 1.0 | 30 | 4 | 1.5221 |
6 | 1 | 1 | 5 | mah | 1.0 | 30 | 5 | 1.63031 |
7 | 1 | 1 | 5 | mah | 1.0 | 30 | 6 | 1.60902 |
8 | 1 | 1 | 5 | mah | 1.0 | 30 | 7 | 3.21498 |
9 | 1 | 1 | 5 | mah | 1.0 | 30 | 8 | 3.7293 |
10 | 1 | 1 | 5 | mah | 1.0 | 30 | 9 | 4.2155 |
11 | 1 | 1 | 5 | mah | 1.0 | 30 | 10 | 4.30623 |
12 | 1 | 1 | 5 | mah | 1.0 | 30 | 11 | 4.048 |
13 | 1 | 1 | 5 | mah | 1.0 | 30 | 12 | 3.60532 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
21589 | 10 | 3 | 15 | mah | Inf | 100 | 4 | 2.23052 |
21590 | 10 | 3 | 15 | mah | Inf | 100 | 5 | 2.16522 |
21591 | 10 | 3 | 15 | mah | Inf | 100 | 6 | 2.11402 |
21592 | 10 | 3 | 15 | mah | Inf | 100 | 7 | 2.05778 |
21593 | 10 | 3 | 15 | mah | Inf | 100 | 8 | 2.01392 |
21594 | 10 | 3 | 15 | mah | Inf | 100 | 9 | 1.81874 |
21595 | 10 | 3 | 15 | mah | Inf | 100 | 10 | 1.80617 |
21596 | 10 | 3 | 15 | mah | Inf | 100 | 11 | 1.83318 |
21597 | 10 | 3 | 15 | mah | Inf | 100 | 12 | 1.90184 |
21598 | 10 | 3 | 15 | mah | Inf | 100 | 13 | 2.08162 |
21599 | 10 | 3 | 15 | mah | Inf | 100 | 14 | 2.05006 |
21600 | 10 | 3 | 15 | mah | Inf | 100 | 15 | 2.00492 |
group = string.("nvldis=", res.nlvdis, " h=", res.h, " k=", res.k) plotgrid(res.nlv, res.y1, group; step = 2, xlabel ="Nb. LVs", ylabel = "RMSEP-Val").f
Selection of the best parameter combination
u = findall(res.y1 .== minimum(res.y1))[1] res[u, :]
Row | nlv | nlvdis | metric | h | k | y1 |
---|---|---|---|---|---|---|
Int64 | Any | Any | Any | Any | Float64 | |
3 | 2 | 5 | mah | 1.0 | 30 | 1.15901 |
model = lwplsr(nlvdis = res.nlvdis[u], metric = res.metric[u], h = res.h[u], k = res.k[u], nlv = res.nlv[u]) fit!(model, Xtrain, ytrain) pred = predict(model, Xtest).pred
63×1 Matrix{Float64}: 29.233919470654392 1.3973931780971185 4.794708197493841 11.214014955076081 15.40594497215125 20.431938204221815 27.011452750213174 47.98443636636166 5.256365240224742 1.869392428285745 ⋮ 17.70770463351476 19.115522708954746 21.597608245182037 27.614157967846275 27.667973750722332 29.66398089648304 33.922975891981565 36.54196651755939 41.810372978453216
Generalization error
rmsep(pred, ytest)
1×1 Matrix{Float64}: 1.0303218875423956
Plotting predictions vs. observed data
plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = string("Test set - variable ", nam), xlabel = "Prediction", ylabel = "Observed").f