using Jchemo, JchemoData using JLD2, CairoMakie
path_jdat = dirname(dirname(pathof(JchemoData))) db = joinpath(path_jdat, "data/tecator.jld2") @load db dat @names dat
(:X, :Y)
X = dat.X Y = dat.Y ntot, p = size(X) typ = Y.typ tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
"test" => 31
"train" => 115
"val" => 32
wlst = names(X) wl = parse.(Float64, wlst) #plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
100-element Vector{Float64}:
850.0
852.0
854.0
856.0
858.0
860.0
862.0
864.0
866.0
868.0
⋮
1032.0
1034.0
1036.0
1038.0
1040.0
1042.0
1044.0
1046.0
1048.0
Preprocessing
model1 = snv() model2 = savgol(npoint = 15, deriv = 2, degree = 3) model = pip(model1, model2) fit!(model, X) @head Xp = transf(model, X) #plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
3×100 Matrix{Float64}:
0.000397076 0.000495203 0.000598623 … 0.00827138 0.00917311 0.00946072
0.00242055 0.00244366 0.00234233 0.00580631 0.00689249 0.00749316
0.0011927 0.00122721 0.00120098 0.0101019 0.0108142 0.0108444
... (178, 100)
s = typ .== "train" Xtrain = Xp[s, :] Ytrain = Y[s, :] Xtest = rmrow(Xp, s) Ytest = rmrow(Y, s) ntrain = nro(Xtrain) ntest = nro(Xtest) (ntot = ntot, ntrain, ntest)
(ntot = 178, ntrain = 115, ntest = 63)
Working response y
namy = names(Y)[1:3] j = 2 nam = namy[j] # work on the j-th y-variable ytrain = Ytrain[:, nam] ytest = Ytest[:, nam]
63-element Vector{Float64}:
29.8
1.4
4.6
11.0
17.0
22.4
27.9
46.5
6.1
2.0
⋮
18.1
19.4
24.8
27.2
28.4
31.3
33.8
35.5
42.5
n_trees = 100 partial_sampling = .7 n_subfeatures = p / 3 max_depth = 20 model = rfr(; n_trees, partial_sampling, n_subfeatures, max_depth) fit!(model, Xtrain, ytrain) @names model fitm = model.fitm @names fitm
(:fitm, :xscales, :featur, :par)
@head pred = predict(model, Xtest).pred
3×1 Matrix{Float64}:
30.395226587301593
4.812776190476191
5.580654365079364
... (63, 1)
rmsep(pred, ytest)
1×1 Matrix{Float64}:
1.525067098723295
bias(pred, ytest)
1×1 Matrix{Float64}:
0.1024592214663642
mse(pred, ytest)
| Row | nam | rmsep | sep | bias | cor2 | r2 | rpd | rpdr | mean | n |
|---|---|---|---|---|---|---|---|---|---|---|
| String | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Int64 | |
| 1 | y1 | 1.525 | 1.522 | 0.102 | 0.987 | 0.986 | 8.31 | 9.601 | 18.414 | 63 |
@head r = residreg(pred, ytest) # residuals
3×1 Matrix{Float64}:
-0.5952265873015925
-3.412776190476191
-0.9806543650793644
... (63, 1)
plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = "Test set - variable $nam", xlabel = "Prediction", ylabel = "Observed").f
Residuals
plotxy(ytest, r; size = (500, 400), color = (:red, .5), zeros = true, title = "Test set - variable $nam", xlabel = "Observed (Test)", ylabel = "Residuals").f
zpred = vec(pred) zr = vec(r)
63-element Vector{Float64}:
-0.5952265873015925
-3.412776190476191
-0.9806543650793644
0.2953337301587293
-0.09798928571428789
2.226941666666665
-0.9891309523809468
2.5479119047619037
-2.706648015873016
-3.3459666666666656
⋮
0.589401587301591
0.9979329365079366
-1.9505777777777737
-1.2082230158730134
0.4543837301587317
-0.028722222222221205
0.3284353174603112
-2.281190476190474
-0.13602023809523445
Predictions
model_lo = loessr(span = 1/2) fit!(model_lo, zpred, ytest) pred_lo = predict(model_lo, sort(zpred)).pred f, ax = plotxy(zpred, ytest; size = (500, 400), bisect = true, title = "Test set - variable $nam", xlabel = "Predicted", ylabel = "Observed") lines!(ax, sort(zpred), vec(pred_lo); color = :red) f
Residuals
model_lo = loessr(span = 1/2) fit!(model_lo, zpred, zr) pred_lo = predict(model_lo, sort(zpred)).pred f, ax = plotxy(zpred, zr; size = (500, 400), title = "Test set - variable $nam", xlabel = "Predictions", ylabel = "Residuals") lines!(ax, sort(zpred), vec(pred_lo); color = :red) hlines!(ax, 0; color = :grey) f