using Jchemo, JchemoData using JLD2, CairoMakie
path_jdat = dirname(dirname(pathof(JchemoData))) db = joinpath(path_jdat, "data/tecator.jld2") @load db dat @names dat
(:X, :Y)
X = dat.X Y = dat.Y ntot, p = size(X) typ = Y.typ tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries: "test" => 31 "train" => 115 "val" => 32
wlst = names(X) wl = parse.(Float64, wlst) #plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
100-element Vector{Float64}: 850.0 852.0 854.0 856.0 858.0 860.0 862.0 864.0 866.0 868.0 ⋮ 1032.0 1034.0 1036.0 1038.0 1040.0 1042.0 1044.0 1046.0 1048.0
Preprocessing
model1 = snv() model2 = savgol(npoint = 15, deriv = 2, degree = 3) model = pip(model1, model2) fit!(model, X) @head Xp = transf(model, X) #plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
3×100 Matrix{Float64}: 0.000397076 0.000495203 0.000598623 … 0.00827138 0.00917311 0.00946072 0.00242055 0.00244366 0.00234233 0.00580631 0.00689249 0.00749316 0.0011927 0.00122721 0.00120098 0.0101019 0.0108142 0.0108444 ... (178, 100)
s = typ .== "train" Xtrain = Xp[s, :] Ytrain = Y[s, :] Xtest = rmrow(Xp, s) Ytest = rmrow(Y, s) ntrain = nro(Xtrain) ntest = nro(Xtest) (ntot = ntot, ntrain, ntest)
(ntot = 178, ntrain = 115, ntest = 63)
Working response y
namy = names(Y)[1:3] j = 2 nam = namy[j] # work on the j-th y-variable ytrain = Ytrain[:, nam] ytest = Ytest[:, nam]
63-element Vector{Float64}: 29.8 1.4 4.6 11.0 17.0 22.4 27.9 46.5 6.1 2.0 ⋮ 18.1 19.4 24.8 27.2 28.4 31.3 33.8 35.5 42.5
n_trees = 100 partial_sampling = .7 n_subfeatures = p / 3 max_depth = 20 model = rfr(; n_trees, partial_sampling, n_subfeatures, max_depth) fit!(model, Xtrain, ytrain) @names model fitm = model.fitm @names fitm
(:fitm, :xscales, :featur, :par)
@head pred = predict(model, Xtest).pred
3×1 Matrix{Float64}: 29.969889285714284 4.99329880952381 5.798538492063492 ... (63, 1)
rmsep(pred, ytest)
1×1 Matrix{Float64}: 1.5464075886596087
bias(pred, ytest)
1×1 Matrix{Float64}: 0.03305490677752686
mse(pred, ytest)
Row | nam | rmsep | sep | bias | cor2 | r2 | rpd | rpdr | mean | n |
---|---|---|---|---|---|---|---|---|---|---|
String | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Int64 | |
1 | y1 | 1.546 | 1.546 | 0.033 | 0.987 | 0.985 | 8.195 | 10.17 | 18.414 | 63 |
@head r = residreg(pred, ytest) # residuals
3×1 Matrix{Float64}: -0.16988928571428374 -3.59329880952381 -1.1985384920634923 ... (63, 1)
plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = string("Test set - variable ", nam), xlabel = "Prediction", ylabel = "Observed").f
Residuals
plotxy(ytest, r; size = (500, 400), color = (:red, .5), zeros = true, title = string("Test set - variable ", nam), xlabel = "Observed (Test)", ylabel = "Residuals").f
zpred = vec(pred) zr = vec(r)
63-element Vector{Float64}: -0.16988928571428374 -3.59329880952381 -1.1985384920634923 0.09710079365079238 0.9599015873015873 3.046315873015871 -0.7964654761904804 2.476778571428568 -3.7117234126984133 -3.330197222222223 ⋮ 0.5688615079365071 1.005428968253966 -2.8414476190476208 -0.45588611111111277 0.5691015873015886 -0.19207738095238014 0.20291785714285027 -1.6085869047619 -0.10876944444444803
Predictions
model_lo = loessr(span = 1/2) fit!(model_lo, zpred, ytest) pred_lo = predict(model_lo, sort(zpred)).pred f, ax = plotxy(zpred, ytest; size = (500, 400), bisect = true, title = string("Test set - variable ", nam), xlabel = "Predicted", ylabel = "Observed") lines!(ax, sort(zpred), vec(pred_lo); color = :red) f
Residuals
model_lo = loessr(span = 1/2) fit!(model_lo, zpred, zr) pred_lo = predict(model_lo, sort(zpred)).pred f, ax = plotxy(zpred, zr; size = (500, 400), title = string("Test set - variable ", nam), xlabel = "Predictions", ylabel = "Residuals") lines!(ax, sort(zpred), vec(pred_lo); color = :red) hlines!(ax, 0; color = :grey) f