Prediction - tecator - Rfr

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/tecator.jld2") 
@load db dat
@names dat
(:X, :Y)
X = dat.X
Y = dat.Y 
ntot, p = size(X)
typ = Y.typ
tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32
wlst = names(X)
wl = parse.(Float64, wlst) 
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
100-element Vector{Float64}:
  850.0
  852.0
  854.0
  856.0
  858.0
  860.0
  862.0
  864.0
  866.0
  868.0
    ⋮
 1032.0
 1034.0
 1036.0
 1038.0
 1040.0
 1042.0
 1044.0
 1046.0
 1048.0

Preprocessing

model1 = snv()
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
@head Xp = transf(model, X) 
#plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)

Split Tot to Train/Test

s = typ .== "train"
Xtrain = Xp[s, :] 
Ytrain = Y[s, :]
Xtest = rmrow(Xp, s)
Ytest = rmrow(Y, s)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
(ntot = ntot, ntrain, ntest)
(ntot = 178, ntrain = 115, ntest = 63)

Working response y

namy = names(Y)[1:3]
j = 2  
nam = namy[j]    # work on the j-th y-variable
ytrain = Ytrain[:, nam]
ytest = Ytest[:, nam]
63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5

Model fitting

n_trees = 100
partial_sampling = .7
n_subfeatures = p / 3
max_depth = 20
model = rfr(; n_trees, partial_sampling, n_subfeatures, max_depth)
fit!(model, Xtrain, ytrain)
@names model
fitm = model.fitm
@names fitm
(:fitm, :xscales, :featur, :par)

Predictions

@head pred = predict(model, Xtest).pred
3×1 Matrix{Float64}:
 30.395226587301593
  4.812776190476191
  5.580654365079364
... (63, 1)
rmsep(pred, ytest)
1×1 Matrix{Float64}:
 1.525067098723295
bias(pred, ytest)
1×1 Matrix{Float64}:
 0.1024592214663642
mse(pred, ytest)
1×10 DataFrame
Rownamrmsepsepbiascor2r2rpdrpdrmeann
StringFloat64Float64Float64Float64Float64Float64Float64Float64Int64
1y11.5251.5220.1020.9870.9868.319.60118.41463
@head r = residreg(pred, ytest) # residuals
3×1 Matrix{Float64}:
 -0.5952265873015925
 -3.412776190476191
 -0.9806543650793644
... (63, 1)

Plotting predictions vs. observed data

plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = "Test set - variable $nam", 
    xlabel = "Prediction", ylabel = "Observed").f

Residuals

plotxy(ytest, r; size = (500, 400), color = (:red, .5), zeros = true, title = "Test set - variable $nam", 
    xlabel = "Observed (Test)", ylabel = "Residuals").f

Adding a smoothing

zpred = vec(pred)
zr = vec(r)
63-element Vector{Float64}:
 -0.5952265873015925
 -3.412776190476191
 -0.9806543650793644
  0.2953337301587293
 -0.09798928571428789
  2.226941666666665
 -0.9891309523809468
  2.5479119047619037
 -2.706648015873016
 -3.3459666666666656
  ⋮
  0.589401587301591
  0.9979329365079366
 -1.9505777777777737
 -1.2082230158730134
  0.4543837301587317
 -0.028722222222221205
  0.3284353174603112
 -2.281190476190474
 -0.13602023809523445

Predictions

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, ytest)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, ytest; size = (500, 400), bisect = true, title = "Test set - variable $nam", 
    xlabel = "Predicted", ylabel = "Observed")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
f

Residuals

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, zr)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, zr; size = (500, 400), title = "Test set - variable $nam", xlabel = "Predictions", 
    ylabel = "Residuals")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
hlines!(ax, 0; color = :grey)
f