Prediction - tecator - Rfr

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/tecator.jld2") 
@load db dat
@names dat

(:X, :Y)

X = dat.X
Y = dat.Y 
ntot, p = size(X)
typ = Y.typ
tab(typ)

OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32

wlst = names(X)
wl = parse.(Float64, wlst) 
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

100-element Vector{Float64}:
  850.0
  852.0
  854.0
  856.0
  858.0
  860.0
  862.0
  864.0
  866.0
  868.0
    ⋮
 1032.0
 1034.0
 1036.0
 1038.0
 1040.0
 1042.0
 1044.0
 1046.0
 1048.0

Preprocessing

model1 = snv()
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
@head Xp = transf(model, X) 
#plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)

Split Tot to Train/Test

s = typ .== "train"
Xtrain = Xp[s, :] 
Ytrain = Y[s, :]
Xtest = rmrow(Xp, s)
Ytest = rmrow(Y, s)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
(ntot = ntot, ntrain, ntest)

(ntot = 178, ntrain = 115, ntest = 63)

Working response y

namy = names(Y)[1:3]
j = 2  
nam = namy[j]    # work on the j-th y-variable
ytrain = Ytrain[:, nam]
ytest = Ytest[:, nam]

63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5

Model fitting

n_trees = 100
partial_sampling = .7
n_subfeatures = p / 3
max_depth = 20
model = rfr(; n_trees, partial_sampling, n_subfeatures, max_depth)
fit!(model, Xtrain, ytrain)
@names model
fitm = model.fitm
@names fitm

(:fitm, :xscales, :featur, :par)

Predictions

@head pred = predict(model, Xtest).pred

3×1 Matrix{Float64}:
 29.969889285714284
  4.99329880952381
  5.798538492063492
... (63, 1)

rmsep(pred, ytest)

1×1 Matrix{Float64}:
 1.5464075886596087

bias(pred, ytest)

1×1 Matrix{Float64}:
 0.03305490677752686

mse(pred, ytest)

1×10 DataFrame

Row	nam	rmsep	sep	bias	cor2	r2	rpd	rpdr	mean	n
	String	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Int64
1	y1	1.546	1.546	0.033	0.987	0.985	8.195	10.17	18.414	63

@head r = residreg(pred, ytest) # residuals

3×1 Matrix{Float64}:
 -0.16988928571428374
 -3.59329880952381
 -1.1985384920634923
... (63, 1)

Plotting predictions vs. observed data

plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Prediction", ylabel = "Observed").f

Residuals

plotxy(ytest, r; size = (500, 400), color = (:red, .5), zeros = true, title = string("Test set - variable ", nam), 
    xlabel = "Observed (Test)", ylabel = "Residuals").f

Adding a smoothing

zpred = vec(pred)
zr = vec(r)

63-element Vector{Float64}:
 -0.16988928571428374
 -3.59329880952381
 -1.1985384920634923
  0.09710079365079238
  0.9599015873015873
  3.046315873015871
 -0.7964654761904804
  2.476778571428568
 -3.7117234126984133
 -3.330197222222223
  ⋮
  0.5688615079365071
  1.005428968253966
 -2.8414476190476208
 -0.45588611111111277
  0.5691015873015886
 -0.19207738095238014
  0.20291785714285027
 -1.6085869047619
 -0.10876944444444803

Predictions

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, ytest)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, ytest; size = (500, 400), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Predicted", ylabel = "Observed")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
f

Residuals

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, zr)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, zr; size = (500, 400), title = string("Test set - variable ", nam), 
    xlabel = "Predictions", ylabel = "Residuals")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
hlines!(ax, 0; color = :grey)
f