Prediction - tecator - Plsr

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/tecator.jld2") 
@load db dat
@names dat
(:X, :Y)
X = dat.X
@head X
... (178, 100)
3×100 DataFrame
Row8508528548568588608628648668688708728748768788808828848868888908928948968989009029049069089109129149169189209229249269289309329349369389409429449469489509529549569589609629649669689709729749769789809829849869889909929949969981000100210041006100810101012101410161018102010221024102610281030103210341036103810401042104410461048
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
12.617762.618142.618592.619122.619812.620712.621862.623342.625112.627222.629642.632452.635652.639332.643532.648252.65352.659372.665852.672812.680082.687332.694272.700732.706842.712812.719142.726282.734622.744162.754662.765682.776792.78792.799492.812252.827062.843562.861062.878572.894972.909242.920852.930152.938462.947712.960192.978313.003063.035063.074283.119633.168683.217713.262543.299883.328473.348993.363423.373793.381523.387413.391643.394183.39493.393663.390453.385413.378693.370413.360733.349793.337693.324433.310133.294873.278913.262323.245423.228283.21083.192873.174333.155033.134753.113393.091163.06853.045963.023933.002472.981452.960722.940132.919782.899662.879642.85962.83942.8192
22.834542.838712.842832.847052.851382.855872.86062.865662.870932.876612.882642.888982.895772.903082.910972.919532.928732.938632.949292.960722.972722.984932.99693.008333.01923.02993.041013.053453.067773.084163.102213.121063.139833.15813.176233.195193.215843.237473.258893.278353.293843.303623.306813.303933.2973.289253.284093.285053.293263.309233.332673.362513.396613.431883.464923.492953.514583.530043.540673.547973.553063.556753.559213.560453.560343.558763.555713.551323.545853.53953.532353.524423.515833.506683.4973.486833.476263.465523.455013.444813.434773.424653.414193.403033.390823.377313.362653.347453.332453.318183.304733.291863.279213.266553.253693.240453.226593.211813.1963.17942
32.582842.584582.586292.588082.589962.591922.594012.596272.598732.601312.604142.607142.610292.613612.617142.620892.624862.629092.633612.638352.64332.648382.653542.65872.663752.66882.673832.678922.684112.689372.69472.700122.705632.711412.717752.72492.733442.743272.754332.766422.779312.792722.806492.820642.835412.851212.868722.889052.912892.940882.973253.009463.04783.085543.119473.146963.166773.179383.186313.189243.18953.188013.184983.180393.174113.166113.156413.145123.132413.118433.103293.087143.070143.052373.033933.015042.995692.976122.956422.93662.916672.896552.876222.855632.834742.813612.792352.771132.750152.729562.709342.689512.670092.651122.632622.614612.597182.580342.564042.54816
Y = dat.Y 
@head Y
... (178, 4)
3×4 DataFrame
Rowwaterfatproteintyp
Float64Float64Float64String
160.522.516.7train
246.040.113.5train
371.08.420.5train
typ = Y.typ
tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32
wlst = names(X)
wl = parse.(Float64, wlst) 
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
100-element Vector{Float64}:
  850.0
  852.0
  854.0
  856.0
  858.0
  860.0
  862.0
  864.0
  866.0
  868.0
    ⋮
 1032.0
 1034.0
 1036.0
 1038.0
 1040.0
 1042.0
 1044.0
 1046.0
 1048.0

Preprocessing

model1 = snv()
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
@head Xp = transf(model, X) 
#plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)

Split Tot to Train/Test

The model is fitted on Train, and the generalization error is estimated on Test. In this example, Train is already defined in variable typ of the dataset, and Test is defined by the remaining samples. But Tot could also be split a posteriori, for instance by sampling (random, systematic or any other designs). See for instance functions samprand, sampsys, etc.

s = typ .== "train"
Xtrain = Xp[s, :] 
Ytrain = Y[s, :]
Xtest = rmrow(Xp, s)
Ytest = rmrow(Y, s)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
ntot = ntrain + ntest
(ntot = ntot, ntrain, ntest)
(ntot = 178, ntrain = 115, ntest = 63)

Working response y

namy = names(Y)[1:3]
j = 2  
nam = namy[j]    # work on the j-th y-variable
ytrain = Ytrain[:, nam]
ytest = Ytest[:, nam]
63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5

Model fitting

nlv = 15
model = plskern(; nlv)    # which is the same as below:
#model = plskern(nlv = 15)
fit!(model, Xtrain, ytrain)
@names model
fitm = model.fitm
@names fitm
(:T, :V, :R, :W, :C, :TT, :xmeans, :xscales, :ymeans, :yscales, :weights, :niter, :par)

Predictions

@head pred = predict(model, Xtest).pred
3×1 Matrix{Float64}:
 28.05843593599626
  1.8059651842087483
  2.297135360637249
... (63, 1)
@head predict(model, Xtest; nlv = 2).pred
3×1 Matrix{Float64}:
 27.142800799929525
  3.37178013909228
  6.084176150483991
... (63, 1)
predict(model, Xtest; nlv = 0:2).pred
3-element Vector{Matrix{Float64}}:
 [18.358260869565218; 18.358260869565218; … ; 18.358260869565218; 18.358260869565218;;]
 [26.353438531759274; 3.578347466003631; … ; 35.282778241658455; 44.65310273731437;;]
 [27.142800799929525; 3.37178013909228; … ; 36.85307791183286; 46.551103183578405;;]
rmsep(pred, ytest)
1×1 Matrix{Float64}:
 2.2224756041608575
bias(pred, ytest)
1×1 Matrix{Float64}:
 -0.4081002705409793
mse(pred, ytest)
1×10 DataFrame
Rownamrmsepsepbiascor2r2rpdrpdrmeann
StringFloat64Float64Float64Float64Float64Float64Float64Float64Int64
1y12.2222.185-0.4080.9710.9695.7026.82118.41463
@head r = residreg(pred, ytest) # residuals
3×1 Matrix{Float64}:
  1.7415640640037395
 -0.4059651842087484
  2.3028646393627508
... (63, 1)

Plotting predictions vs. observed data

plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Prediction", ylabel = "Observed").f

Residuals

plotxy(ytest, r; size = (500, 400), color = (:red, .5), zeros = true, title = string("Test set - variable ", nam), 
    xlabel = "Observed (Test)", ylabel = "Residuals").f

Adding a smoothing

zpred = vec(pred)
zr = vec(r)
63-element Vector{Float64}:
  1.7415640640037395
 -0.4059651842087484
  2.3028646393627508
  2.4215292916618516
  2.689903568485679
  3.4334825776448987
  4.790417747930672
 -1.9506463671834382
 -2.433355667380658
  0.009360198067156489
  ⋮
  1.1841828198013395
  1.8171841946273588
  3.971694560641872
 -0.8114041015682361
  2.751311771673823
 -4.068890178764331
  1.100108549291022
 -2.2212746063934787
 -2.1817678223679877

Predictions

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, ytest)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, ytest; size = (500, 400), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Predicted", ylabel = "Observed")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
f

Residuals

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, zr)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, zr; size = (500, 400), title = string("Test set - variable ", nam), 
    xlabel = "Predictions", ylabel = "Residuals")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
hlines!(ax, 0; color = :grey)
f