Prediction - tecator - Kplsr

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/tecator.jld2") 
@load db dat
@names dat
(:X, :Y)
X = dat.X
@head X
... (178, 100)
3×100 DataFrame
Row8508528548568588608628648668688708728748768788808828848868888908928948968989009029049069089109129149169189209229249269289309329349369389409429449469489509529549569589609629649669689709729749769789809829849869889909929949969981000100210041006100810101012101410161018102010221024102610281030103210341036103810401042104410461048
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
12.617762.618142.618592.619122.619812.620712.621862.623342.625112.627222.629642.632452.635652.639332.643532.648252.65352.659372.665852.672812.680082.687332.694272.700732.706842.712812.719142.726282.734622.744162.754662.765682.776792.78792.799492.812252.827062.843562.861062.878572.894972.909242.920852.930152.938462.947712.960192.978313.003063.035063.074283.119633.168683.217713.262543.299883.328473.348993.363423.373793.381523.387413.391643.394183.39493.393663.390453.385413.378693.370413.360733.349793.337693.324433.310133.294873.278913.262323.245423.228283.21083.192873.174333.155033.134753.113393.091163.06853.045963.023933.002472.981452.960722.940132.919782.899662.879642.85962.83942.8192
22.834542.838712.842832.847052.851382.855872.86062.865662.870932.876612.882642.888982.895772.903082.910972.919532.928732.938632.949292.960722.972722.984932.99693.008333.01923.02993.041013.053453.067773.084163.102213.121063.139833.15813.176233.195193.215843.237473.258893.278353.293843.303623.306813.303933.2973.289253.284093.285053.293263.309233.332673.362513.396613.431883.464923.492953.514583.530043.540673.547973.553063.556753.559213.560453.560343.558763.555713.551323.545853.53953.532353.524423.515833.506683.4973.486833.476263.465523.455013.444813.434773.424653.414193.403033.390823.377313.362653.347453.332453.318183.304733.291863.279213.266553.253693.240453.226593.211813.1963.17942
32.582842.584582.586292.588082.589962.591922.594012.596272.598732.601312.604142.607142.610292.613612.617142.620892.624862.629092.633612.638352.64332.648382.653542.65872.663752.66882.673832.678922.684112.689372.69472.700122.705632.711412.717752.72492.733442.743272.754332.766422.779312.792722.806492.820642.835412.851212.868722.889052.912892.940882.973253.009463.04783.085543.119473.146963.166773.179383.186313.189243.18953.188013.184983.180393.174113.166113.156413.145123.132413.118433.103293.087143.070143.052373.033933.015042.995692.976122.956422.93662.916672.896552.876222.855632.834742.813612.792352.771132.750152.729562.709342.689512.670092.651122.632622.614612.597182.580342.564042.54816
Y = dat.Y 
@head Y
... (178, 4)
3×4 DataFrame
Rowwaterfatproteintyp
Float64Float64Float64String
160.522.516.7train
246.040.113.5train
371.08.420.5train
typ = Y.typ
tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32
wlst = names(X)
wl = parse.(Float64, wlst) 
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
100-element Vector{Float64}:
  850.0
  852.0
  854.0
  856.0
  858.0
  860.0
  862.0
  864.0
  866.0
  868.0
    ⋮
 1032.0
 1034.0
 1036.0
 1038.0
 1040.0
 1042.0
 1044.0
 1046.0
 1048.0

Preprocessing

model1 = snv()
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
@head Xp = transf(model, X) 
#plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)

Split Tot to Train/Test

s = typ .== "train"
Xtrain = Xp[s, :] 
Ytrain = Y[s, :]
Xtest = rmrow(Xp, s)
Ytest = rmrow(Y, s)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
ntot = ntrain + ntest
(ntot = ntot, ntrain, ntest)
(ntot = 178, ntrain = 115, ntest = 63)

Working response y

namy = names(Y)[1:3]
j = 2  
nam = namy[j]    # work on the j-th y-variable
ytrain = Ytrain[:, nam]
ytest = Ytest[:, nam]
63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5

Model fitting

gamma = 100
nlv = 15
model = kplsr(; gamma, nlv)   # For direct KPLSR (Bennett & Embrechts, 2003), use as below:
#model = dkplsr(; gamma, nlv)  
fit!(model, Xtrain, ytrain)
@names model
fitm = model.fitm
@names fitm
(:X, :Kt, :T, :C, :U, :R, :DKt, :vtot, :xscales, :ymeans, :yscales, :weights, :iter, :kwargs, :par)

Predictions

@head pred = predict(model, Xtest).pred
3×1 Matrix{Float64}:
 30.27263234256255
  1.2491123455562665
  4.4710390881702065
... (63, 1)
@head predict(model, Xtest; nlv = 2).pred
3×1 Matrix{Float64}:
 29.32135166200425
  2.162005442116719
  5.685860978803078
... (63, 1)
predict(model, Xtest; nlv = 0:2).pred
3-element Vector{Matrix{Float64}}:
 [18.358260869565218; 18.358260869565218; … ; 18.358260869565218; 18.358260869565218;;]
 [26.864333587051988; 4.1217670557698955; … ; 36.33040026195786; 45.001339268847815;;]
 [29.32135166200425; 2.162005442116719; … ; 39.208838218241105; 46.42134811761902;;]
rmsep(pred, ytest)
1×1 Matrix{Float64}:
 0.5511112753976203
bias(pred, ytest)
1×1 Matrix{Float64}:
 -0.15413101976451102
mse(pred, ytest)
1×10 DataFrame
Rownamrmsepsepbiascor2r2rpdrpdrmeann
StringFloat64Float64Float64Float64Float64Float64Float64Float64Int64
1y10.5510.529-0.1540.9980.99822.99529.67318.41463
@head r = residreg(pred, ytest) # residuals
3×1 Matrix{Float64}:
 -0.47263234256254805
  0.15088765444373342
  0.1289609118297932
... (63, 1)

Plotting predictions vs. observed data

plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Prediction", ylabel = "Observed").f

Residuals

plotxy(ytest, r; size = (500, 400), color = (:red, .5), zeros = true, title = string("Test set - variable ", nam), 
    xlabel = "Observed (Test)", ylabel = "Residuals").f

Adding a smoothing

zpred = vec(pred)
zr = vec(r)
63-element Vector{Float64}:
 -0.47263234256254805
  0.15088765444373342
  0.1289609118297932
 -0.010875159198107553
  1.4489775124476338
  0.290389476158623
  0.2844942468389391
 -0.06310162831640298
  0.22063704374842352
  0.24732308729945274
  ⋮
  0.12741438205125277
 -0.1781680431607029
  0.9736514052289138
 -0.3613955588708784
 -0.02422859343892725
  1.34464900017079
  0.09247884206364887
 -0.5730256984462514
  0.16607155085765157

Predictions

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, ytest)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, ytest; size = (500, 400), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Predicted", ylabel = "Observed")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
f

Residuals

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, zr)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, zr; size = (500, 400), title = string("Test set - variable ", nam), 
    xlabel = "Predictions", ylabel = "Residuals")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
hlines!(ax, 0; color = :grey)
f