Prediction - tecator - Kplsr

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/tecator.jld2") 
@load db dat
@names dat

(:X, :Y)

X = dat.X
@head X

... (178, 100)

3×100 DataFrame

Row	850	852	854	856	858	860	862	864	866	868	870	872	874	876	878	880	882	884	886	888	890	892	894	896	898	900	902	904	906	908	910	912	914	916	918	920	922	924	926	928	930	932	934	936	938	940	942	944	946	948	950	952	954	956	958	960	962	964	966	968	970	972	974	976	978	980	982	984	986	988	990	992	994	996	998	1000	1002	1004	1006	1008	1010	1012	1014	1016	1018	1020	1022	1024	1026	1028	1030	1032	1034	1036	1038	1040	1042	1044	1046	1048
	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64
1	2.61776	2.61814	2.61859	2.61912	2.61981	2.62071	2.62186	2.62334	2.62511	2.62722	2.62964	2.63245	2.63565	2.63933	2.64353	2.64825	2.6535	2.65937	2.66585	2.67281	2.68008	2.68733	2.69427	2.70073	2.70684	2.71281	2.71914	2.72628	2.73462	2.74416	2.75466	2.76568	2.77679	2.7879	2.79949	2.81225	2.82706	2.84356	2.86106	2.87857	2.89497	2.90924	2.92085	2.93015	2.93846	2.94771	2.96019	2.97831	3.00306	3.03506	3.07428	3.11963	3.16868	3.21771	3.26254	3.29988	3.32847	3.34899	3.36342	3.37379	3.38152	3.38741	3.39164	3.39418	3.3949	3.39366	3.39045	3.38541	3.37869	3.37041	3.36073	3.34979	3.33769	3.32443	3.31013	3.29487	3.27891	3.26232	3.24542	3.22828	3.2108	3.19287	3.17433	3.15503	3.13475	3.11339	3.09116	3.0685	3.04596	3.02393	3.00247	2.98145	2.96072	2.94013	2.91978	2.89966	2.87964	2.8596	2.8394	2.8192
2	2.83454	2.83871	2.84283	2.84705	2.85138	2.85587	2.8606	2.86566	2.87093	2.87661	2.88264	2.88898	2.89577	2.90308	2.91097	2.91953	2.92873	2.93863	2.94929	2.96072	2.97272	2.98493	2.9969	3.00833	3.0192	3.0299	3.04101	3.05345	3.06777	3.08416	3.10221	3.12106	3.13983	3.1581	3.17623	3.19519	3.21584	3.23747	3.25889	3.27835	3.29384	3.30362	3.30681	3.30393	3.297	3.28925	3.28409	3.28505	3.29326	3.30923	3.33267	3.36251	3.39661	3.43188	3.46492	3.49295	3.51458	3.53004	3.54067	3.54797	3.55306	3.55675	3.55921	3.56045	3.56034	3.55876	3.55571	3.55132	3.54585	3.5395	3.53235	3.52442	3.51583	3.50668	3.497	3.48683	3.47626	3.46552	3.45501	3.44481	3.43477	3.42465	3.41419	3.40303	3.39082	3.37731	3.36265	3.34745	3.33245	3.31818	3.30473	3.29186	3.27921	3.26655	3.25369	3.24045	3.22659	3.21181	3.196	3.17942
3	2.58284	2.58458	2.58629	2.58808	2.58996	2.59192	2.59401	2.59627	2.59873	2.60131	2.60414	2.60714	2.61029	2.61361	2.61714	2.62089	2.62486	2.62909	2.63361	2.63835	2.6433	2.64838	2.65354	2.6587	2.66375	2.6688	2.67383	2.67892	2.68411	2.68937	2.6947	2.70012	2.70563	2.71141	2.71775	2.7249	2.73344	2.74327	2.75433	2.76642	2.77931	2.79272	2.80649	2.82064	2.83541	2.85121	2.86872	2.88905	2.91289	2.94088	2.97325	3.00946	3.0478	3.08554	3.11947	3.14696	3.16677	3.17938	3.18631	3.18924	3.1895	3.18801	3.18498	3.18039	3.17411	3.16611	3.15641	3.14512	3.13241	3.11843	3.10329	3.08714	3.07014	3.05237	3.03393	3.01504	2.99569	2.97612	2.95642	2.9366	2.91667	2.89655	2.87622	2.85563	2.83474	2.81361	2.79235	2.77113	2.75015	2.72956	2.70934	2.68951	2.67009	2.65112	2.63262	2.61461	2.59718	2.58034	2.56404	2.54816

Y = dat.Y 
@head Y

... (178, 4)

3×4 DataFrame

Row	water	fat	protein	typ
	Float64	Float64	Float64	String
1	60.5	22.5	16.7	train
2	46.0	40.1	13.5	train
3	71.0	8.4	20.5	train

typ = Y.typ
tab(typ)

OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32

wlst = names(X)
wl = parse.(Float64, wlst) 
#plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

100-element Vector{Float64}:
  850.0
  852.0
  854.0
  856.0
  858.0
  860.0
  862.0
  864.0
  866.0
  868.0
    ⋮
 1032.0
 1034.0
 1036.0
 1038.0
 1040.0
 1042.0
 1044.0
 1046.0
 1048.0

Preprocessing

model1 = snv()
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
@head Xp = transf(model, X) 
#plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)

Split Tot to Train/Test

s = typ .== "train"
Xtrain = Xp[s, :] 
Ytrain = Y[s, :]
Xtest = rmrow(Xp, s)
Ytest = rmrow(Y, s)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
ntot = ntrain + ntest
(ntot = ntot, ntrain, ntest)

(ntot = 178, ntrain = 115, ntest = 63)

Working response y

namy = names(Y)[1:3]
j = 2  
nam = namy[j]    # work on the j-th y-variable
ytrain = Ytrain[:, nam]
ytest = Ytest[:, nam]

63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5

Model fitting

gamma = 100
nlv = 15
model = kplsr(; gamma, nlv)   # For direct KPLSR (Bennett & Embrechts, 2003), use as below:
#model = dkplsr(; gamma, nlv)  
fit!(model, Xtrain, ytrain)
@names model
fitm = model.fitm
@names fitm

(:X, :Kt, :T, :C, :U, :R, :DKt, :vtot, :xscales, :ymeans, :yscales, :weights, :iter, :kwargs, :par)

Predictions

@head pred = predict(model, Xtest).pred

3×1 Matrix{Float64}:
 30.27263234256255
  1.2491123455562665
  4.4710390881702065
... (63, 1)

@head predict(model, Xtest; nlv = 2).pred

3×1 Matrix{Float64}:
 29.32135166200425
  2.162005442116719
  5.685860978803078
... (63, 1)

predict(model, Xtest; nlv = 0:2).pred

3-element Vector{Matrix{Float64}}:
 [18.358260869565218; 18.358260869565218; … ; 18.358260869565218; 18.358260869565218;;]
 [26.864333587051988; 4.1217670557698955; … ; 36.33040026195786; 45.001339268847815;;]
 [29.32135166200425; 2.162005442116719; … ; 39.208838218241105; 46.42134811761902;;]

rmsep(pred, ytest)

1×1 Matrix{Float64}:
 0.5511112753976203

bias(pred, ytest)

1×1 Matrix{Float64}:
 -0.15413101976451102

mse(pred, ytest)

1×10 DataFrame

Row	nam	rmsep	sep	bias	cor2	r2	rpd	rpdr	mean	n
	String	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Int64
1	y1	0.551	0.529	-0.154	0.998	0.998	22.995	29.673	18.414	63

@head r = residreg(pred, ytest) # residuals

3×1 Matrix{Float64}:
 -0.47263234256254805
  0.15088765444373342
  0.1289609118297932
... (63, 1)

Plotting predictions vs. observed data

plotxy(pred, ytest; size = (500, 400), color = (:red, .5), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Prediction", ylabel = "Observed").f

Residuals

plotxy(ytest, r; size = (500, 400), color = (:red, .5), zeros = true, title = string("Test set - variable ", nam), 
    xlabel = "Observed (Test)", ylabel = "Residuals").f

Adding a smoothing

zpred = vec(pred)
zr = vec(r)

63-element Vector{Float64}:
 -0.47263234256254805
  0.15088765444373342
  0.1289609118297932
 -0.010875159198107553
  1.4489775124476338
  0.290389476158623
  0.2844942468389391
 -0.06310162831640298
  0.22063704374842352
  0.24732308729945274
  ⋮
  0.12741438205125277
 -0.1781680431607029
  0.9736514052289138
 -0.3613955588708784
 -0.02422859343892725
  1.34464900017079
  0.09247884206364887
 -0.5730256984462514
  0.16607155085765157

Predictions

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, ytest)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, ytest; size = (500, 400), bisect = true, title = string("Test set - variable ", nam), 
    xlabel = "Predicted", ylabel = "Observed")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
f

Residuals

model_lo = loessr(span = 1/2) 
fit!(model_lo, zpred, zr)
pred_lo = predict(model_lo, sort(zpred)).pred
f, ax = plotxy(zpred, zr; size = (500, 400), title = string("Test set - variable ", nam), 
    xlabel = "Predictions", ylabel = "Residuals")
lines!(ax, sort(zpred), vec(pred_lo); color = :red)
hlines!(ax, 0; color = :grey)
f