Manifold - challenge2018 - Pca

using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie, GLMakie
using FreqTables

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/challenge2018.jld2") 
@load db dat
@names dat
(:X, :Y)
X = dat.X
@head X
... (4075, 680)
3×680 DataFrame
580 columns omitted
Row1120112211241126112811301132113411361138114011421144114611481150115211541156115811601162116411661168117011721174117611781180118211841186118811901192119411961198120012021204120612081210121212141216121812201222122412261228123012321234123612381240124212441246124812501252125412561258126012621264126612681270127212741276127812801282128412861288129012921294129612981300130213041306130813101312131413161318
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.5974820.5959780.5936230.590840.5874510.5830920.5786660.5721340.5661250.5602040.5516940.5443390.5370080.5283120.5208280.512280.5046730.4982040.489680.4831760.4775880.4703840.4648440.4592690.4540230.44960.4442710.4400260.4365140.4318220.4282010.4248810.4213790.418590.4155560.4131930.4115010.409460.4082540.4075030.4069480.4069030.4072110.4079410.4088350.4104950.4122110.414120.4166910.4191780.4219020.4246320.4270360.430220.4328090.4349960.4378230.4399630.4420410.4439820.4455780.4473950.4487850.4498290.4511650.4521310.4530350.4538980.4545730.4553250.4559320.4564390.457060.4575880.4581210.4587120.4592530.4599680.4606280.4612450.4621040.4628490.4636040.4643670.4649760.4656140.4660880.4663150.4665680.4665530.4663940.4659810.4654640.4645220.4635110.4623480.4606480.4588070.4567570.454314
20.9541920.9532370.9520020.9504260.9483820.9461380.9435430.940570.9375140.9343360.9311810.928030.9249790.9220760.9192040.9162520.9132280.910060.9065910.9029340.8990690.8949740.8907860.8865130.8822970.8781780.8740640.8701320.8664040.8627240.8593570.8562110.8533590.8509620.8489010.847420.8463610.8456150.8451780.8450540.8450310.8452450.8455490.8459770.8464630.8471180.8478080.8485980.8493670.8502030.8511150.851950.8527740.8536230.8546170.8555110.8564260.8572820.8582280.8591450.8600360.8608690.8617740.8626260.8634310.8641850.8647860.8652540.8656120.8659360.8661310.8662940.86630.8662690.866280.8663260.8664430.8666140.8669670.8673660.8678520.8685330.8692120.8698940.8705840.8712960.8720080.8726120.8731830.8736330.8740320.8743370.874550.874610.8746560.8745540.874320.8739820.8735130.872882
30.6111370.6095660.607430.6047670.6014340.5973160.5925980.5869920.5807410.5741890.5669430.5593810.5517730.5437530.5359770.528180.5204820.5134550.5065980.5002360.4944690.4889220.4835790.4785120.4733640.4686190.4639140.4593650.4553310.4513570.4477380.4443930.44120.438310.4356520.4331470.4311420.4294170.4281370.4273580.4270510.4271960.4278310.4288730.430280.4320350.4341760.4364010.4389860.4415280.4442080.4469580.4495880.4523390.4550440.4575990.4601460.4625680.4647610.4669010.4687190.470450.4720010.4733090.474530.4756240.4765670.4774690.4782180.4789390.4795890.4801720.4807710.4813250.481870.4824730.4830490.4837020.4844010.4850880.4858350.4865430.4872390.4879190.4884780.4889710.4893650.4895970.4896970.489580.4893090.4888070.4881280.4871650.4859270.4845020.482770.4807470.478510.475817
Y = dat.Y
@head Y
... (4075, 4)
3×4 DataFrame
Rowtyplabelconctest
StringStringFloat64Int64
1FRGwheat (ung)12.740
2MPWmilk powder & whey35.72120
3FRGwheat (ung)12.00
wlst = names(X) 
wl = parse.(Int, wlst)
680-element Vector{Int64}:
 1120
 1122
 1124
 1126
 1128
 1130
 1132
 1134
 1136
 1138
    ⋮
 2462
 2464
 2466
 2468
 2470
 2472
 2474
 2476
 2478
plotsp(X, wl; nsamp = 500, xlabel = "Wavelength (nm)").f

Preprocessing by SNV and derivation

model1 = snv()
model2 = savgol(npoint = 21, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
Xp = transf(model, X)
@head Xp
3×680 Matrix{Float64}:
 -0.00393533  -0.00441755  -0.00477681  …  0.000514478  0.000481081
 -0.00121436  -0.0013095   -0.00135921     0.000996321  0.000930884
 -0.00355712  -0.00399785  -0.00434838     0.00044084   0.000421751
... (4075, 680)
plotsp(Xp, wl; nsamp = 500, xlabel = "Wavelength (nm)").f

Types of materials

typ = Y.typ
freqtable(string.(typ, " - ", Y.label))
10-element Named Vector{Int64}
Dim1                      │ 
──────────────────────────┼────
ANF - animal feed         │ 391
CLZ - rapeseed(ung)       │ 420
CNG - corn gluten         │ 395
EHH - grass silage        │ 422
FFS - full fat soya       │ 432
FRG - wheat (ung)         │ 411
MPW - milk powder & whey  │ 410
PEE - maize wp            │ 407
SFG - sun flower seed(gr) │ 281
TTS - soya meal           │ 506
test = Y.test  # training/test (0/1) observations
tab(test) 
freqtable(typ, test)
10×2 Named Matrix{Int64}
Dim1 ╲ Dim2 │   0    1
────────────┼─────────
ANF         │ 351   40
CLZ         │ 378   42
CNG         │ 356   39
EHH         │ 380   42
FFS         │ 397   35
FRG         │ 371   40
MPW         │ 372   38
PEE         │ 367   40
SFG         │ 272    9
TTS         │ 457   49

Split Train/Test from Tot

s = Bool.(test) # same as: s = Y.test .== 1
Xtrain = rmrow(Xp, s)
typtrain = rmrow(typ, s)
Xtest = Xp[s, :]
typtest = typ[s]
ntot = nro(X)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
(ntot = ntot, ntrain, ntest)
(ntot = 4075, ntrain = 3701, ntest = 374)

Pca on Train

nlv = 3
model = pcasvd(; nlv)
fit!(model, Xtrain)
@head T = model.fitm.T
3×3 Matrix{Float64}:
 -0.00444931   0.0171604    0.00192674
 -0.0119228   -0.00286706   0.00500189
 -0.00391203   0.0180218   -0.000975336
... (3701, 3)
CairoMakie.activate!()  
#GLMakie.activate!()    # for interactive axe-rotation
i = 1
plotxyz(T[:, i], T[:, i + 1], T[:, i + 2]; size = (600, 500), color = (:red, .3), markersize = 10, 
    xlabel = string("PC", i), ylabel = string("PC", i + 1), zlabel = string("PC", i + 2), 
    title = "Pca score space").f
lev = mlev(typtrain)
nlev = length(lev)
colm = cgrad(:tab10, nlev; categorical = true, alpha = .5)
i = 1
plotxyz(T[:, i], T[:, i + 1], T[:, i + 2], typtrain; size = (700, 500), color = colm, markersize = 10, 
    xlabel = string("PC", i), ylabel = string("PC", i + 1), zlabel = string("PC", i + 2), 
    title = "Pca score space").f

Projection of the test observations

@head Ttest = transf(model, Xtest)
3×3 Matrix{Float64}:
  0.0126087    0.0102876    0.00597325
 -0.00690901  -0.00507066   0.00472288
 -0.00284267   0.0236669   -7.55497e-5
... (374, 3)
f, ax = plotxyz(T[:, i], T[:, i + 1], T[:, i + 2], typtrain; size = (700, 500), color = colm, markersize = 10, 
    xlabel = string("PC", i), ylabel = string("PC", i + 1), zlabel = string("PC", i + 2), 
    title = "Pca score space")
scatter!(ax, Ttest[:, i], Ttest[:, i + 1], Ttest[:, i + 2]; markersize = 6, color = :black)
f