challenge2018 - Data description

using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie
using FreqTables

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/challenge2018.jld2") 
@load db dat
@names dat
(:X, :Y)
X = dat.X
@head X
... (4075, 680)
3×680 DataFrame
580 columns omitted
Row1120112211241126112811301132113411361138114011421144114611481150115211541156115811601162116411661168117011721174117611781180118211841186118811901192119411961198120012021204120612081210121212141216121812201222122412261228123012321234123612381240124212441246124812501252125412561258126012621264126612681270127212741276127812801282128412861288129012921294129612981300130213041306130813101312131413161318
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.5974820.5959780.5936230.590840.5874510.5830920.5786660.5721340.5661250.5602040.5516940.5443390.5370080.5283120.5208280.512280.5046730.4982040.489680.4831760.4775880.4703840.4648440.4592690.4540230.44960.4442710.4400260.4365140.4318220.4282010.4248810.4213790.418590.4155560.4131930.4115010.409460.4082540.4075030.4069480.4069030.4072110.4079410.4088350.4104950.4122110.414120.4166910.4191780.4219020.4246320.4270360.430220.4328090.4349960.4378230.4399630.4420410.4439820.4455780.4473950.4487850.4498290.4511650.4521310.4530350.4538980.4545730.4553250.4559320.4564390.457060.4575880.4581210.4587120.4592530.4599680.4606280.4612450.4621040.4628490.4636040.4643670.4649760.4656140.4660880.4663150.4665680.4665530.4663940.4659810.4654640.4645220.4635110.4623480.4606480.4588070.4567570.454314
20.9541920.9532370.9520020.9504260.9483820.9461380.9435430.940570.9375140.9343360.9311810.928030.9249790.9220760.9192040.9162520.9132280.910060.9065910.9029340.8990690.8949740.8907860.8865130.8822970.8781780.8740640.8701320.8664040.8627240.8593570.8562110.8533590.8509620.8489010.847420.8463610.8456150.8451780.8450540.8450310.8452450.8455490.8459770.8464630.8471180.8478080.8485980.8493670.8502030.8511150.851950.8527740.8536230.8546170.8555110.8564260.8572820.8582280.8591450.8600360.8608690.8617740.8626260.8634310.8641850.8647860.8652540.8656120.8659360.8661310.8662940.86630.8662690.866280.8663260.8664430.8666140.8669670.8673660.8678520.8685330.8692120.8698940.8705840.8712960.8720080.8726120.8731830.8736330.8740320.8743370.874550.874610.8746560.8745540.874320.8739820.8735130.872882
30.6111370.6095660.607430.6047670.6014340.5973160.5925980.5869920.5807410.5741890.5669430.5593810.5517730.5437530.5359770.528180.5204820.5134550.5065980.5002360.4944690.4889220.4835790.4785120.4733640.4686190.4639140.4593650.4553310.4513570.4477380.4443930.44120.438310.4356520.4331470.4311420.4294170.4281370.4273580.4270510.4271960.4278310.4288730.430280.4320350.4341760.4364010.4389860.4415280.4442080.4469580.4495880.4523390.4550440.4575990.4601460.4625680.4647610.4669010.4687190.470450.4720010.4733090.474530.4756240.4765670.4774690.4782180.4789390.4795890.4801720.4807710.4813250.481870.4824730.4830490.4837020.4844010.4850880.4858350.4865430.4872390.4879190.4884780.4889710.4893650.4895970.4896970.489580.4893090.4888070.4881280.4871650.4859270.4845020.482770.4807470.478510.475817
Y = dat.Y
@head Y
... (4075, 4)
3×4 DataFrame
Rowtyplabelconctest
StringStringFloat64Int64
1FRGwheat (ung)12.740
2MPWmilk powder & whey35.72120
3FRGwheat (ung)12.00

X data

wlst = names(X) 
wl = parse.(Int, wlst)
680-element Vector{Int64}:
 1120
 1122
 1124
 1126
 1128
 1130
 1132
 1134
 1136
 1138
    ⋮
 2462
 2464
 2466
 2468
 2470
 2472
 2474
 2476
 2478
plotsp(X, wl; nsamp = 500, xlabel = "Wavelength (nm)").f

Preprocessing by SNV and derivation

model1 = snv()
model2 = savgol(npoint = 21, deriv = 2, degree = 3)
model = pip(model1, model2)
fit!(model, X)
Xp = transf(model, X)
@head Xp
3×680 Matrix{Float64}:
 -0.00393533  -0.00441755  -0.00477681  …  0.000514478  0.000481081
 -0.00121436  -0.0013095   -0.00135921     0.000996321  0.000930884
 -0.00355712  -0.00399785  -0.00434838     0.00044084   0.000421751
... (4075, 680)
plotsp(Xp, wl; nsamp = 500, xlabel = "Wavelength (nm)").f

Y data

typ = Y.typ
freqtable(string.(typ, " - ", Y.label))
10-element Named Vector{Int64}
Dim1                      │ 
──────────────────────────┼────
ANF - animal feed         │ 391
CLZ - rapeseed(ung)       │ 420
CNG - corn gluten         │ 395
EHH - grass silage        │ 422
FFS - full fat soya       │ 432
FRG - wheat (ung)         │ 411
MPW - milk powder & whey  │ 410
PEE - maize wp            │ 407
SFG - sun flower seed(gr) │ 281
TTS - soya meal           │ 506
test = Y.test  # training/test (0/1) observations
tab(test) 
freqtable(typ, test)
10×2 Named Matrix{Int64}
Dim1 ╲ Dim2 │   0    1
────────────┼─────────
ANF         │ 351   40
CLZ         │ 378   42
CNG         │ 356   39
EHH         │ 380   42
FFS         │ 397   35
FRG         │ 371   40
MPW         │ 372   38
PEE         │ 367   40
SFG         │ 272    9
TTS         │ 457   49
y = Y.conc # protein concentration
summ(y).res
1×7 DataFrame
Rowvariablemeanstdminmaxnnmissing
SymbolFloat64Float64Float64Float64Int64Int64
1x131.9320.3482.76676.60440750
summ(y, test)
Class: 0
1×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ x1         31.894   20.297    3.061   76.604   3701         0


Class: 1
1×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ x1         32.288   20.874    2.766  75.8559    374         0
s = test .== 0
ytrain = y[s]  
ytest = rmrow(y, s)
374-element Vector{Float64}:
 19.7999992
  6.71
 11.8599997
 53.4160156
 27.4345245
 54.562149
 40.206295
 21.3299999
  8.4799995
 20.5900002
  ⋮
 65.1012344
 56.1860695
 50.8610802
  8.1399994
 13.3100004
 11.6800003
 18.2399998
 67.6700745
 25.0300007
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = "Protein", ylabel = "Nb. observations")
hist!(ax, ytrain; bins = 50, label = "Train")
hist!(ax, ytest; bins = 50, label = "Test")
axislegend(position = :rt)
f
f = Figure(size = (500, 400))
offs = [70; 0]
ax = Axis(f[1, 1]; xlabel = "Protein",  ylabel = "Nb. observations", 
    yticks = (offs, ["Train"; "Test"]))
hist!(ax, ytrain; offset = offs[1], bins = 50)
hist!(ax, ytest; offset = offs[2], bins = 50)
f
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = "Protein", ylabel = "Density")
bdw = .5
density!(ax, ytrain; bandwidth = bdw, color = :blue, label = "Train")
density!(ax, ytest; bandwidth = bdw, color = (:red, .5), label = "Test")
axislegend(position = :rt)  
f
f = Figure(size = (400, 300))
offs = [.10; 0]
ax = Axis(f[1, 1]; xlabel = "Protein", ylabel = "Density", 
    yticks = (offs, ["Train"; "Test"]))
bdw = .5
density!(ax, ytrain; bandwidth = bdw, offset = offs[1], color = (:slategray, 0.5))
density!(ax, ytest; bandwidth = bdw, offset = offs[2], color = (:slategray, 0.5))
f
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xticks = (0:1, ["Train", "Test"]), xlabel = "Group", ylabel = "Protein")
boxplot!(ax, test, y; width = .3, show_notch = true)
f