tecator - Data description

using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie

Data importation

## Root directory of the installed JchemoData package
path_jdat = pkgdir(JchemoData)
## Build the path component by component so that the separator is platform-independent
db = joinpath(path_jdat, "data", "tecator.jld2")
## Load the stored object `dat` from the JLD2 file into the current scope
@load db dat
## List the fields of dat
@names dat
(:X, :Y)
## Spectra table: one row per observation, one column per wavelength
X = dat.X
## Preview the first rows and print the dimensions
@head X
... (178, 100)
3×100 DataFrame
Row8508528548568588608628648668688708728748768788808828848868888908928948968989009029049069089109129149169189209229249269289309329349369389409429449469489509529549569589609629649669689709729749769789809829849869889909929949969981000100210041006100810101012101410161018102010221024102610281030103210341036103810401042104410461048
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
12.617762.618142.618592.619122.619812.620712.621862.623342.625112.627222.629642.632452.635652.639332.643532.648252.65352.659372.665852.672812.680082.687332.694272.700732.706842.712812.719142.726282.734622.744162.754662.765682.776792.78792.799492.812252.827062.843562.861062.878572.894972.909242.920852.930152.938462.947712.960192.978313.003063.035063.074283.119633.168683.217713.262543.299883.328473.348993.363423.373793.381523.387413.391643.394183.39493.393663.390453.385413.378693.370413.360733.349793.337693.324433.310133.294873.278913.262323.245423.228283.21083.192873.174333.155033.134753.113393.091163.06853.045963.023933.002472.981452.960722.940132.919782.899662.879642.85962.83942.8192
22.834542.838712.842832.847052.851382.855872.86062.865662.870932.876612.882642.888982.895772.903082.910972.919532.928732.938632.949292.960722.972722.984932.99693.008333.01923.02993.041013.053453.067773.084163.102213.121063.139833.15813.176233.195193.215843.237473.258893.278353.293843.303623.306813.303933.2973.289253.284093.285053.293263.309233.332673.362513.396613.431883.464923.492953.514583.530043.540673.547973.553063.556753.559213.560453.560343.558763.555713.551323.545853.53953.532353.524423.515833.506683.4973.486833.476263.465523.455013.444813.434773.424653.414193.403033.390823.377313.362653.347453.332453.318183.304733.291863.279213.266553.253693.240453.226593.211813.1963.17942
32.582842.584582.586292.588082.589962.591922.594012.596272.598732.601312.604142.607142.610292.613612.617142.620892.624862.629092.633612.638352.64332.648382.653542.65872.663752.66882.673832.678922.684112.689372.69472.700122.705632.711412.717752.72492.733442.743272.754332.766422.779312.792722.806492.820642.835412.851212.868722.889052.912892.940882.973253.009463.04783.085543.119473.146963.166773.179383.186313.189243.18953.188013.184983.180393.174113.166113.156413.145123.132413.118433.103293.087143.070143.052373.033933.015042.995692.976122.956422.93662.916672.896552.876222.855632.834742.813612.792352.771132.750152.729562.709342.689512.670092.651122.632622.614612.597182.580342.564042.54816
## Response table: water, fat, protein, and the train/val/test label typ
Y = dat.Y
## Preview the first rows and print the dimensions
@head Y
... (178, 4)
3×4 DataFrame
 Row │ water    fat      protein  typ
     │ Float64  Float64  Float64  String
─────┼───────────────────────────────────
   1 │    60.5     22.5     16.7  train
   2 │    46.0     40.1     13.5  train
   3 │    71.0      8.4     20.5  train

X data

## The column names of X are the wavelengths, stored as strings
wlst = names(X)
## Convert them to integers so they can serve as the x-axis of the spectra plots
wl = [parse(Int, w) for w in wlst]
100-element Vector{Int64}:
  850
  852
  854
  856
  858
  860
  862
  864
  866
  868
    ⋮
 1032
 1034
 1036
 1038
 1040
 1042
 1044
 1046
 1048
## Plot the raw spectra against wavelength; the field f of the result is the figure shown
plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

Preprocessing by SNV and derivation

## Step 1: SNV transformation
model1 = snv()
## Step 2: derivation (order 2) with a 15-point window and a degree-3 polynomial
## (presumably a Savitzky-Golay filter, given the function name — confirm in the Jchemo docs)
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
## Chain the two steps into a single pipeline
model = pip(model1, model2)
## Fit the pipeline on X (fit! mutates model), then apply it to X
fit!(model, X)
Xp = transf(model, X)
## Preview the preprocessed spectra (a Matrix, no longer a DataFrame)
@head Xp
3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)
## Plot the preprocessed spectra against wavelength
## NOTE(review): after SNV and second derivation the values are no longer raw absorbances;
## consider whether the "Absorbance" y-label is still appropriate
plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

Y data

## Summary statistics (mean, std, min, max, n, nmissing) for each column of Y
summ(Y).res
4×7 DataFrame
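 Row │ variable  mean    std     min   max   n      nmissing
     │ Symbol    Union…  Union…  Any   Any   Int64  Int64
─────┼───────────────────────────────────────────────────────
   1 │ water     63.046  9.784   39.3  76.6    178         0
   2 │ fat       18.378  12.629  0.9   49.1    178         0
   3 │ protein   17.634  2.996   11.0  21.8    178         0
   4 │ typ                       test  val     178         0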
## Label assigning each observation to "train", "val" or "test"
typ = Y.typ
## Number of observations in each category
tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32

Building a new variable, typ2, that merges the categories test and val into a single test category (coded 1), while train is coded 0

## Training/test (0/1) observations:
## 0 = "train", 1 = any other category ("test" and "val")
typ2 = Int.(typ .!= "train")
## Number of observations in each of the two classes
tab(typ2)
OrderedCollections.OrderedDict{Int64, Int64} with 2 entries:
  0 => 115
  1 => 63
## Summary statistics of Y computed separately for each class of typ2
summ(Y, typ2)
Class: 0
4×7 DataFrame
 Row │ variable  mean    std     min    max    n      nmissing
     │ Symbol    Union…  Union…  Any    Any    Int64  Int64
─────┼─────────────────────────────────────────────────────────
   1 │ water     63.142  9.846   39.3   76.6     115         0
   2 │ fat       18.358  12.605  0.9    49.1     115         0
   3 │ protein   17.597  2.989   11.0   21.8     115         0
   4 │ typ                       train  train    115         0


Class: 1
4×7 DataFrame
 Row │ variable  mean    std     min   max   n      nmissing
     │ Symbol    Union…  Union…  Any   Any   Int64  Int64
─────┼───────────────────────────────────────────────────────
   1 │ water     62.87   9.745   40.7  76.1     63         0
   2 │ fat       18.414  12.774  1.4   47.8     63         0
   3 │ protein   17.702  3.031   11.1  21.6     63         0
   4 │ typ                       test  val      63         0

A given variable

## Names of the three response variables (the fourth column of Y, typ, is the set label)
namy = names(Y)[1:3]
## Index of the response to study; change j to select another variable
j = 2
nam = namy[j]
y = Y[:, nam]
## Training observations are coded 0 in typ2
s = typ2 .== 0
ytrain = y[s]
## Remove the training rows to keep the test observations
ytest = rmrow(y, s)
63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5
## Overlaid histograms of the selected response for the training and test sets
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Nb. observations")
for (vals, lab) in ((ytrain, "Train"), (ytest, "Test"))
    hist!(ax, vals; bins = 50, label = lab)
end
axislegend(ax; position = :rt)
f
## Same histograms, drawn at separate vertical offsets;
## the y ticks are placed at the offsets and labelled with the group names
f = Figure(size = (500, 400))
offs = [10; 0]
ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Nb. observations",
    yticks = (offs, ["Train"; "Test"]))
for (vals, off) in zip((ytrain, ytest), offs)
    hist!(ax, vals; offset = off, bins = 50)
end
f
## Overlaid density plots of the selected response for the training and test sets
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Density")
## Bandwidth shared by both curves
bdw = 0.5
## The test curve is half-transparent so the training curve stays visible underneath
for (vals, col, lab) in ((ytrain, :blue, "Train"), (ytest, (:red, 0.5), "Test"))
    density!(ax, vals; bandwidth = bdw, color = col, label = lab)
end
axislegend(ax; position = :rt)
f
## Same densities, drawn at separate vertical offsets;
## the y ticks are placed at the offsets and labelled with the group names
f = Figure(size = (500, 400))
offs = [0.15; 0]
ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Density",
    yticks = (offs, ["Train"; "Test"]))
bdw = 0.5
for (vals, off) in zip((ytrain, ytest), offs)
    density!(ax, vals; bandwidth = bdw, offset = off, color = (:slategray, 0.5))
end
f
## Boxplots of the selected response by group (typ2: 0 = Train, 1 = Test)
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = "Group", ylabel = uppercase(nam),
    xticks = (0:1, ["Train", "Test"]))
boxplot!(ax, typ2, y; width = 0.3, show_notch = true)
f