forages2 - Data description

using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie
using FreqTables

Data importation

# Root directory of the JchemoData package: two levels above the file returned by `pathof`
path_jdat = pathof(JchemoData) |> dirname |> dirname
# Full path of the JLD2 file that stores the forages2 dataset
db = joinpath(path_jdat, "data/forages2.jld2")
# Load the stored object `dat` into the session, then list its components
@load db dat
@names dat
(:X, :Y)
# X component of the dataset: one row per observation, one column per wavelength
X = dat.X
# Preview the first rows of X together with its overall size
@head X
... (485, 700)
3×700 DataFrame
600 columns omitted
Row1100110211041106110811101112111411161118112011221124112611281130113211341136113811401142114411461148115011521154115611581160116211641166116811701172117411761178118011821184118611881190119211941196119812001202120412061208121012121214121612181220122212241226122812301232123412361238124012421244124612481250125212541256125812601262126412661268127012721274127612781280128212841286128812901292129412961298
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
1-0.000231591-0.000175945-8.48176e-52.05217e-50.0001100940.0001617570.0001549530.0001637540.0001876020.000214990.0002424790.0002654980.0002821410.0002814420.0002710250.0002610750.0002572840.0002521770.000242930.0002282950.0002190970.0002141360.0002156120.0002189820.0002280040.0002360810.0002360170.0002203270.0001870960.0001371387.68593e-51.13679e-5-5.00951e-5-9.54664e-5-0.000119199-0.000131897-0.000142349-0.000161489-0.00019387-0.000244808-0.000303259-0.000366904-0.000416738-0.000451535-0.00046995-0.000478637-0.000477348-0.000478142-0.000476719-0.000479701-0.000482037-0.000496769-0.000511959-0.000532094-0.000542661-0.000540188-0.000512715-0.00045798-0.000370395-0.000256256-0.0001269071.13716e-60.0001190470.0002127450.0002756850.0003078630.0003135470.0002969770.0002696610.0002478180.0002339440.0002287730.0002245670.0002212560.0002188930.0002177410.0002101440.000196640.0001819490.0001697740.0001516910.000123859.23378e-55.9959e-52.58352e-5-4.77314e-6-3.21835e-5-5.53154e-5-6.71707e-5-6.54166e-5-5.16448e-5-2.43366e-51.12255e-54.68917e-57.773e-50.0001067850.0001331730.0001536070.0001685180.000182591
2-9.66352e-5-3.30928e-55.64966e-50.0001541350.0002377250.0002957890.0003195870.0003574050.0004046110.0004479960.0004797860.0004883390.0004659290.0004023010.0003136480.0002202260.0001384837.35084e-53.50018e-52.83293e-56.05478e-50.0001182720.0001877260.0002498420.000296970.0003150620.0002988280.0002516430.0001870550.0001182435.60849e-53.8727e-6-3.28778e-5-4.84688e-5-4.38912e-5-3.34954e-5-2.72637e-5-3.65483e-5-6.62949e-5-0.000121833-0.000193587-0.000280244-0.000362132-0.000434981-0.000494461-0.000546531-0.000590606-0.000638514-0.000684688-0.000734688-0.000783664-0.000842714-0.000892596-0.000930301-0.000938118-0.000913585-0.000846217-0.000737781-0.000588122-0.000410395-0.000220611-3.69382e-50.0001310720.0002660780.0003583770.0004086840.0004245280.0004121470.0003838960.0003579570.0003383850.0003267490.0003155720.000305420.0002936710.0002800050.0002594820.0002336970.00020440.0001771990.0001479890.0001123257.33317e-53.48779e-5-2.5229e-6-3.27922e-5-5.52233e-5-7.06412e-5-7.49675e-5-6.44041e-5-4.04393e-5-6.50489e-63.09196e-56.87358e-50.0001052020.0001423130.0001771820.0002066520.0002307880.000253703
3-0.000131769-7.8398e-57.92223e-78.90044e-50.0001600220.0001984350.0001965980.0002122250.0002411090.0002712350.0003010450.0003249210.0003376190.0003258570.000299790.0002771670.000270180.000271650.0002776060.0002877220.0003082030.0003248470.0003285730.0003108060.000277280.0002268980.0001604748.30948e-57.98825e-6-5.32827e-5-9.57157e-5-0.000123438-0.0001371-0.000134382-0.00011527-9.07963e-5-6.97458e-5-6.29138e-5-7.14491e-5-9.85941e-5-0.000137562-0.000192678-0.000248177-0.000303993-0.000356125-0.000407616-0.0004553-0.000507819-0.000555473-0.000603436-0.000647099-0.000701763-0.000754429-0.000806879-0.000838493-0.000842167-0.000803445-0.000720829-0.000592138-0.000428566-0.000245567-6.43964e-50.0001011930.0002322420.0003221330.0003736050.0003918170.0003793320.0003478290.0003164950.0002922360.0002784310.0002646210.0002503050.0002393870.0002345040.0002246330.0002056840.0001804080.0001576150.0001351080.0001068717.3258e-53.90321e-57.34127e-6-1.78231e-5-3.94282e-5-5.6427e-5-6.15935e-5-5.19038e-5-2.96367e-53.09722e-63.98752e-57.62892e-50.0001082710.0001376320.0001656240.0001911820.0002115860.000229586
# Y component of the dataset: response variables (dm, ndf), forage type (typ)
# and the training/test indicator (test)
Y = dat.Y
# Preview the first rows of Y together with its overall size
@head Y
... (485, 4)
3×4 DataFrame
 Row │ dm        ndf       typ             test
     │ Float64?  Float64?  String          Int64
─────┼───────────────────────────────────────────
   1 │    92.23   37.58    Legume forages      1
   2 │    93.26   49.6462  Legume forages      0
   3 │    92.9    63.2939  Forage trees        0

X data

# The column names of X are the wavelengths, stored as strings
wlst = names(X)
# Convert each column name to an integer wavelength
wl = map(nm -> parse(Int, nm), wlst)
700-element Vector{Int64}:
 1100
 1102
 1104
 1106
 1108
 1110
 1112
 1114
 1116
 1118
    ⋮
 2482
 2484
 2486
 2488
 2490
 2492
 2494
 2496
 2498

The X data have already been preprocessed with SNV (standard normal variate) and a second derivative.

# Plot the spectra in X against the wavelengths `wl`; the `.f` field of the object
# returned by `plotsp` is what gets displayed (presumably the Makie figure — confirm in Jchemo docs)
plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

Y data

# Forage type of each observation
typ = Y.typ
# Number of observations in each forage type
tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "Cereal and grass forages" => 160
  "Forage trees"             => 101
  "Legume forages"           => 224
# Indicator of the set each observation belongs to: 0 = training, 1 = test
test = Y.test
# Number of observations in each set
tab(test)
# Cross-tabulation of forage type (rows) by training/test set (columns)
freqtable(typ, test)
3×2 Named Matrix{Int64}
             Dim1 ╲ Dim2 │   0    1
─────────────────────────┼─────────
Cereal and grass forages │ 100   60
Forage trees             │  56   45
Legume forages           │ 167   57
# Names of the first two columns of Y, i.e. the response variables (dm and ndf)
namy = names(Y)[1:2]
# Summary statistics of the response variables over all observations;
# the `res` field of the object returned by `summ` holds the table shown below
summ(Y[:, namy]).res
2×7 DataFrame
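 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ dm         92.1      1.988   80.903  96.27      485         0
   2 │ ndf        50.736   13.204   18.425  85.7235    485         0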
# Same summary statistics, computed separately for each value of `test`
# (class 0 = training observations, class 1 = test observations)
summ(Y[:, namy], test)
Class: 0
2×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ dm         92.022    1.992   84.25   96.27      323         0
   2 │ ndf        50.988   13.036   18.425  83.3012    323         0


Class: 1
2×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ dm         92.257    1.976   80.903  96.18      162         0
   2 │ ndf        50.233   13.56    22.959  85.7235    162         0

A given variable

# Position (within `namy`) of the response variable to study; 2 selects "ndf"
j = 2
# Index with `j` so that changing `j` above actually switches the variable
nam = namy[j]
# Values of the selected variable for all observations
y = Y[:, nam]
# Boolean mask: true for training observations (test == 0)
s = test .== 0
ytrain = y[s]
# Remove the training rows flagged by `s`, which leaves the test observations
ytest = rmrow(y, s)
162-element Vector{Union{Missing, Float64}}:
 37.579963135639154
 56.99851095511593
 47.21391972672929
 25.309323906319047
 54.28931875525652
 80.29120067524794
 57.55875502858353
 32.66967330326697
 61.936710276193665
 46.46783498421683
  ⋮
 48.90913691567336
 69.05194981637266
 45.154438712841326
 41.50911821540868
 52.42695116229647
 22.959435724677174
 35.71112405578161
 55.53583902046241
 34.64568924391239
# Overlaid histograms of the selected variable for the training and test sets
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; ylabel = "Nb. observations", xlabel = uppercase(nam))
hist!(ax, ytrain; label = "Train", bins = 50)
hist!(ax, ytest; label = "Test", bins = 50)
# Legend in the right-top corner of the axis
axislegend(ax; position = :rt)
f
# Same histograms drawn on separate baselines: each set is shifted by its own offset,
# and the y ticks sit on these baselines to label the two sets
f = Figure(size = (500, 400))
offs = [20, 0]
ax = Axis(
    f[1, 1];
    yticks = (offs, ["Train", "Test"]),
    xlabel = uppercase(nam),
    ylabel = "Nb. observations",
)
hist!(ax, ytrain; bins = 50, offset = offs[1])
hist!(ax, ytest; bins = 50, offset = offs[2])
f
# Overlaid kernel density estimates of the selected variable for the two sets
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; ylabel = "Density", xlabel = uppercase(nam))
# Common smoothing bandwidth for both curves
bdw = 1
density!(ax, ytrain; label = "Train", color = :blue, bandwidth = bdw)
# Semi-transparent fill so the training curve stays visible underneath
density!(ax, ytest; label = "Test", color = (:red, 0.5), bandwidth = bdw)
axislegend(ax; position = :rt)
f
# Density estimates drawn on separate baselines: each set is shifted by its own offset,
# and the y ticks sit on these baselines to label the two sets
f = Figure(size = (500, 400))
offs = [0.1, 0]
ax = Axis(
    f[1, 1];
    yticks = (offs, ["Train", "Test"]),
    xlabel = uppercase(nam),
    ylabel = "Density",
)
# Common smoothing bandwidth for both curves
bdw = 1
density!(ax, ytrain; bandwidth = bdw, color = (:slategray, 0.5), offset = offs[1])
density!(ax, ytest; bandwidth = bdw, color = (:slategray, 0.5), offset = offs[2])
f
# Boxplots of the selected variable, one box per set (x = 0: training, x = 1: test)
f = Figure(size = (400, 300))
ax = Axis(
    f[1, 1];
    xlabel = "Group",
    ylabel = uppercase(nam),
    xticks = (0:1, ["Train", "Test"]),
)
boxplot!(ax, test, y; show_notch = true, width = 0.3)
f