forages2 - Data description
using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie
using FreqTables
Data importation
# Locate the root directory of the JchemoData package. `pkgdir` returns the same
# directory as dirname(dirname(pathof(JchemoData))) without the manual path walking.
path_jdat = pkgdir(JchemoData)
# Path components are passed separately so that no separator is hard-coded.
db = joinpath(path_jdat, "data", "forages2.jld2")
# Load the object stored under the name `dat` from the JLD2 file.
@load db dat
# List the field names of `dat`.
@names dat
(:X, :Y)
# Extract the X component of the dataset.
X = dat.X
# Display the first rows of X together with its overall size.
@head X
... (485, 700)
3×700 DataFrame
600 columns omitted
1 | -0.000231591 | -0.000175945 | -8.48176e-5 | 2.05217e-5 | 0.000110094 | 0.000161757 | 0.000154953 | 0.000163754 | 0.000187602 | 0.00021499 | 0.000242479 | 0.000265498 | 0.000282141 | 0.000281442 | 0.000271025 | 0.000261075 | 0.000257284 | 0.000252177 | 0.00024293 | 0.000228295 | 0.000219097 | 0.000214136 | 0.000215612 | 0.000218982 | 0.000228004 | 0.000236081 | 0.000236017 | 0.000220327 | 0.000187096 | 0.000137138 | 7.68593e-5 | 1.13679e-5 | -5.00951e-5 | -9.54664e-5 | -0.000119199 | -0.000131897 | -0.000142349 | -0.000161489 | -0.00019387 | -0.000244808 | -0.000303259 | -0.000366904 | -0.000416738 | -0.000451535 | -0.00046995 | -0.000478637 | -0.000477348 | -0.000478142 | -0.000476719 | -0.000479701 | -0.000482037 | -0.000496769 | -0.000511959 | -0.000532094 | -0.000542661 | -0.000540188 | -0.000512715 | -0.00045798 | -0.000370395 | -0.000256256 | -0.000126907 | 1.13716e-6 | 0.000119047 | 0.000212745 | 0.000275685 | 0.000307863 | 0.000313547 | 0.000296977 | 0.000269661 | 0.000247818 | 0.000233944 | 0.000228773 | 0.000224567 | 0.000221256 | 0.000218893 | 0.000217741 | 0.000210144 | 0.00019664 | 0.000181949 | 0.000169774 | 0.000151691 | 0.00012385 | 9.23378e-5 | 5.9959e-5 | 2.58352e-5 | -4.77314e-6 | -3.21835e-5 | -5.53154e-5 | -6.71707e-5 | -6.54166e-5 | -5.16448e-5 | -2.43366e-5 | 1.12255e-5 | 4.68917e-5 | 7.773e-5 | 0.000106785 | 0.000133173 | 0.000153607 | 0.000168518 | 0.000182591 | ⋯ |
2 | -9.66352e-5 | -3.30928e-5 | 5.64966e-5 | 0.000154135 | 0.000237725 | 0.000295789 | 0.000319587 | 0.000357405 | 0.000404611 | 0.000447996 | 0.000479786 | 0.000488339 | 0.000465929 | 0.000402301 | 0.000313648 | 0.000220226 | 0.000138483 | 7.35084e-5 | 3.50018e-5 | 2.83293e-5 | 6.05478e-5 | 0.000118272 | 0.000187726 | 0.000249842 | 0.00029697 | 0.000315062 | 0.000298828 | 0.000251643 | 0.000187055 | 0.000118243 | 5.60849e-5 | 3.8727e-6 | -3.28778e-5 | -4.84688e-5 | -4.38912e-5 | -3.34954e-5 | -2.72637e-5 | -3.65483e-5 | -6.62949e-5 | -0.000121833 | -0.000193587 | -0.000280244 | -0.000362132 | -0.000434981 | -0.000494461 | -0.000546531 | -0.000590606 | -0.000638514 | -0.000684688 | -0.000734688 | -0.000783664 | -0.000842714 | -0.000892596 | -0.000930301 | -0.000938118 | -0.000913585 | -0.000846217 | -0.000737781 | -0.000588122 | -0.000410395 | -0.000220611 | -3.69382e-5 | 0.000131072 | 0.000266078 | 0.000358377 | 0.000408684 | 0.000424528 | 0.000412147 | 0.000383896 | 0.000357957 | 0.000338385 | 0.000326749 | 0.000315572 | 0.00030542 | 0.000293671 | 0.000280005 | 0.000259482 | 0.000233697 | 0.0002044 | 0.000177199 | 0.000147989 | 0.000112325 | 7.33317e-5 | 3.48779e-5 | -2.5229e-6 | -3.27922e-5 | -5.52233e-5 | -7.06412e-5 | -7.49675e-5 | -6.44041e-5 | -4.04393e-5 | -6.50489e-6 | 3.09196e-5 | 6.87358e-5 | 0.000105202 | 0.000142313 | 0.000177182 | 0.000206652 | 0.000230788 | 0.000253703 | ⋯ |
3 | -0.000131769 | -7.8398e-5 | 7.92223e-7 | 8.90044e-5 | 0.000160022 | 0.000198435 | 0.000196598 | 0.000212225 | 0.000241109 | 0.000271235 | 0.000301045 | 0.000324921 | 0.000337619 | 0.000325857 | 0.00029979 | 0.000277167 | 0.00027018 | 0.00027165 | 0.000277606 | 0.000287722 | 0.000308203 | 0.000324847 | 0.000328573 | 0.000310806 | 0.00027728 | 0.000226898 | 0.000160474 | 8.30948e-5 | 7.98825e-6 | -5.32827e-5 | -9.57157e-5 | -0.000123438 | -0.0001371 | -0.000134382 | -0.00011527 | -9.07963e-5 | -6.97458e-5 | -6.29138e-5 | -7.14491e-5 | -9.85941e-5 | -0.000137562 | -0.000192678 | -0.000248177 | -0.000303993 | -0.000356125 | -0.000407616 | -0.0004553 | -0.000507819 | -0.000555473 | -0.000603436 | -0.000647099 | -0.000701763 | -0.000754429 | -0.000806879 | -0.000838493 | -0.000842167 | -0.000803445 | -0.000720829 | -0.000592138 | -0.000428566 | -0.000245567 | -6.43964e-5 | 0.000101193 | 0.000232242 | 0.000322133 | 0.000373605 | 0.000391817 | 0.000379332 | 0.000347829 | 0.000316495 | 0.000292236 | 0.000278431 | 0.000264621 | 0.000250305 | 0.000239387 | 0.000234504 | 0.000224633 | 0.000205684 | 0.000180408 | 0.000157615 | 0.000135108 | 0.000106871 | 7.3258e-5 | 3.90321e-5 | 7.34127e-6 | -1.78231e-5 | -3.94282e-5 | -5.6427e-5 | -6.15935e-5 | -5.19038e-5 | -2.96367e-5 | 3.09722e-6 | 3.98752e-5 | 7.62892e-5 | 0.000108271 | 0.000137632 | 0.000165624 | 0.000191182 | 0.000211586 | 0.000229586 | ⋯ |
# Extract the Y component of the dataset.
Y = dat.Y
# Display the first rows of Y together with its overall size.
@head Y
... (485, 4)
1 | 92.23 | 37.58 | Legume forages | 1 |
2 | 93.26 | 49.6462 | Legume forages | 0 |
3 | 92.9 | 63.2939 | Forage trees | 0 |
X data
# The column names of X are parsed as integers to obtain the numeric
# wavelength axis used for plotting.
wlst = names(X)
wl = map(nm -> parse(Int, nm), wlst)
700-element Vector{Int64}:
1100
1102
1104
1106
1108
1110
1112
1114
1116
1118
⋮
2482
2484
2486
2488
2490
2492
2494
2496
2498
The X-data have already been preprocessed by SNV and a second derivative.
# Plot the rows of X against the wavelengths `wl`. The `.f` field of the result is
# accessed so that it is the displayed value (presumably the figure — confirm
# against the Jchemo `plotsp` documentation).
plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
Y data
# Forage type of each observation.
typ = Y.typ
# Number of observations per forage type.
tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
"Cereal and grass forages" => 160
"Forage trees" => 101
"Legume forages" => 224
# Training/test indicator of each observation (0 = training, 1 = test).
test = Y.test
# Number of observations in each of the two sets.
tab(test)
# Cross-tabulation of forage type against the training/test indicator.
freqtable(typ, test)
3×2 Named Matrix{Int64}
Dim1 ╲ Dim2 │ 0 1
─────────────────────────┼─────────
Cereal and grass forages │ 100 60
Forage trees │ 56 45
Legume forages │ 167 57
# Names of the first two columns of Y (the response variables summarized below).
namy = names(Y)[1:2]
# Summary statistics (mean, std, min, max, n, nmissing) of these variables.
summ(Y[:, namy]).res
1 | dm | 92.1 | 1.988 | 80.903 | 96.27 | 485 | 0 |
2 | ndf | 50.736 | 13.204 | 18.425 | 85.7235 | 485 | 0 |
# Same summary, reported separately for each class of `test` (0 and 1).
summ(Y[:, namy], test)
Class: 0
2×7 DataFrame
Row │ variable mean std min max n nmissing
│ Symbol Float64 Float64 Float64 Float64 Int64 Int64
─────┼───────────────────────────────────────────────────────────────
1 │ dm 92.022 1.992 84.25 96.27 323 0
2 │ ndf 50.988 13.036 18.425 83.3012 323 0
Class: 1
2×7 DataFrame
Row │ variable mean std min max n nmissing
│ Symbol Float64 Float64 Float64 Float64 Int64 Int64
─────┼───────────────────────────────────────────────────────────────
1 │ dm 92.257 1.976 80.903 96.18 162 0
2 │ ndf 50.233 13.56 22.959 85.7235 162 0
A given variable
# Select one response variable by its position in `namy`, then split its values
# into training and test parts.
j = 2
# Index with `j` (not a literal) so that changing `j` actually changes the variable.
nam = namy[j]
y = Y[:, nam]
# Logical mask: true for training observations (test == 0).
s = test .== 0
ytrain = y[s]
# Test part: what remains of `y` after removing the training rows.
ytest = rmrow(y, s)
162-element Vector{Union{Missing, Float64}}:
37.579963135639154
56.99851095511593
47.21391972672929
25.309323906319047
54.28931875525652
80.29120067524794
57.55875502858353
32.66967330326697
61.936710276193665
46.46783498421683
⋮
48.90913691567336
69.05194981637266
45.154438712841326
41.50911821540868
52.42695116229647
22.959435724677174
35.71112405578161
55.53583902046241
34.64568924391239
# Overlaid histograms of the selected variable for the training and test sets.
f = Figure(; size = (400, 300))
ax = Axis(
    f[1, 1];
    xlabel = uppercase(nam),
    ylabel = "Nb. observations",
)
hist!(ax, ytrain; label = "Train", bins = 50)
hist!(ax, ytest; label = "Test", bins = 50)
axislegend(ax; position = :rt)
f
# Same histograms drawn at separate vertical offsets; the y ticks label the
# baseline of each set.
f = Figure(; size = (500, 400))
offs = [20, 0]
ax = Axis(
    f[1, 1];
    xlabel = uppercase(nam),
    ylabel = "Nb. observations",
    yticks = (offs, ["Train", "Test"]),
)
hist!(ax, ytrain; bins = 50, offset = offs[1])
hist!(ax, ytest; bins = 50, offset = offs[2])
f
# Overlaid density estimates of the selected variable (training vs. test),
# both computed with the same bandwidth.
f = Figure(; size = (400, 300))
ax = Axis(
    f[1, 1];
    xlabel = uppercase(nam),
    ylabel = "Density",
)
bdw = 1
density!(ax, ytrain; label = "Train", color = :blue, bandwidth = bdw)
density!(ax, ytest; label = "Test", color = (:red, 0.5), bandwidth = bdw)
axislegend(ax; position = :rt)
f
# Same density estimates drawn at separate vertical offsets; the y ticks label
# the baseline of each set.
f = Figure(; size = (500, 400))
offs = [0.1, 0.0]
ax = Axis(
    f[1, 1];
    xlabel = uppercase(nam),
    ylabel = "Density",
    yticks = (offs, ["Train", "Test"]),
)
bdw = 1
# Training set first, then test set, each at its own offset.
for (vals, off) in zip((ytrain, ytest), offs)
    density!(ax, vals; offset = off, color = (:slategray, 0.5), bandwidth = bdw)
end
f
# Notched boxplots of the selected variable, one box per group
# (x = 0 labelled "Train", x = 1 labelled "Test").
f = Figure(; size = (400, 300))
ax = Axis(
    f[1, 1];
    xlabel = "Group",
    ylabel = uppercase(nam),
    xticks = (0:1, ["Train", "Test"]),
)
boxplot!(ax, test, y; show_notch = true, width = 0.3)
f