challenge2018 - Data description
using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie
using FreqTables
Data import
# Root directory of the installed JchemoData package
# (pathof gives the package's main source file, two levels below the root).
path_jdat = dirname(dirname(pathof(JchemoData)))
# Build the file path from separate components so that joinpath inserts the
# platform's own separator instead of relying on a hard-coded "/".
db = joinpath(path_jdat, "data", "challenge2018.jld2")
# Load the object saved under the name `dat` into a variable of the same name.
@load db dat
# Show the names of the components of `dat`.
@names dat
(:X, :Y)
# Predictor table stored in `dat`; its column names are parsed as integer wavelengths further on.
X = dat.X
# Preview the first rows of X together with its dimensions.
@head X
... (4075, 680)
3×680 DataFrame
580 columns omitted
1 | 0.597482 | 0.595978 | 0.593623 | 0.59084 | 0.587451 | 0.583092 | 0.578666 | 0.572134 | 0.566125 | 0.560204 | 0.551694 | 0.544339 | 0.537008 | 0.528312 | 0.520828 | 0.51228 | 0.504673 | 0.498204 | 0.48968 | 0.483176 | 0.477588 | 0.470384 | 0.464844 | 0.459269 | 0.454023 | 0.4496 | 0.444271 | 0.440026 | 0.436514 | 0.431822 | 0.428201 | 0.424881 | 0.421379 | 0.41859 | 0.415556 | 0.413193 | 0.411501 | 0.40946 | 0.408254 | 0.407503 | 0.406948 | 0.406903 | 0.407211 | 0.407941 | 0.408835 | 0.410495 | 0.412211 | 0.41412 | 0.416691 | 0.419178 | 0.421902 | 0.424632 | 0.427036 | 0.43022 | 0.432809 | 0.434996 | 0.437823 | 0.439963 | 0.442041 | 0.443982 | 0.445578 | 0.447395 | 0.448785 | 0.449829 | 0.451165 | 0.452131 | 0.453035 | 0.453898 | 0.454573 | 0.455325 | 0.455932 | 0.456439 | 0.45706 | 0.457588 | 0.458121 | 0.458712 | 0.459253 | 0.459968 | 0.460628 | 0.461245 | 0.462104 | 0.462849 | 0.463604 | 0.464367 | 0.464976 | 0.465614 | 0.466088 | 0.466315 | 0.466568 | 0.466553 | 0.466394 | 0.465981 | 0.465464 | 0.464522 | 0.463511 | 0.462348 | 0.460648 | 0.458807 | 0.456757 | 0.454314 | ⋯ |
2 | 0.954192 | 0.953237 | 0.952002 | 0.950426 | 0.948382 | 0.946138 | 0.943543 | 0.94057 | 0.937514 | 0.934336 | 0.931181 | 0.92803 | 0.924979 | 0.922076 | 0.919204 | 0.916252 | 0.913228 | 0.91006 | 0.906591 | 0.902934 | 0.899069 | 0.894974 | 0.890786 | 0.886513 | 0.882297 | 0.878178 | 0.874064 | 0.870132 | 0.866404 | 0.862724 | 0.859357 | 0.856211 | 0.853359 | 0.850962 | 0.848901 | 0.84742 | 0.846361 | 0.845615 | 0.845178 | 0.845054 | 0.845031 | 0.845245 | 0.845549 | 0.845977 | 0.846463 | 0.847118 | 0.847808 | 0.848598 | 0.849367 | 0.850203 | 0.851115 | 0.85195 | 0.852774 | 0.853623 | 0.854617 | 0.855511 | 0.856426 | 0.857282 | 0.858228 | 0.859145 | 0.860036 | 0.860869 | 0.861774 | 0.862626 | 0.863431 | 0.864185 | 0.864786 | 0.865254 | 0.865612 | 0.865936 | 0.866131 | 0.866294 | 0.8663 | 0.866269 | 0.86628 | 0.866326 | 0.866443 | 0.866614 | 0.866967 | 0.867366 | 0.867852 | 0.868533 | 0.869212 | 0.869894 | 0.870584 | 0.871296 | 0.872008 | 0.872612 | 0.873183 | 0.873633 | 0.874032 | 0.874337 | 0.87455 | 0.87461 | 0.874656 | 0.874554 | 0.87432 | 0.873982 | 0.873513 | 0.872882 | ⋯ |
3 | 0.611137 | 0.609566 | 0.60743 | 0.604767 | 0.601434 | 0.597316 | 0.592598 | 0.586992 | 0.580741 | 0.574189 | 0.566943 | 0.559381 | 0.551773 | 0.543753 | 0.535977 | 0.52818 | 0.520482 | 0.513455 | 0.506598 | 0.500236 | 0.494469 | 0.488922 | 0.483579 | 0.478512 | 0.473364 | 0.468619 | 0.463914 | 0.459365 | 0.455331 | 0.451357 | 0.447738 | 0.444393 | 0.4412 | 0.43831 | 0.435652 | 0.433147 | 0.431142 | 0.429417 | 0.428137 | 0.427358 | 0.427051 | 0.427196 | 0.427831 | 0.428873 | 0.43028 | 0.432035 | 0.434176 | 0.436401 | 0.438986 | 0.441528 | 0.444208 | 0.446958 | 0.449588 | 0.452339 | 0.455044 | 0.457599 | 0.460146 | 0.462568 | 0.464761 | 0.466901 | 0.468719 | 0.47045 | 0.472001 | 0.473309 | 0.47453 | 0.475624 | 0.476567 | 0.477469 | 0.478218 | 0.478939 | 0.479589 | 0.480172 | 0.480771 | 0.481325 | 0.48187 | 0.482473 | 0.483049 | 0.483702 | 0.484401 | 0.485088 | 0.485835 | 0.486543 | 0.487239 | 0.487919 | 0.488478 | 0.488971 | 0.489365 | 0.489597 | 0.489697 | 0.48958 | 0.489309 | 0.488807 | 0.488128 | 0.487165 | 0.485927 | 0.484502 | 0.48277 | 0.480747 | 0.47851 | 0.475817 | ⋯ |
# Table of sample information; the columns typ, label, conc and test are read from it further on.
Y = dat.Y
# Preview the first rows of Y together with its dimensions.
@head Y
... (4075, 4)
1 | FRG | wheat (ung) | 12.74 | 0 |
2 | MPW | milk powder & whey | 35.7212 | 0 |
3 | FRG | wheat (ung) | 12.0 | 0 |
X data
# The column names of X hold the wavelengths as strings; convert them to integers.
wlst = names(X)
wl = [parse(Int, w) for w in wlst]
680-element Vector{Int64}:
1120
1122
1124
1126
1128
1130
1132
1134
1136
1138
⋮
2462
2464
2466
2468
2470
2472
2474
2476
2478
# Plot the raw spectra against the wavelengths; `.f` is the field of the returned object
# that gets displayed (presumably the figure).
# NOTE(review): nsamp = 500 presumably restricts the plot to 500 of the rows — confirm in the Jchemo docs.
plotsp(X, wl; nsamp = 500, xlabel = "Wavelength (nm)").f
Preprocessing by SNV and Savitzky-Golay second derivative
# Two preprocessing steps, chained in a pipeline:
#   1) SNV (standard normal variate),
#   2) Savitzky-Golay filter with npoint = 21, a degree-3 polynomial and the 2nd derivative.
model1 = snv()
model2 = savgol(npoint = 21, deriv = 2, degree = 3)
model = pip(model1, model2)
# Fit the pipeline on X, then apply it to the same data.
fit!(model, X)
Xp = transf(model, X)
# Preview the first rows of the transformed data together with its dimensions.
@head Xp
3×680 Matrix{Float64}:
-0.00393533 -0.00441755 -0.00477681 … 0.000514478 0.000481081
-0.00121436 -0.0013095 -0.00135921 0.000996321 0.000930884
-0.00355712 -0.00399785 -0.00434838 0.00044084 0.000421751
... (4075, 680)
# Same plot as for the raw spectra, now on the preprocessed data Xp.
plotsp(Xp, wl; nsamp = 500, xlabel = "Wavelength (nm)").f
Y data
# Type code of each observation (e.g. FRG, MPW)
typ = Y.typ
# Number of observations per type, each level shown as "code - label"
freqtable(string.(typ, " - ", Y.label))
10-element Named Vector{Int64}
Dim1 │
──────────────────────────┼────
ANF - animal feed │ 391
CLZ - rapeseed(ung) │ 420
CNG - corn gluten │ 395
EHH - grass silage │ 422
FFS - full fat soya │ 432
FRG - wheat (ung) │ 411
MPW - milk powder & whey │ 410
PEE - maize wp │ 407
SFG - sun flower seed(gr) │ 281
TTS - soya meal │ 506
test = Y.test # training/test (0/1) observations
# Tabulate the values of `test`
tab(test)
# Cross-tabulation: type codes in rows, training/test (0/1) in columns
freqtable(typ, test)
10×2 Named Matrix{Int64}
Dim1 ╲ Dim2 │ 0 1
────────────┼─────────
ANF │ 351 40
CLZ │ 378 42
CNG │ 356 39
EHH │ 380 42
FFS │ 397 35
FRG │ 371 40
MPW │ 372 38
PEE │ 367 40
SFG │ 272 9
TTS │ 457 49
y = Y.conc # protein concentration
# Summary statistics of y over all the observations; `.res` holds the resulting table
summ(y).res
1 | x1 | 31.93 | 20.348 | 2.766 | 76.604 | 4075 | 0 |
# Same summary statistics, reported separately for each class of `test` (0 and 1)
summ(y, test)
Class: 0
1×7 DataFrame
Row │ variable mean std min max n nmissing
│ Symbol Float64 Float64 Float64 Float64 Int64 Int64
─────┼───────────────────────────────────────────────────────────────
1 │ x1 31.894 20.297 3.061 76.604 3701 0
Class: 1
1×7 DataFrame
Row │ variable mean std min max n nmissing
│ Symbol Float64 Float64 Float64 Float64 Int64 Int64
─────┼───────────────────────────────────────────────────────────────
1 │ x1 32.288 20.874 2.766 75.8559 374 0
# Boolean mask: true for the training observations (test equal to 0)
s = iszero.(test)
ytrain = y[s]
# Test values: y without the rows flagged in s
ytest = rmrow(y, s)
374-element Vector{Float64}:
19.7999992
6.71
11.8599997
53.4160156
27.4345245
54.562149
40.206295
21.3299999
8.4799995
20.5900002
⋮
65.1012344
56.1860695
50.8610802
8.1399994
13.3100004
11.6800003
18.2399998
67.6700745
25.0300007
# Overlaid histograms of protein for the training and the test observations
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = "Protein", ylabel = "Nb. observations")
for (vals, lab) in ((ytrain, "Train"), (ytest, "Test"))
    hist!(ax, vals; bins = 50, label = lab)
end
axislegend(ax; position = :rt)
f
# Same histograms drawn one above the other: each is shifted vertically by its own
# offset, and the y ticks placed at the offsets carry the group names.
f = Figure(size = (500, 400))
offs = [70, 0]
ax = Axis(
    f[1, 1];
    xlabel = "Protein",
    ylabel = "Nb. observations",
    yticks = (offs, ["Train", "Test"]),
)
for (vals, off) in zip((ytrain, ytest), offs)
    hist!(ax, vals; offset = off, bins = 50)
end
f
# Overlaid density plots of protein (same bandwidth for both groups)
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = "Protein", ylabel = "Density")
bdw = 0.5
for (vals, col, lab) in ((ytrain, :blue, "Train"), (ytest, (:red, 0.5), "Test"))
    density!(ax, vals; bandwidth = bdw, color = col, label = lab)
end
axislegend(ax; position = :rt)
f
# Same densities drawn one above the other: each is shifted vertically by its own
# offset, and the y ticks placed at the offsets carry the group names.
f = Figure(size = (400, 300))
offs = [0.1, 0.0]
ax = Axis(
    f[1, 1];
    xlabel = "Protein",
    ylabel = "Density",
    yticks = (offs, ["Train", "Test"]),
)
bdw = 0.5
for (vals, off) in zip((ytrain, ytest), offs)
    density!(ax, vals; bandwidth = bdw, offset = off, color = (:slategray, 0.5))
end
f
# Boxplots of protein for the two groups; the x positions 0 and 1 (values of `test`)
# are relabelled "Train" and "Test" through the x ticks.
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xticks = (0:1, ["Train", "Test"]), xlabel = "Group", ylabel = "Protein")
boxplot!(ax, test, y; width = .3, show_notch = true)
f