using Jchemo, JchemoData using JLD2, DataFrames, CairoMakie
path_jdat = dirname(dirname(pathof(JchemoData))) db = joinpath(path_jdat, "data/tecator.jld2") @load db dat @names dat
(:X, :Y)
X = dat.X @head X
... (178, 100)
Row | 850 | 852 | 854 | 856 | 858 | 860 | 862 | 864 | 866 | 868 | 870 | 872 | 874 | 876 | 878 | 880 | 882 | 884 | 886 | 888 | 890 | 892 | 894 | 896 | 898 | 900 | 902 | 904 | 906 | 908 | 910 | 912 | 914 | 916 | 918 | 920 | 922 | 924 | 926 | 928 | 930 | 932 | 934 | 936 | 938 | 940 | 942 | 944 | 946 | 948 | 950 | 952 | 954 | 956 | 958 | 960 | 962 | 964 | 966 | 968 | 970 | 972 | 974 | 976 | 978 | 980 | 982 | 984 | 986 | 988 | 990 | 992 | 994 | 996 | 998 | 1000 | 1002 | 1004 | 1006 | 1008 | 1010 | 1012 | 1014 | 1016 | 1018 | 1020 | 1022 | 1024 | 1026 | 1028 | 1030 | 1032 | 1034 | 1036 | 1038 | 1040 | 1042 | 1044 | 1046 | 1048 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | |
1 | 2.61776 | 2.61814 | 2.61859 | 2.61912 | 2.61981 | 2.62071 | 2.62186 | 2.62334 | 2.62511 | 2.62722 | 2.62964 | 2.63245 | 2.63565 | 2.63933 | 2.64353 | 2.64825 | 2.6535 | 2.65937 | 2.66585 | 2.67281 | 2.68008 | 2.68733 | 2.69427 | 2.70073 | 2.70684 | 2.71281 | 2.71914 | 2.72628 | 2.73462 | 2.74416 | 2.75466 | 2.76568 | 2.77679 | 2.7879 | 2.79949 | 2.81225 | 2.82706 | 2.84356 | 2.86106 | 2.87857 | 2.89497 | 2.90924 | 2.92085 | 2.93015 | 2.93846 | 2.94771 | 2.96019 | 2.97831 | 3.00306 | 3.03506 | 3.07428 | 3.11963 | 3.16868 | 3.21771 | 3.26254 | 3.29988 | 3.32847 | 3.34899 | 3.36342 | 3.37379 | 3.38152 | 3.38741 | 3.39164 | 3.39418 | 3.3949 | 3.39366 | 3.39045 | 3.38541 | 3.37869 | 3.37041 | 3.36073 | 3.34979 | 3.33769 | 3.32443 | 3.31013 | 3.29487 | 3.27891 | 3.26232 | 3.24542 | 3.22828 | 3.2108 | 3.19287 | 3.17433 | 3.15503 | 3.13475 | 3.11339 | 3.09116 | 3.0685 | 3.04596 | 3.02393 | 3.00247 | 2.98145 | 2.96072 | 2.94013 | 2.91978 | 2.89966 | 2.87964 | 2.8596 | 2.8394 | 2.8192 |
2 | 2.83454 | 2.83871 | 2.84283 | 2.84705 | 2.85138 | 2.85587 | 2.8606 | 2.86566 | 2.87093 | 2.87661 | 2.88264 | 2.88898 | 2.89577 | 2.90308 | 2.91097 | 2.91953 | 2.92873 | 2.93863 | 2.94929 | 2.96072 | 2.97272 | 2.98493 | 2.9969 | 3.00833 | 3.0192 | 3.0299 | 3.04101 | 3.05345 | 3.06777 | 3.08416 | 3.10221 | 3.12106 | 3.13983 | 3.1581 | 3.17623 | 3.19519 | 3.21584 | 3.23747 | 3.25889 | 3.27835 | 3.29384 | 3.30362 | 3.30681 | 3.30393 | 3.297 | 3.28925 | 3.28409 | 3.28505 | 3.29326 | 3.30923 | 3.33267 | 3.36251 | 3.39661 | 3.43188 | 3.46492 | 3.49295 | 3.51458 | 3.53004 | 3.54067 | 3.54797 | 3.55306 | 3.55675 | 3.55921 | 3.56045 | 3.56034 | 3.55876 | 3.55571 | 3.55132 | 3.54585 | 3.5395 | 3.53235 | 3.52442 | 3.51583 | 3.50668 | 3.497 | 3.48683 | 3.47626 | 3.46552 | 3.45501 | 3.44481 | 3.43477 | 3.42465 | 3.41419 | 3.40303 | 3.39082 | 3.37731 | 3.36265 | 3.34745 | 3.33245 | 3.31818 | 3.30473 | 3.29186 | 3.27921 | 3.26655 | 3.25369 | 3.24045 | 3.22659 | 3.21181 | 3.196 | 3.17942 |
3 | 2.58284 | 2.58458 | 2.58629 | 2.58808 | 2.58996 | 2.59192 | 2.59401 | 2.59627 | 2.59873 | 2.60131 | 2.60414 | 2.60714 | 2.61029 | 2.61361 | 2.61714 | 2.62089 | 2.62486 | 2.62909 | 2.63361 | 2.63835 | 2.6433 | 2.64838 | 2.65354 | 2.6587 | 2.66375 | 2.6688 | 2.67383 | 2.67892 | 2.68411 | 2.68937 | 2.6947 | 2.70012 | 2.70563 | 2.71141 | 2.71775 | 2.7249 | 2.73344 | 2.74327 | 2.75433 | 2.76642 | 2.77931 | 2.79272 | 2.80649 | 2.82064 | 2.83541 | 2.85121 | 2.86872 | 2.88905 | 2.91289 | 2.94088 | 2.97325 | 3.00946 | 3.0478 | 3.08554 | 3.11947 | 3.14696 | 3.16677 | 3.17938 | 3.18631 | 3.18924 | 3.1895 | 3.18801 | 3.18498 | 3.18039 | 3.17411 | 3.16611 | 3.15641 | 3.14512 | 3.13241 | 3.11843 | 3.10329 | 3.08714 | 3.07014 | 3.05237 | 3.03393 | 3.01504 | 2.99569 | 2.97612 | 2.95642 | 2.9366 | 2.91667 | 2.89655 | 2.87622 | 2.85563 | 2.83474 | 2.81361 | 2.79235 | 2.77113 | 2.75015 | 2.72956 | 2.70934 | 2.68951 | 2.67009 | 2.65112 | 2.63262 | 2.61461 | 2.59718 | 2.58034 | 2.56404 | 2.54816 |
Y = dat.Y @head Y
... (178, 4)
Row | water | fat | protein | typ |
---|---|---|---|---|
Float64 | Float64 | Float64 | String | |
1 | 60.5 | 22.5 | 16.7 | train |
2 | 46.0 | 40.1 | 13.5 | train |
3 | 71.0 | 8.4 | 20.5 | train |
wlst = names(X) wl = parse.(Int, wlst)
100-element Vector{Int64}: 850 852 854 856 858 860 862 864 866 868 ⋮ 1032 1034 1036 1038 1040 1042 1044 1046 1048
plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
Preprocessing by SNV and derivation
model1 = snv() model2 = savgol(npoint = 15, deriv = 2, degree = 3) model = pip(model1, model2) fit!(model, X) Xp = transf(model, X) @head Xp
3×100 Matrix{Float64}: 0.000397076 0.000495203 0.000598623 … 0.00827138 0.00917311 0.00946072 0.00242055 0.00244366 0.00234233 0.00580631 0.00689249 0.00749316 0.0011927 0.00122721 0.00120098 0.0101019 0.0108142 0.0108444 ... (178, 100)
plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f
summ(Y).res
Row | variable | mean | std | min | max | n | nmissing |
---|---|---|---|---|---|---|---|
Symbol | Union… | Union… | Any | Any | Int64 | Int64 | |
1 | water | 63.046 | 9.784 | 39.3 | 76.6 | 178 | 0 |
2 | fat | 18.378 | 12.629 | 0.9 | 49.1 | 178 | 0 |
3 | protein | 17.634 | 2.996 | 11.0 | 21.8 | 178 | 0 |
4 | typ | test | val | 178 | 0 |
typ = Y.typ tab(typ)
OrderedCollections.OrderedDict{String, Int64} with 3 entries: "test" => 31 "train" => 115 "val" => 32
Building a new variable, typ2
, aggregating categories test
and validation
in a new category test
## Training/test (0/1) observations typ2 = ones(Int, nro(typ)) typ2[typ .== "train"] .= 0 tab(typ2)
OrderedCollections.OrderedDict{Int64, Int64} with 2 entries: 0 => 115 1 => 63
summ(Y, typ2)
Class: 0 4×7 DataFrame Row │ variable mean std min max n nmissing │ Symbol Union… Union… Any Any Int64 Int64 ─────┼───────────────────────────────────────────────────────── 1 │ water 63.142 9.846 39.3 76.6 115 0 2 │ fat 18.358 12.605 0.9 49.1 115 0 3 │ protein 17.597 2.989 11.0 21.8 115 0 4 │ typ train train 115 0 Class: 1 4×7 DataFrame Row │ variable mean std min max n nmissing │ Symbol Union… Union… Any Any Int64 Int64 ─────┼─────────────────────────────────────────────────────── 1 │ water 62.87 9.745 40.7 76.1 63 0 2 │ fat 18.414 12.774 1.4 47.8 63 0 3 │ protein 17.702 3.031 11.1 21.6 63 0 4 │ typ test val 63 0
A given variable
namy = names(Y)[1:3] j = 2 nam = namy[2] y = Y[:, nam] s = typ2 .== 0 ytrain = y[s] ytest = rmrow(y, s)
63-element Vector{Float64}: 29.8 1.4 4.6 11.0 17.0 22.4 27.9 46.5 6.1 2.0 ⋮ 18.1 19.4 24.8 27.2 28.4 31.3 33.8 35.5 42.5
f = Figure(size = (400, 300)) ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Nb. observations") hist!(ax, ytrain; bins = 50, label = "Train") hist!(ax, ytest; bins = 50, label = "Test") axislegend(position = :rt) f
f = Figure(size = (500, 400)) offs = [10; 0] ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Nb. observations", yticks = (offs, ["Train"; "Test"])) hist!(ax, ytrain; offset = offs[1], bins = 50) hist!(ax, ytest; offset = offs[2], bins = 50) f
f = Figure(size = (400, 300)) ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Density") bdw = .5 density!(ax, ytrain; bandwidth = bdw, color = :blue, label = "Train") density!(ax, ytest; bandwidth = bdw, color = (:red, .5), label = "Test") axislegend(position = :rt) f
f = Figure(size = (500, 400)) offs = [.15; 0] ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Density", yticks = (offs, ["Train"; "Test"])) bdw = .5 density!(ax, ytrain; bandwidth = bdw, offset = offs[1], color = (:slategray, 0.5)) density!(ax, ytest; bandwidth = bdw, offset = offs[2], color = (:slategray, 0.5)) f
f = Figure(size = (400, 300)) ax = Axis(f[1, 1]; xticks = (0:1, ["Train", "Test"]), xlabel = "Group", ylabel = uppercase(nam)) boxplot!(ax, typ2, y; width = .3, show_notch = true) f