tecator - Data description

using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie

Data importation

## Root directory of the installed JchemoData package: `pathof` gives the
## package's main source file, and two `dirname` calls climb to the root.
path_jdat = pathof(JchemoData) |> dirname |> dirname
## Location of the tecator dataset inside the package
db = joinpath(path_jdat, "data/tecator.jld2")
## Bring the stored object `dat` into scope, then list its components
@load db dat
@names dat

(:X, :Y)

## Spectral data: component `X` of the loaded object.
X = getproperty(dat, :X)
## Preview the first rows together with the overall size
@head X

... (178, 100)

3×100 DataFrame

Row	850	852	854	856	858	860	862	864	866	868	870	872	874	876	878	880	882	884	886	888	890	892	894	896	898	900	902	904	906	908	910	912	914	916	918	920	922	924	926	928	930	932	934	936	938	940	942	944	946	948	950	952	954	956	958	960	962	964	966	968	970	972	974	976	978	980	982	984	986	988	990	992	994	996	998	1000	1002	1004	1006	1008	1010	1012	1014	1016	1018	1020	1022	1024	1026	1028	1030	1032	1034	1036	1038	1040	1042	1044	1046	1048
	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64	Float64
1	2.61776	2.61814	2.61859	2.61912	2.61981	2.62071	2.62186	2.62334	2.62511	2.62722	2.62964	2.63245	2.63565	2.63933	2.64353	2.64825	2.6535	2.65937	2.66585	2.67281	2.68008	2.68733	2.69427	2.70073	2.70684	2.71281	2.71914	2.72628	2.73462	2.74416	2.75466	2.76568	2.77679	2.7879	2.79949	2.81225	2.82706	2.84356	2.86106	2.87857	2.89497	2.90924	2.92085	2.93015	2.93846	2.94771	2.96019	2.97831	3.00306	3.03506	3.07428	3.11963	3.16868	3.21771	3.26254	3.29988	3.32847	3.34899	3.36342	3.37379	3.38152	3.38741	3.39164	3.39418	3.3949	3.39366	3.39045	3.38541	3.37869	3.37041	3.36073	3.34979	3.33769	3.32443	3.31013	3.29487	3.27891	3.26232	3.24542	3.22828	3.2108	3.19287	3.17433	3.15503	3.13475	3.11339	3.09116	3.0685	3.04596	3.02393	3.00247	2.98145	2.96072	2.94013	2.91978	2.89966	2.87964	2.8596	2.8394	2.8192
2	2.83454	2.83871	2.84283	2.84705	2.85138	2.85587	2.8606	2.86566	2.87093	2.87661	2.88264	2.88898	2.89577	2.90308	2.91097	2.91953	2.92873	2.93863	2.94929	2.96072	2.97272	2.98493	2.9969	3.00833	3.0192	3.0299	3.04101	3.05345	3.06777	3.08416	3.10221	3.12106	3.13983	3.1581	3.17623	3.19519	3.21584	3.23747	3.25889	3.27835	3.29384	3.30362	3.30681	3.30393	3.297	3.28925	3.28409	3.28505	3.29326	3.30923	3.33267	3.36251	3.39661	3.43188	3.46492	3.49295	3.51458	3.53004	3.54067	3.54797	3.55306	3.55675	3.55921	3.56045	3.56034	3.55876	3.55571	3.55132	3.54585	3.5395	3.53235	3.52442	3.51583	3.50668	3.497	3.48683	3.47626	3.46552	3.45501	3.44481	3.43477	3.42465	3.41419	3.40303	3.39082	3.37731	3.36265	3.34745	3.33245	3.31818	3.30473	3.29186	3.27921	3.26655	3.25369	3.24045	3.22659	3.21181	3.196	3.17942
3	2.58284	2.58458	2.58629	2.58808	2.58996	2.59192	2.59401	2.59627	2.59873	2.60131	2.60414	2.60714	2.61029	2.61361	2.61714	2.62089	2.62486	2.62909	2.63361	2.63835	2.6433	2.64838	2.65354	2.6587	2.66375	2.6688	2.67383	2.67892	2.68411	2.68937	2.6947	2.70012	2.70563	2.71141	2.71775	2.7249	2.73344	2.74327	2.75433	2.76642	2.77931	2.79272	2.80649	2.82064	2.83541	2.85121	2.86872	2.88905	2.91289	2.94088	2.97325	3.00946	3.0478	3.08554	3.11947	3.14696	3.16677	3.17938	3.18631	3.18924	3.1895	3.18801	3.18498	3.18039	3.17411	3.16611	3.15641	3.14512	3.13241	3.11843	3.10329	3.08714	3.07014	3.05237	3.03393	3.01504	2.99569	2.97612	2.95642	2.9366	2.91667	2.89655	2.87622	2.85563	2.83474	2.81361	2.79235	2.77113	2.75015	2.72956	2.70934	2.68951	2.67009	2.65112	2.63262	2.61461	2.59718	2.58034	2.56404	2.54816

## Response/metadata table: component `Y` of the loaded object.
Y = getproperty(dat, :Y)
## Preview the first rows together with the overall size
@head Y

... (178, 4)

3×4 DataFrame

Row	water	fat	protein	typ
	Float64	Float64	Float64	String
1	60.5	22.5	16.7	train
2	46.0	40.1	13.5	train
3	71.0	8.4	20.5	train

X data

## Column names of X are the wavelengths stored as strings;
## convert them to integers for use as the x-axis of the spectra plots.
wlst = names(X)
wl = [parse(Int, nm) for nm in wlst]

100-element Vector{Int64}:
  850
  852
  854
  856
  858
  860
  862
  864
  866
  868
    ⋮
 1032
 1034
 1036
 1038
 1040
 1042
 1044
 1046
 1048

## Plot the raw spectra against wavelength; the figure is held in field `f`.
rawplot = plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance")
rawplot.f

Preprocessing by SNV followed by a Savitzky-Golay second derivative

## Preprocessing pipeline: SNV, then a Savitzky-Golay filter
## (15-point window, 2nd derivative, polynomial degree 3).
model = pip(
    snv(),
    savgol(npoint = 15, deriv = 2, degree = 3),
)
## Fit the pipeline on X, then apply it to the same data
fit!(model, X)
Xp = transf(model, X)
@head Xp

3×100 Matrix{Float64}:
 0.000397076  0.000495203  0.000598623  …  0.00827138  0.00917311  0.00946072
 0.00242055   0.00244366   0.00234233      0.00580631  0.00689249  0.00749316
 0.0011927    0.00122721   0.00120098      0.0101019   0.0108142   0.0108444
... (178, 100)

## Plot the preprocessed spectra against wavelength.
## NOTE(review): Xp holds SNV + 2nd-derivative values, so the "Absorbance"
## y-label may be misleading here; confirm the intended label.
prepplot = plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance")
prepplot.f

Y data

## Per-column summary of Y (mean, std, min, max, counts); table is in `res`.
ysumm = summ(Y)
ysumm.res

4×7 DataFrame

Row	variable	mean	std	min	max	n	nmissing
	Symbol	Union…	Union…	Any	Any	Int64	Int64
1	water	63.046	9.784	39.3	76.6	178	0
2	fat	18.378	12.629	0.9	49.1	178	0
3	protein	17.634	2.996	11.0	21.8	178	0
4	typ			test	val	178	0

## Partition label of each observation ("train", "test" or "val");
## `!` indexing returns the column itself, without copying.
typ = Y[!, :typ]
## Number of observations per label
tab(typ)

OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "test"  => 31
  "train" => 115
  "val"   => 32

Building a new variable, typ2, that merges the categories test and val into a single test category (coded 1); training observations are coded 0

## Training/test (0/1) observations:
## 0 for "train", 1 for every other label ("test" and "val" merged)
typ2 = ifelse.(typ .== "train", 0, 1)
tab(typ2)

OrderedCollections.OrderedDict{Int64, Int64} with 2 entries:
  0 => 115
  1 => 63

## Summary of Y computed separately within each class of typ2
## (0 = training, 1 = test)
summ(Y, typ2)

Class: 0
4×7 DataFrame
 Row │ variable  mean    std     min    max    n      nmissing
     │ Symbol    Union…  Union…  Any    Any    Int64  Int64
─────┼─────────────────────────────────────────────────────────
   1 │ water     63.142  9.846   39.3   76.6     115         0
   2 │ fat       18.358  12.605  0.9    49.1     115         0
   3 │ protein   17.597  2.989   11.0   21.8     115         0
   4 │ typ                       train  train    115         0


Class: 1
4×7 DataFrame
 Row │ variable  mean    std     min   max   n      nmissing
     │ Symbol    Union…  Union…  Any   Any   Int64  Int64
─────┼───────────────────────────────────────────────────────
   1 │ water     62.87   9.745   40.7  76.1     63         0
   2 │ fat       18.414  12.774  1.4   47.8     63         0
   3 │ protein   17.702  3.031   11.1  21.6     63         0
   4 │ typ                       test  val      63         0

Distribution of a given response variable (here, fat) in the training and test sets

## Select one response variable and split its values into training and test.
## Names of the first three columns of Y (the 4th column, typ, is excluded)
namy = names(Y)[1:3]
## Index (within namy) of the response to study; change this to switch variable
j = 2
## Use j rather than a hard-coded index, so that changing j takes effect
nam = namy[j]
y = Y[:, nam]
## Mask of training observations (typ2 == 0)
s = typ2 .== 0
ytrain = y[s]
## Remaining observations (rows flagged by s removed), i.e. the test values
ytest = rmrow(y, s)

63-element Vector{Float64}:
 29.8
  1.4
  4.6
 11.0
 17.0
 22.4
 27.9
 46.5
  6.1
  2.0
  ⋮
 18.1
 19.4
 24.8
 27.2
 28.4
 31.3
 33.8
 35.5
 42.5

## Overlaid histograms of the selected response: training vs. test values.
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Nb. observations")
## Same drawing order as the legend: training first, then test
for (vals, lab) in ((ytrain, "Train"), (ytest, "Test"))
    hist!(ax, vals; bins = 50, label = lab)
end
axislegend(position = :rt)
f

## Histograms stacked vertically: each group is drawn from its own baseline
## (offset), and the y ticks label the baselines instead of showing counts.
f = Figure(size = (500, 400))
offs = [10, 0]
ax = Axis(f[1, 1];
    xlabel = uppercase(nam),
    ylabel = "Nb. observations",
    yticks = (offs, ["Train", "Test"]))
for (vals, off) in zip((ytrain, ytest), offs)
    hist!(ax, vals; offset = off, bins = 50)
end
f

## Overlaid kernel density estimates of the selected response: train vs. test.
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = uppercase(nam), ylabel = "Density")
## Common kernel bandwidth for both groups
bdw = 0.5
## The test curve is semi-transparent so the training curve stays visible
for (vals, col, lab) in ((ytrain, :blue, "Train"), (ytest, (:red, 0.5), "Test"))
    density!(ax, vals; bandwidth = bdw, color = col, label = lab)
end
axislegend(position = :rt)
f

## Density curves stacked vertically: each group is drawn from its own
## baseline (offset), and the y ticks label the baselines.
f = Figure(size = (500, 400))
bdw = 0.5
offs = [0.15, 0.0]
ax = Axis(f[1, 1];
    xlabel = uppercase(nam),
    ylabel = "Density",
    yticks = (offs, ["Train", "Test"]))
for (vals, off) in zip((ytrain, ytest), offs)
    density!(ax, vals; bandwidth = bdw, offset = off, color = (:slategray, 0.5))
end
f

## Notched boxplots of the selected response, one box per group of typ2.
f = Figure(size = (400, 300))
## Tick positions 0 and 1 match the typ2 coding (0 = train, 1 = test)
grouplabels = (0:1, ["Train", "Test"])
ax = Axis(f[1, 1]; xticks = grouplabels, xlabel = "Group", ylabel = uppercase(nam))
boxplot!(ax, typ2, y; width = 0.3, show_notch = true)
f