cassav - Data description

using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie

Data import

# Root directory of the installed JchemoData package. `pkgdir` is the Base
# equivalent of `dirname(dirname(pathof(JchemoData)))`.
path_jdat = pkgdir(JchemoData)
# Pass the path components separately so `joinpath` inserts the separator itself
# instead of relying on a "/" embedded in the string.
db = joinpath(path_jdat, "data", "cassav.jld2")
# Load the object stored under the name `dat` from the JLD2 file.
@load db dat
# Show the names of the elements contained in `dat`.
@names dat
(:X, :Y)
# Take the `X` element of the loaded dataset and preview it
# (the display that follows shows its first rows and its dimensions).
X = dat.X
@head X
... (280, 1050)
3×1050 DataFrame
950 columns omitted
Row400402404406408410412414416418420422424426428430432434436438440442444446448450452454456458460462464466468470472474476478480482484486488490492494496498500502504506508510512514516518520522524526528530532534536538540542544546548550552554556558560562564566568570572574576578580582584586588590592594596598
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.3999960.4065220.4130080.419580.4260730.432190.4380070.4439490.449950.4550810.459340.4635350.4676630.47110.4736880.4758090.4775850.4791750.4806020.4817080.4826130.4834570.4842220.4848730.4853590.4857290.4861140.4864690.4864780.485990.4852580.4841750.4825940.4810780.480040.4789110.4773380.4757690.4743130.4724190.4702430.4682270.4657270.4617790.4562780.449930.4424130.4330930.4225630.4112680.3994690.3871540.3741090.3603910.3462330.3318320.3171790.3026230.2883420.2740520.2603250.2479760.2368270.2263260.2168170.2086730.2016580.1953950.1898230.1850990.1810550.1775440.1744850.1717480.1693480.1673290.1654110.1631880.1610340.1596110.1584240.156920.1552450.1535790.1518260.1501280.1485040.146870.1454720.1443960.1431970.1418240.1406150.139590.1386680.1378750.1371490.1364320.1357210.134834
20.4608960.467060.4756770.4834380.4908090.498770.5064650.5132680.5194210.5251450.5307440.5361440.5409570.5453280.5493670.5527270.5553330.5577080.5601850.5623570.5638940.5649090.5652770.5651880.5650420.5648770.5643550.5636490.5632570.5630120.5623940.5613790.5603410.5593980.5580060.5557780.5531710.5508120.5486780.5466160.5445790.5421490.5384480.5338930.5290180.5234220.5166820.5091480.5010180.4914440.4805270.4690450.4568730.4439330.4304790.4167390.4024840.3875220.3728580.3595460.3472540.3349750.3227340.3114260.3011260.2916580.2829980.274830.2669870.2595670.252440.2453520.2383490.231530.2246450.2177890.2110070.2040990.1973680.1913490.1858420.1803740.174980.1699710.1654020.1610850.1569950.1532940.1501040.1474430.1450520.1428270.1407070.1386630.1367620.135060.1335520.1321910.1309640.129819
30.4647310.4714160.478280.487330.4971170.5030040.5055790.5093160.5146490.5193170.5231920.5273090.5314340.5347480.5370760.5390330.5408110.5418830.5421830.542320.5427440.5432930.5436750.543720.5435890.5438680.5444870.5446480.5442390.5436420.5429870.5421220.5410040.5398530.5389040.5378620.5361590.5340210.5321380.5300540.5273270.5244620.5213290.517030.5112950.5045190.4965510.4872210.4768650.4649690.4517860.4382250.4246160.410580.3958090.3805950.3652250.3497050.3342530.3193780.3052040.2912550.2776190.2652140.254450.2449070.2362550.2284980.2217740.2160810.2109530.2062540.2022260.1987690.1955770.1925920.1899740.1876270.1853560.1833140.1814770.1792740.1765020.1740010.1724650.1710990.1692750.1672750.1655310.1640220.1625290.1611490.1600610.1591140.1579310.1565770.1553410.1542780.1532730.152306
# Take the `Y` element of the loaded dataset and preview it
# (the display that follows shows the columns `year` and `tbc`).
Y = dat.Y
@head Y
... (280, 2)
3×2 DataFrame
 Row │ year   tbc
     │ Int64  Float64
─────┼────────────────
   1 │  2009  1.58068
   2 │  2009  7.85516
   3 │  2009  1.77595

X data

# Column names of X are the wavelengths stored as strings.
wlst = names(X)
# Convert each name to an integer wavelength.
wl = [parse(Int, s) for s in wlst]
1050-element Vector{Int64}:
  400
  402
  404
  406
  408
  410
  412
  414
  416
  418
    ⋮
 2482
 2484
 2486
 2488
 2490
 2492
 2494
 2496
 2498
# Plot the raw spectra in X against the wavelengths `wl`; the `f` field of the
# result is what gets displayed (presumably the figure — confirm in Jchemo docs).
plotsp(X, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

Preprocessing with SNV followed by a second derivative

# First step: SNV (see the section heading).
model1 = snv()
# Second step: `savgol` with npoint = 15, degree = 3 and deriv = 2
# (presumably a Savitzky-Golay second derivative — confirm in the Jchemo docs).
model2 = savgol(npoint = 15, deriv = 2, degree = 3)
# Combine the two steps into a single model with `pip`.
model = pip(model1, model2)
# Fit the combined model on X, then apply it to X to obtain the preprocessed data.
fit!(model, X)
Xp = transf(model, X)
# Preview the result (displayed below as a 280 × 1050 Float64 matrix).
@head Xp
3×1050 Matrix{Float64}:
 0.00159646  0.00150837  0.00128809  …  -0.000579388  -0.000607364
 0.00200113  0.00188047  0.0015905      -0.000433115  -0.000371824
 0.00166172  0.00141834  0.00106673     -0.000503714  -0.000469987
... (280, 1050)
# Plot the preprocessed spectra Xp against the wavelengths `wl`.
# NOTE(review): the y-axis label still reads "Absorbance" although Xp is the
# transformed (SNV + derivative) signal — confirm the label is intended.
plotsp(Xp, wl; xlabel = "Wavelength (nm)", ylabel = "Absorbance").f

Y data

# Summary table for the columns of Y; the `res` field holds the table shown
# below (mean, std, min, max, n, nmissing per variable).
summ(Y).res
2×7 DataFrame
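 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Real     Real     Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ year      2011.36     1.43   2009.0     2013    280         0
   2 │ tbc         6.436    3.503    0.001  15.4425    280         0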
# Year of each observation, then the number of observations per year.
year = Y.year
tab(year)
OrderedCollections.OrderedDict{Int64, Int64} with 5 entries:
  2009 => 42
  2010 => 47
  2011 => 40
  2012 => 71
  2013 => 80
# Response variable: the `tbc` column of Y, followed by its summary table.
y = Y.tbc
summ(y).res
1×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ x1          6.436    3.503    0.001  15.4425    280         0
# Histogram of the TBC values (30 bins).
f = Figure(size = (400, 300))
ax = Axis(f[1, 1]; xlabel = "TBC", ylabel = "Nb. observations")
# No `label` is passed: no legend is drawn on this axis, and the data are not
# split into train/test here, so the former "Train" label was unused and misleading.
hist!(ax, y; bins = 30)
# Return the figure so that it is displayed.
f
# Kernel density plot of the TBC values.
f = Figure(; size = (400, 300))
ax = Axis(f[1, 1]; xlabel = "TBC", ylabel = "Density")
# Semi-transparent red fill; bandwidth fixed at 0.2.
density!(ax, y; bandwidth = 0.2, color = (:red, 0.5))
# Return the figure so that it is displayed.
f
# Re-read the response (same column as assigned earlier) so this step can be
# run on its own, then print one summary table of `y` per year class.
y = Y.tbc
summ(y, year)
Class: 2009
1×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ x1          4.617    2.214    1.581  9.42472     42         0


Class: 2010
1×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ x1           6.19    2.896    0.001  12.0709     47         0


Class: 2011
1×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ x1            4.7    3.649    0.035  11.3414     40         0


Class: 2012
1×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ x1          6.331    2.841    0.203  12.2828     71         0


Class: 2013
1×7 DataFrame
 Row │ variable  mean     std      min      max      n      nmissing
     │ Symbol    Float64  Float64  Float64  Float64  Int64  Int64
─────┼───────────────────────────────────────────────────────────────
   1 │ x1          8.496    3.816    0.776  15.4425     80         0
# Notched boxplots of TBC, one box per year.
f = Figure(; size = (400, 300))
ax = Axis(f[1, 1]; xlabel = "Year", ylabel = "TBC")
boxplot!(ax, year, y; width = 0.7, show_notch = true)
# Return the figure so that it is displayed.
f