Fda - iris

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data", "iris.jld2") 
@load db dat
@names dat

(:X,)

summ(dat.X)

(res = 5×7 DataFrame
 Row │ variable      mean    std     min     max        n      nmissing
     │ Symbol        Union…  Union…  Any     Any        Int64  Int64
─────┼──────────────────────────────────────────────────────────────────
   1 │ sepal_length  5.843   0.828   4.3     7.9          150         0
   2 │ sepal_width   3.057   0.436   2.0     4.4          150         0
   3 │ petal_length  3.758   1.765   1.0     6.9          150         0
   4 │ petal_width   1.199   0.762   0.1     2.5          150         0
   5 │ species                       setosa  virginica    150         0, ntot = 150)

@head X = dat.X[:, 1:4]

... (150, 4)

3×4 DataFrame

Row	sepal_length	sepal_width	petal_length	petal_width
	Float64	Float64	Float64	Float64
1	5.1	3.5	1.4	0.2
2	4.9	3.0	1.4	0.2
3	4.7	3.2	1.3	0.2

@head y = dat.X[:, 5]    # the classes (species)

3-element Vector{String}:
 "setosa"
 "setosa"
 "setosa"
... (150,)

tab(y)

OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "setosa"     => 50
  "versicolor" => 50
  "virginica"  => 50

lev = mlev(y)
nlev = length(lev)

Split Tot to Train/Test

ntot = nro(X)
ntest = 30
s = samprand(ntot, ntest)
Xtrain = X[s.train, :]
ytrain = y[s.train]
Xtest = X[s.test, :]
ytest = y[s.test]
ntrain = ntot - ntest
(ntot = ntot, ntrain, ntest)

(ntot = 150, ntrain = 120, ntest = 30)

tab(ytrain)

OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "setosa"     => 41
  "versicolor" => 41
  "virginica"  => 38

tab(ytest)

OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "setosa"     => 9
  "versicolor" => 9
  "virginica"  => 12

Model fitting

model = fda(nlv = 2)
#model = fdasvd(nlv = 2)     # alternative algorithm (same result)
fit!(model, Xtrain, ytrain) 
fitm = model.fitm
@names fitm

(:T, :V, :Tcenters, :eig, :sstot, :W, :xmeans, :xscales, :weights, :lev, :ni, :par)

lev = fitm.lev
nlev = length(lev)

Fda scores

@head Ttrain = fitm.T

3×2 Matrix{Float64}:
 -7.67227   0.280642
 -6.72478  -0.799262
 -7.09482  -0.198632
... (120, 2)

Class centers projected on the score space

ct = fitm.Tcenters

3×2 Matrix{Float64}:
 -7.14852   0.233595
  1.92642  -0.805302
  5.63437   0.616841

f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f

color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f

X-loadings matrix

Columns of matrix V are coefficients of the linear discriminant function, corresponding to object LD of function lda of the R package MASS.

V = fitm.V

4×2 Matrix{Float64}:
 -0.788234  -0.263803
 -1.57967    2.26533
  2.11753   -0.948042
  2.68483    3.15521

V' * V    # not orthogonal

2×2 Matrix{Float64}:
 14.8089    3.09316
  3.09316  16.0554

Explained variance

fitm.eig

4-element Vector{Float64}:
 28.780516708141555
  0.36070779568823824
  4.454486492572078e-15
 -7.97918739539233e-16

fitm.sstot

29.141224503829797

Explained variance is computed from the Pca of the class centers in the transformed scale.

summary(model)

(explvarx = 2×4 DataFrame
 Row │ nlv    var        pvar       cumpvar
     │ Int64  Float64    Float64    Float64
─────┼───────────────────────────────────────
   1 │     1  28.7805    0.987622   0.987622
   2 │     2   0.360708  0.0123779  1.0,)

Projection of Xtest to the score space

@head Ttest = transf(model, Xtest)

3×2 Matrix{Float64}:
 -6.93948  -0.983054
 -6.91444  -1.0884
 -8.81819   2.69739
... (30, 2)

i = 1  # class "i" in test
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, 
    title = string("Projection test-class ", lev[i], " (blue points)"), ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
s = ytest .== lev[i]
zT = Ttest[s, :]
scatter!(ax, zT[:, 1], zT[:, 2]; markersize = 10, color = :blue)
f