Fda - iris

using Jchemo, JchemoData
using JLD2, CairoMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data", "iris.jld2") 
@load db dat
@names dat
(:X,)
summ(dat.X)
(res = 5×7 DataFrame
 Row │ variable      mean    std     min     max        n      nmissing
     │ Symbol        Union…  Union…  Any     Any        Int64  Int64
─────┼──────────────────────────────────────────────────────────────────
   1 │ sepal_length  5.843   0.828   4.3     7.9          150         0
   2 │ sepal_width   3.057   0.436   2.0     4.4          150         0
   3 │ petal_length  3.758   1.765   1.0     6.9          150         0
   4 │ petal_width   1.199   0.762   0.1     2.5          150         0
   5 │ species                       setosa  virginica    150         0, ntot = 150)
@head X = dat.X[:, 1:4]
... (150, 4)
3×4 DataFrame
Rowsepal_lengthsepal_widthpetal_lengthpetal_width
Float64Float64Float64Float64
15.13.51.40.2
24.93.01.40.2
34.73.21.30.2
@head y = dat.X[:, 5]    # the classes (species)
3-element Vector{String}:
 "setosa"
 "setosa"
 "setosa"
... (150,)
tab(y)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "setosa"     => 50
  "versicolor" => 50
  "virginica"  => 50
lev = mlev(y)
nlev = length(lev)
3

Split Tot to Train/Test

ntot = nro(X)
ntest = 30
s = samprand(ntot, ntest)
Xtrain = X[s.train, :]
ytrain = y[s.train]
Xtest = X[s.test, :]
ytest = y[s.test]
ntrain = ntot - ntest
(ntot = ntot, ntrain, ntest)
(ntot = 150, ntrain = 120, ntest = 30)
tab(ytrain)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "setosa"     => 41
  "versicolor" => 39
  "virginica"  => 40
tab(ytest)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
  "setosa"     => 9
  "versicolor" => 11
  "virginica"  => 10

Model fitting

model = fda(nlv = 2)
#model = fdasvd(nlv = 2)     # alternative algorithm (same result)
fit!(model, Xtrain, ytrain) 
fitm = model.fitm
@names fitm
(:T, :V, :Tcenters, :eig, :sstot, :W, :xmeans, :xscales, :weights, :lev, :ni, :par)
lev = fitm.lev
nlev = length(lev)
3

Fda scores

@head Ttrain = fitm.T
3×2 Matrix{Float64}:
 -8.30934  -0.3944
 -7.08443   0.686112
 -6.93956   0.71046
... (120, 2)

Class centers projected on the score space

ct = fitm.Tcenters
3×2 Matrix{Float64}:
 -7.76146  -0.216289
  2.06356   0.803163
  5.94353  -0.561388
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f

X-loadings matrix

Columns of matrix V are coefficients of the linear discriminant function, corresponding to object LD of function lda of the R package MASS.

V = fitm.V
4×2 Matrix{Float64}:
 -0.55404  -0.303984
 -2.22819  -2.03943
  2.0148    1.37096
  3.09661  -3.5161
V' * V    # not orthogonal
2×2 Matrix{Float64}:
 18.9202   -3.41314
 -3.41314  18.4942

Explained variance

fitm.eig
4-element Vector{Float64}:
 33.74117544110285
  0.3306833252553966
 -2.0844676198533647e-15
 -2.0844676198533647e-15
fitm.sstot
34.07185876635825

Explained variance is computed from the Pca of the class centers in the transformed scale.

summary(model)
(explvarx = 2×4 DataFrame
 Row │ nlv    var        pvar        cumpvar
     │ Int64  Float64    Float64     Float64
─────┼────────────────────────────────────────
   1 │     1  33.7412    0.990295    0.990295
   2 │     2   0.330683  0.00970547  1.0,)

Projection of Xtest to the score space

@head Ttest = transf(model, Xtest)
3×2 Matrix{Float64}:
 -7.62074   0.201927
 -9.82633  -2.9784
 -8.94898  -2.14168
... (30, 2)
i = 1  # class "i" in test
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = title = "Projection test-class $(lev[i]) (blue points)", 
    ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
s = ytest .== lev[i]
zT = Ttest[s, :]
scatter!(ax, zT[:, 1], zT[:, 2]; markersize = 10, color = :blue)
f