## Dependencies: Jchemo (modeling), JchemoData (datasets),
## JLD2 (data loading), CairoMakie (plotting).
## NOTE: the original had several statements fused on one line,
## which is not valid Julia syntax; they are split here.
using Jchemo, JchemoData
using JLD2, CairoMakie

## Locate and load the iris dataset shipped with JchemoData.
path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data", "iris.jld2")
@load db dat    # loads the variable `dat`
@names dat
(:X,)
## Summary statistics (mean, std, min, max, n, nmissing) for each column of dat.X
summ(dat.X)
(res = 5×7 DataFrame
Row │ variable mean std min max n nmissing
│ Symbol Union… Union… Any Any Int64 Int64
─────┼──────────────────────────────────────────────────────────────────
1 │ sepal_length 5.843 0.828 4.3 7.9 150 0
2 │ sepal_width 3.057 0.436 2.0 4.4 150 0
3 │ petal_length 3.758 1.765 1.0 6.9 150 0
4 │ petal_width 1.199 0.762 0.1 2.5 150 0
5 │ species setosa virginica 150 0, ntot = 150)
## Keep the 4 numeric variables as the X-data (150 x 4)
@head X = dat.X[:, 1:4]
... (150, 4)
| Row | sepal_length | sepal_width | petal_length | petal_width |
|---|---|---|---|---|
| Float64 | Float64 | Float64 | Float64 | |
| 1 | 5.1 | 3.5 | 1.4 | 0.2 |
| 2 | 4.9 | 3.0 | 1.4 | 0.2 |
| 3 | 4.7 | 3.2 | 1.3 | 0.2 |
## The 5th column of dat.X contains the species labels, used as the response
@head y = dat.X[:, 5] # the classes (species)
3-element Vector{String}:
"setosa"
"setosa"
"setosa"
... (150,)
## Class frequencies of y (50 observations per species)
tab(y)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
"setosa" => 50
"versicolor" => 50
"virginica" => 50
## Class levels and their number.
## NOTE: the two statements were fused on one line (parse error); split here.
lev = mlev(y)
nlev = length(lev)
3
## Random split of the data into training and test sets.
## NOTE: the statements were fused on one line (parse error); split here.
ntot = nro(X)                # total nb. of observations
ntest = 30                   # size of the test set
s = samprand(ntot, ntest)    # random sampling of test indices
Xtrain = X[s.train, :]
ytrain = y[s.train]
Xtest = X[s.test, :]
ytest = y[s.test]
ntrain = ntot - ntest
(ntot = ntot, ntrain, ntest)
(ntot = 150, ntrain = 120, ntest = 30)
## Class frequencies in the training set
tab(ytrain)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
"setosa" => 41
"versicolor" => 39
"virginica" => 40
## Class frequencies in the test set
tab(ytest)
OrderedCollections.OrderedDict{String, Int64} with 3 entries:
"setosa" => 9
"versicolor" => 11
"virginica" => 10
## Fit the FDA model with nlv = 2 discriminant scores.
## NOTE: the statements were fused on one line (parse error); split here.
model = fda(nlv = 2)
#model = fdasvd(nlv = 2)    # alternative algorithm (same result)
fit!(model, Xtrain, ytrain)
fitm = model.fitm           # fitted-model object
@names fitm
(:T, :V, :Tcenters, :eig, :sstot, :W, :xmeans, :xscales, :weights, :lev, :ni, :par)
## Class levels stored in the fitted model, and their number.
## NOTE: the two statements were fused on one line (parse error); split here.
lev = fitm.lev
nlev = length(lev)
3
Fda scores
## FDA scores of the training observations (120 x 2)
@head Ttrain = fitm.T
3×2 Matrix{Float64}:
-8.30934 -0.3944
-7.08443 0.686112
-6.93956 0.71046
... (120, 2)
Class centers projected on the score space
## Class centers projected on the score space (one row per class)
ct = fitm.Tcenters
3×2 Matrix{Float64}:
-7.76146 -0.216289
2.06356 0.803163
5.94353 -0.561388
## Plot the training scores with per-class confidence ellipses,
## and overlay the projected class centers in red.
## NOTE: the statements were fused on one line (parse error); split here.
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f
## Same plot with a custom categorical color gradient (one color per class).
## NOTE: the statements were fused on one line (parse error); split here.
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color, title = "Fda", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
f
Columns of matrix V are the coefficients of the linear discriminant functions, corresponding to the object `LD` returned by function `lda` of the R package MASS.
## Coefficients of the linear discriminant functions (one column per score)
V = fitm.V
4×2 Matrix{Float64}:
-0.55404 -0.303984
-2.22819 -2.03943
2.0148 1.37096
3.09661 -3.5161
## Gram matrix of the discriminant directions — off-diagonal terms are nonzero
V' * V # not orthogonal
2×2 Matrix{Float64}:
18.9202 -3.41314
-3.41314 18.4942
## Eigenvalues of the decomposition (only the first nlev-1 = 2 are nonzero;
## the remaining values are numerical zeros)
fitm.eig
4-element Vector{Float64}:
33.74117544110285
0.3306833252553966
-2.0844676198533647e-15
-2.0844676198533647e-15
## Total variance (equals the sum of the eigenvalues above)
fitm.sstot
34.07185876635825
The explained variance is computed from the PCA of the class centers in the transformed scale.
## Explained variance (var, pvar, cumpvar) per discriminant score
summary(model)
(explvarx = 2×4 DataFrame
Row │ nlv var pvar cumpvar
│ Int64 Float64 Float64 Float64
─────┼────────────────────────────────────────
1 │ 1 33.7412 0.990295 0.990295
2 │ 2 0.330683 0.00970547 1.0,)
## Project the test observations onto the FDA score space (30 x 2)
@head Ttest = transf(model, Xtest)
3×2 Matrix{Float64}:
-7.62074 0.201927
-9.82633 -2.9784
-8.94898 -2.14168
... (30, 2)
## Project one test class onto the training score space and highlight it in blue.
## NOTE: the statements were fused on one line, and the `title` keyword was
## duplicated (`title = title = "..."`) — both fixed here.
i = 1    # class "i" in test
color = cgrad(:lightrainbow, nlev; categorical = true, alpha = .7)
f, ax = plotxy(Ttrain[:, 1], Ttrain[:, 2], ytrain; color,
    title = "Projection test-class $(lev[i]) (blue points)", ellipse = true)
scatter!(ax, ct[:, 1], ct[:, 2]; markersize = 10, color = :red)
s = ytest .== lev[i]    # mask of the test observations belonging to class i
zT = Ttest[s, :]
scatter!(ax, zT[:, 1], zT[:, 2]; markersize = 10, color = :blue)
f