Pca - cassav - train vs test

using Jchemo, JchemoData
using JLD2, CairoMakie, GLMakie

Data importation

path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/cassav.jld2")
@load db dat
@names dat
(:X, :Y)
X = dat.X
@head X
... (280, 1050)
3×1050 DataFrame
950 columns omitted
Row400402404406408410412414416418420422424426428430432434436438440442444446448450452454456458460462464466468470472474476478480482484486488490492494496498500502504506508510512514516518520522524526528530532534536538540542544546548550552554556558560562564566568570572574576578580582584586588590592594596598
Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
10.3999960.4065220.4130080.419580.4260730.432190.4380070.4439490.449950.4550810.459340.4635350.4676630.47110.4736880.4758090.4775850.4791750.4806020.4817080.4826130.4834570.4842220.4848730.4853590.4857290.4861140.4864690.4864780.485990.4852580.4841750.4825940.4810780.480040.4789110.4773380.4757690.4743130.4724190.4702430.4682270.4657270.4617790.4562780.449930.4424130.4330930.4225630.4112680.3994690.3871540.3741090.3603910.3462330.3318320.3171790.3026230.2883420.2740520.2603250.2479760.2368270.2263260.2168170.2086730.2016580.1953950.1898230.1850990.1810550.1775440.1744850.1717480.1693480.1673290.1654110.1631880.1610340.1596110.1584240.156920.1552450.1535790.1518260.1501280.1485040.146870.1454720.1443960.1431970.1418240.1406150.139590.1386680.1378750.1371490.1364320.1357210.134834
20.4608960.467060.4756770.4834380.4908090.498770.5064650.5132680.5194210.5251450.5307440.5361440.5409570.5453280.5493670.5527270.5553330.5577080.5601850.5623570.5638940.5649090.5652770.5651880.5650420.5648770.5643550.5636490.5632570.5630120.5623940.5613790.5603410.5593980.5580060.5557780.5531710.5508120.5486780.5466160.5445790.5421490.5384480.5338930.5290180.5234220.5166820.5091480.5010180.4914440.4805270.4690450.4568730.4439330.4304790.4167390.4024840.3875220.3728580.3595460.3472540.3349750.3227340.3114260.3011260.2916580.2829980.274830.2669870.2595670.252440.2453520.2383490.231530.2246450.2177890.2110070.2040990.1973680.1913490.1858420.1803740.174980.1699710.1654020.1610850.1569950.1532940.1501040.1474430.1450520.1428270.1407070.1386630.1367620.135060.1335520.1321910.1309640.129819
30.4647310.4714160.478280.487330.4971170.5030040.5055790.5093160.5146490.5193170.5231920.5273090.5314340.5347480.5370760.5390330.5408110.5418830.5421830.542320.5427440.5432930.5436750.543720.5435890.5438680.5444870.5446480.5442390.5436420.5429870.5421220.5410040.5398530.5389040.5378620.5361590.5340210.5321380.5300540.5273270.5244620.5213290.517030.5112950.5045190.4965510.4872210.4768650.4649690.4517860.4382250.4246160.410580.3958090.3805950.3652250.3497050.3342530.3193780.3052040.2912550.2776190.2652140.254450.2449070.2362550.2284980.2217740.2160810.2109530.2062540.2022260.1987690.1955770.1925920.1899740.1876270.1853560.1833140.1814770.1792740.1765020.1740010.1724650.1710990.1692750.1672750.1655310.1640220.1625290.1611490.1600610.1591140.1579310.1565770.1553410.1542780.1532730.152306
Y = dat.Y
@head Y
... (280, 2)
3×2 DataFrame
Rowyeartbc
Int64Float64
120091.58068
220097.85516
320091.77595
year = Y.year 
tab(year)
OrderedCollections.OrderedDict{Int64, Int64} with 5 entries:
  2009 => 42
  2010 => 47
  2011 => 40
  2012 => 71
  2013 => 80
s = year .<= 2012
Xtrain = X[s, :] 
Xtest = rmrow(X, s) 
ntot = nro(X) 
ntrain = nro(Xtrain) 
ntest = nro(Xtest)
(ntot = ntot, ntrain, ntest)
(ntot = 280, ntrain = 200, ntest = 80)

Model fitting and scores T

model = pcasvd(nlv = 10) 
fit!(model, Xtrain)  
fitm = model.fitm 
@head Ttrain = fitm.T
3×10 Matrix{Float64}:
 -1.49132  -0.935752  -0.0893801  …  -0.00890547  -0.0121735  0.0475741
 -1.92027  -1.22367    0.574574       0.0308477    0.0256603  0.0107245
 -1.32825  -1.06403    0.355515       0.0604713    0.0221343  0.0217253
... (200, 10)
@head Ttest = transf(model, Xtest)
3×10 Matrix{Float64}:
  0.609136   1.11612   -0.219233   0.00860998  …   0.00491884  -0.010557
 -0.0770536  1.16518    0.142179  -0.147944       -0.0178558   -0.00200614
  3.88997    0.875433  -0.682212   0.0649359      -0.110588     0.0311519
... (80, 10)

2-D score spaces

CairoMakie.activate!()
T = vcat(Ttrain, Ttest)  
group = [repeat(["Train"], ntrain); repeat(["Test"], ntest)]
plotxy(T[:, 1], T[:, 2], group; zeros = true, xlabel = "PC1", ylabel = "PC2").f
color = [(:red, .5), (:blue, .5)]
plotlv(T[:, 1:6], group; size = (750, 400), shape = (2, 3), color = color, zeros = true, xlabel = "PC", ylabel = "PC").f

3-D score spaces

CairoMakie.activate!()  
#GLMakie.activate!()    # for interactive axe-rotation

i = 1
size = (600, 350)
plotxyz(T[:, i], T[:, i + 1], T[:, i + 2], group; size, color, markersize = 10, 
    xlabel = string("PC", i), ylabel = string("PC", i + 1), zlabel = string("PC", i + 2), 
    title = "Pca score space").f