# Packages used throughout this script:
#   Jchemo / JchemoData    - models, helper macros and the example dataset
#   JLD2                   - `@load` for reading the .jld2 data file
#   DataFrames             - tabular containers holding X and Y
#   CairoMakie / GLMakie   - Makie plotting backends (static / interactive)
#   FreqTables             - `freqtable` for frequency and contingency tables
using Jchemo, JchemoData
using JLD2, DataFrames, CairoMakie, GLMakie
using FreqTables
# Locate the JchemoData package root: `pathof` points at the package's main
# source file, so two `dirname` calls climb up to the package directory.
path_jdat = dirname(dirname(pathof(JchemoData)))
db = joinpath(path_jdat, "data/challenge2018.jld2")

# Load the object `dat` stored in the file, then list its field names.
@load db dat
@names dat
(:X, :Y)
# Spectral data table (one row per observation, one column per wavelength).
X = dat.X
# Show the first rows and the dimensions of X.
@head X
... (4075, 680)
Row | 1120 | 1122 | 1124 | 1126 | 1128 | 1130 | 1132 | 1134 | 1136 | 1138 | 1140 | 1142 | 1144 | 1146 | 1148 | 1150 | 1152 | 1154 | 1156 | 1158 | 1160 | 1162 | 1164 | 1166 | 1168 | 1170 | 1172 | 1174 | 1176 | 1178 | 1180 | 1182 | 1184 | 1186 | 1188 | 1190 | 1192 | 1194 | 1196 | 1198 | 1200 | 1202 | 1204 | 1206 | 1208 | 1210 | 1212 | 1214 | 1216 | 1218 | 1220 | 1222 | 1224 | 1226 | 1228 | 1230 | 1232 | 1234 | 1236 | 1238 | 1240 | 1242 | 1244 | 1246 | 1248 | 1250 | 1252 | 1254 | 1256 | 1258 | 1260 | 1262 | 1264 | 1266 | 1268 | 1270 | 1272 | 1274 | 1276 | 1278 | 1280 | 1282 | 1284 | 1286 | 1288 | 1290 | 1292 | 1294 | 1296 | 1298 | 1300 | 1302 | 1304 | 1306 | 1308 | 1310 | 1312 | 1314 | 1316 | 1318 | ⋯ |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | Float64 | ⋯ | |
1 | 0.597482 | 0.595978 | 0.593623 | 0.59084 | 0.587451 | 0.583092 | 0.578666 | 0.572134 | 0.566125 | 0.560204 | 0.551694 | 0.544339 | 0.537008 | 0.528312 | 0.520828 | 0.51228 | 0.504673 | 0.498204 | 0.48968 | 0.483176 | 0.477588 | 0.470384 | 0.464844 | 0.459269 | 0.454023 | 0.4496 | 0.444271 | 0.440026 | 0.436514 | 0.431822 | 0.428201 | 0.424881 | 0.421379 | 0.41859 | 0.415556 | 0.413193 | 0.411501 | 0.40946 | 0.408254 | 0.407503 | 0.406948 | 0.406903 | 0.407211 | 0.407941 | 0.408835 | 0.410495 | 0.412211 | 0.41412 | 0.416691 | 0.419178 | 0.421902 | 0.424632 | 0.427036 | 0.43022 | 0.432809 | 0.434996 | 0.437823 | 0.439963 | 0.442041 | 0.443982 | 0.445578 | 0.447395 | 0.448785 | 0.449829 | 0.451165 | 0.452131 | 0.453035 | 0.453898 | 0.454573 | 0.455325 | 0.455932 | 0.456439 | 0.45706 | 0.457588 | 0.458121 | 0.458712 | 0.459253 | 0.459968 | 0.460628 | 0.461245 | 0.462104 | 0.462849 | 0.463604 | 0.464367 | 0.464976 | 0.465614 | 0.466088 | 0.466315 | 0.466568 | 0.466553 | 0.466394 | 0.465981 | 0.465464 | 0.464522 | 0.463511 | 0.462348 | 0.460648 | 0.458807 | 0.456757 | 0.454314 | ⋯ |
2 | 0.954192 | 0.953237 | 0.952002 | 0.950426 | 0.948382 | 0.946138 | 0.943543 | 0.94057 | 0.937514 | 0.934336 | 0.931181 | 0.92803 | 0.924979 | 0.922076 | 0.919204 | 0.916252 | 0.913228 | 0.91006 | 0.906591 | 0.902934 | 0.899069 | 0.894974 | 0.890786 | 0.886513 | 0.882297 | 0.878178 | 0.874064 | 0.870132 | 0.866404 | 0.862724 | 0.859357 | 0.856211 | 0.853359 | 0.850962 | 0.848901 | 0.84742 | 0.846361 | 0.845615 | 0.845178 | 0.845054 | 0.845031 | 0.845245 | 0.845549 | 0.845977 | 0.846463 | 0.847118 | 0.847808 | 0.848598 | 0.849367 | 0.850203 | 0.851115 | 0.85195 | 0.852774 | 0.853623 | 0.854617 | 0.855511 | 0.856426 | 0.857282 | 0.858228 | 0.859145 | 0.860036 | 0.860869 | 0.861774 | 0.862626 | 0.863431 | 0.864185 | 0.864786 | 0.865254 | 0.865612 | 0.865936 | 0.866131 | 0.866294 | 0.8663 | 0.866269 | 0.86628 | 0.866326 | 0.866443 | 0.866614 | 0.866967 | 0.867366 | 0.867852 | 0.868533 | 0.869212 | 0.869894 | 0.870584 | 0.871296 | 0.872008 | 0.872612 | 0.873183 | 0.873633 | 0.874032 | 0.874337 | 0.87455 | 0.87461 | 0.874656 | 0.874554 | 0.87432 | 0.873982 | 0.873513 | 0.872882 | ⋯ |
3 | 0.611137 | 0.609566 | 0.60743 | 0.604767 | 0.601434 | 0.597316 | 0.592598 | 0.586992 | 0.580741 | 0.574189 | 0.566943 | 0.559381 | 0.551773 | 0.543753 | 0.535977 | 0.52818 | 0.520482 | 0.513455 | 0.506598 | 0.500236 | 0.494469 | 0.488922 | 0.483579 | 0.478512 | 0.473364 | 0.468619 | 0.463914 | 0.459365 | 0.455331 | 0.451357 | 0.447738 | 0.444393 | 0.4412 | 0.43831 | 0.435652 | 0.433147 | 0.431142 | 0.429417 | 0.428137 | 0.427358 | 0.427051 | 0.427196 | 0.427831 | 0.428873 | 0.43028 | 0.432035 | 0.434176 | 0.436401 | 0.438986 | 0.441528 | 0.444208 | 0.446958 | 0.449588 | 0.452339 | 0.455044 | 0.457599 | 0.460146 | 0.462568 | 0.464761 | 0.466901 | 0.468719 | 0.47045 | 0.472001 | 0.473309 | 0.47453 | 0.475624 | 0.476567 | 0.477469 | 0.478218 | 0.478939 | 0.479589 | 0.480172 | 0.480771 | 0.481325 | 0.48187 | 0.482473 | 0.483049 | 0.483702 | 0.484401 | 0.485088 | 0.485835 | 0.486543 | 0.487239 | 0.487919 | 0.488478 | 0.488971 | 0.489365 | 0.489597 | 0.489697 | 0.48958 | 0.489309 | 0.488807 | 0.488128 | 0.487165 | 0.485927 | 0.484502 | 0.48277 | 0.480747 | 0.47851 | 0.475817 | ⋯ |
# Metadata table with columns typ, label, conc and test.
Y = dat.Y
# Show the first rows and the dimensions of Y.
@head Y
... (4075, 4)
Row | typ | label | conc | test |
---|---|---|---|---|
String | String | Float64 | Int64 | |
1 | FRG | wheat (ung) | 12.74 | 0 |
2 | MPW | milk powder & whey | 35.7212 | 0 |
3 | FRG | wheat (ung) | 12.0 | 0 |
# Column names of X are the wavelengths stored as strings;
# convert them to integers for use as the x-axis of spectral plots.
wlst = names(X)
wl = parse.(Int, wlst)
680-element Vector{Int64}: 1120 1122 1124 1126 1128 1130 1132 1134 1136 1138 ⋮ 2462 2464 2466 2468 2470 2472 2474 2476 2478
# Plot the raw spectra against wavelength and return the figure
# (`nsamp = 500`: presumably a subset of 500 spectra is drawn - confirm in Jchemo docs).
plotsp(
    X, wl;
    nsamp = 500,
    xlabel = "Wavelength (nm)",
).f
Preprocessing by SNV and Savitzky–Golay second derivative
# Preprocessing pipeline: SNV followed by a Savitzky-Golay filter
# (21-point window, 2nd derivative, polynomial of degree 3).
model1 = snv()
model2 = savgol(npoint = 21, deriv = 2, degree = 3)
model = pip(model1, model2)

# Fit the pipeline on X, then apply it to obtain the preprocessed spectra.
fit!(model, X)
Xp = transf(model, X)
@head Xp
3×680 Matrix{Float64}: -0.00393533 -0.00441755 -0.00477681 … 0.000514478 0.000481081 -0.00121436 -0.0013095 -0.00135921 0.000996321 0.000930884 -0.00355712 -0.00399785 -0.00434838 0.00044084 0.000421751 ... (4075, 680)
# Plot the preprocessed spectra against wavelength and return the figure.
plotsp(
    Xp, wl;
    nsamp = 500,
    xlabel = "Wavelength (nm)",
).f
Types of materials
# Material type code of each observation.
typ = Y.typ
# Count observations per "code - label" pair.
freqtable(string.(typ, " - ", Y.label))
10-element Named Vector{Int64} Dim1 │ ──────────────────────────┼──── ANF - animal feed │ 391 CLZ - rapeseed(ung) │ 420 CNG - corn gluten │ 395 EHH - grass silage │ 422 FFS - full fat soya │ 432 FRG - wheat (ung) │ 411 MPW - milk powder & whey │ 410 PEE - maize wp │ 407 SFG - sun flower seed(gr) │ 281 TTS - soya meal │ 506
# Training/test indicator of each observation (0 = training, 1 = test).
test = Y.test
# Tabulate the indicator on its own, then cross-tabulate it with material type.
tab(test)
freqtable(typ, test)
10×2 Named Matrix{Int64} Dim1 ╲ Dim2 │ 0 1 ────────────┼───────── ANF │ 351 40 CLZ │ 378 42 CNG │ 356 39 EHH │ 380 42 FFS │ 397 35 FRG │ 371 40 MPW │ 372 38 PEE │ 367 40 SFG │ 272 9 TTS │ 457 49
# Boolean mask selecting the test observations (same as: s = Y.test .== 1).
s = Bool.(test)

# Training set: rows flagged by the mask are removed.
Xtrain = rmrow(Xp, s)
typtrain = rmrow(typ, s)

# Test set: rows flagged by the mask are kept.
Xtest = Xp[s, :]
typtest = typ[s]

# Summary of the set sizes.
ntot = nro(X)
ntrain = nro(Xtrain)
ntest = nro(Xtest)
(; ntot, ntrain, ntest)
(ntot = 4075, ntrain = 3701, ntest = 374)
# UMAP settings: embedding dimension and neighbourhood parameters.
nlv = 3
n_neighbors = 40
min_dist = 0.4

model = umap(; nlv, n_neighbors, min_dist)
# Alternative (faster but less accurate):
#model = umap(; nlv, n_neighbors, min_dist, psamp = 0.5)

# Fit on the training spectra and retrieve the training scores.
fit!(model, Xtrain)
@head T = model.fitm.T
3×3 Matrix{Float64}: -4.75915 -7.84643 1.98314 2.63511 -7.86272 0.280644 -5.18164 -6.76126 1.25957 ... (3701, 3)
# Select the Makie backend used for the figures below.
CairoMakie.activate!()
# Switch to GLMakie for interactive axis rotation of the 3D plots:
#GLMakie.activate!()
# 3D scatter plot of the training scores on three consecutive
# latent variables, starting at index i.
i = 1
plotxyz(
    T[:, i], T[:, i + 1], T[:, i + 2];
    size = (600, 500),
    color = (:red, 0.3),
    markersize = 10,
    xlabel = string("LV", i),
    ylabel = string("LV", i + 1),
    zlabel = string("LV", i + 2),
    title = "Umap score space",
).f
# Levels of the training material types (via `mlev`) and one
# semi-transparent categorical colour per level.
lev = mlev(typtrain)
nlev = length(lev)
colm = cgrad(:tab10, nlev; categorical = true, alpha = 0.5)

# Same 3D score plot as before, with points grouped and coloured by material type.
i = 1
plotxyz(
    T[:, i], T[:, i + 1], T[:, i + 2], typtrain;
    size = (700, 500),
    color = colm,
    markersize = 10,
    xlabel = string("LV", i),
    ylabel = string("LV", i + 1),
    zlabel = string("LV", i + 2),
    title = "Umap score space",
).f
# Project the test spectra with the fitted UMAP model to get their scores,
# then show the first rows and the dimensions.
Ttest = transf(model, Xtest)
@head Ttest
3×3 Matrix{Float64}: 0.406886 10.6959 0.884709 3.03776 -3.52243 10.3022 -5.89002 -5.34931 3.1894 ... (374, 3)
# Training scores coloured by material type; keep the figure and axis
# so that further layers can be added.
f, ax = plotxyz(
    T[:, i], T[:, i + 1], T[:, i + 2], typtrain;
    size = (700, 500),
    color = colm,
    markersize = 10,
    xlabel = string("LV", i),
    ylabel = string("LV", i + 1),
    zlabel = string("LV", i + 2),
    title = "Umap score space",
)

# Overlay the projected test observations as smaller black points.
scatter!(
    ax, Ttest[:, i], Ttest[:, i + 1], Ttest[:, i + 2];
    markersize = 6,
    color = :black,
)
f