Storing the complete [Compboost] object requires saving a lot of data:
- Data matrices of the raw data.
- Transformed data matrices. Each base learner creates a design matrix with (potentially) multiple columns.
Hence, compboost
allows storing the model without the
data. Within this vignette, this is also called production mode since
it is the more practical case when running the model in production.
Store model without data
To do so, just call:
dat = mlr3::tsk("sonar")$data()
cboost = boostSplines(dat, "Class", oob_fraction = 0.3)
#> 1/100 risk = 0.68 oob_risk = 0.69 time = 0
#> 2/100 risk = 0.68 oob_risk = 0.69 time = 1347
#> 4/100 risk = 0.67 oob_risk = 0.68 time = 3438
#> 6/100 risk = 0.67 oob_risk = 0.67 time = 5624
#> 8/100 risk = 0.66 oob_risk = 0.67 time = 55821
#> 10/100 risk = 0.66 oob_risk = 0.66 time = 58734
#> 12/100 risk = 0.65 oob_risk = 0.66 time = 60877
#> 14/100 risk = 0.64 oob_risk = 0.66 time = 63297
#> 16/100 risk = 0.64 oob_risk = 0.65 time = 65593
#> 18/100 risk = 0.63 oob_risk = 0.65 time = 67773
#> 20/100 risk = 0.63 oob_risk = 0.65 time = 70172
#> 22/100 risk = 0.62 oob_risk = 0.64 time = 72525
#> 24/100 risk = 0.62 oob_risk = 0.64 time = 74686
#> 26/100 risk = 0.62 oob_risk = 0.64 time = 77008
#> 28/100 risk = 0.61 oob_risk = 0.64 time = 79352
#> 30/100 risk = 0.61 oob_risk = 0.63 time = 81503
#> 32/100 risk = 0.6 oob_risk = 0.63 time = 83852
#> 34/100 risk = 0.6 oob_risk = 0.63 time = 86210
#> 36/100 risk = 0.6 oob_risk = 0.63 time = 88377
#> 38/100 risk = 0.59 oob_risk = 0.63 time = 90541
#> 40/100 risk = 0.59 oob_risk = 0.63 time = 92934
#> 42/100 risk = 0.58 oob_risk = 0.62 time = 95322
#> 44/100 risk = 0.58 oob_risk = 0.62 time = 97583
#> 46/100 risk = 0.58 oob_risk = 0.62 time = 99907
#> 48/100 risk = 0.57 oob_risk = 0.62 time = 102335
#> 50/100 risk = 0.57 oob_risk = 0.62 time = 104637
#> 52/100 risk = 0.57 oob_risk = 0.62 time = 106788
#> 54/100 risk = 0.56 oob_risk = 0.62 time = 109006
#> 56/100 risk = 0.56 oob_risk = 0.61 time = 111298
#> 58/100 risk = 0.56 oob_risk = 0.61 time = 113548
#> 60/100 risk = 0.55 oob_risk = 0.61 time = 115785
#> 62/100 risk = 0.55 oob_risk = 0.61 time = 118536
#> 64/100 risk = 0.55 oob_risk = 0.61 time = 120680
#> 66/100 risk = 0.54 oob_risk = 0.61 time = 123637
#> 68/100 risk = 0.54 oob_risk = 0.6 time = 126100
#> 70/100 risk = 0.54 oob_risk = 0.6 time = 128294
#> 72/100 risk = 0.54 oob_risk = 0.6 time = 130417
#> 74/100 risk = 0.53 oob_risk = 0.6 time = 132645
#> 76/100 risk = 0.53 oob_risk = 0.6 time = 134853
#> 78/100 risk = 0.53 oob_risk = 0.6 time = 137113
#> 80/100 risk = 0.52 oob_risk = 0.6 time = 139216
#> 82/100 risk = 0.52 oob_risk = 0.6 time = 141446
#> 84/100 risk = 0.52 oob_risk = 0.6 time = 143843
#> 86/100 risk = 0.52 oob_risk = 0.59 time = 146000
#> 88/100 risk = 0.51 oob_risk = 0.59 time = 148231
#> 90/100 risk = 0.51 oob_risk = 0.59 time = 150600
#> 92/100 risk = 0.51 oob_risk = 0.59 time = 152805
#> 94/100 risk = 0.51 oob_risk = 0.59 time = 155026
#> 96/100 risk = 0.5 oob_risk = 0.59 time = 157181
#> 98/100 risk = 0.5 oob_risk = 0.59 time = 159356
#> 100/100 risk = 0.5 oob_risk = 0.59 time = 161484
#>
#>
#> Train 100 iterations in 0 Seconds.
#> Final risk based on the train set: 0.5
file = "cboost.json"
cboost$saveToJson(file, rm_data = TRUE)
cboost_without_data = Compboost$new(file = file)
# The data field now just contains a dummy:
cboost_without_data$data
#> V1 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V2 V20 V21 V22 V23 V24 V25 V26 V27
#> 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#> V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V4 V40 V41 V42 V43 V44 V45
#> 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#> V46 V47 V48 V49 V5 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V6 V60 V7 V8 V9
#> 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Note: It is not possible to use any functionality
that requires the training data when storing and loading the object
without data. For example, cboost_without_data$predict()
without new data now throws an
error:
cboost_without_data$predict()
#> Error in eval(expr, envir, enclos): Production mode is on, this does not allow prediction on training data and hence also blocks the continuation of the training. This is most likely because the training data was removed to either store memory or due to privacy reasons.
Functionality of a data free model
The most important functions are still usable:
Predict on new data
ndat = dat[1:10, ]
cboost_without_data$predict(ndat)
#> [,1]
#> [1,] -0.316230234
#> [2,] -0.578806219
#> [3,] -0.671616251
#> [4,] -0.246418533
#> [5,] -0.265362743
#> [6,] -0.186142444
#> [7,] -0.113103124
#> [8,] 0.534978693
#> [9,] 0.006797237
#> [10,] -0.791081934
Visualize partial feature effects.
library(patchwork)
# Use most important base learner:
bln = vip$baselearner[1]
plotBaselearner(cboost_without_data, bln) +
plotPEUni(cboost_without_data, strsplit(bln, "_")[[1]][1])
Get logger data
head(cboost_without_data$getLoggerData())
#> _iterations oob_risk time baselearner train_risk
#> 1 0 NA NA intercept 0.6885426
#> 2 1 0.6898895 0 V12_spline 0.6848696
#> 3 2 0.6867225 1347 V12_spline 0.6812831
#> 4 3 0.6836441 2419 V12_spline 0.6777811
#> 5 4 0.6806519 3438 V12_spline 0.6743615
#> 6 5 0.6777437 4513 V12_spline 0.6710224
Setting the model to a previous iteration.
table(cboost_without_data$getSelectedBaselearner())
#>
#> V12_spline V13_spline V21_spline V23_spline V36_spline V49_spline V52_spline
#> 34 8 20 5 3 12 3
#> V9_spline
#> 15
cboost_without_data$predict(ndat)
#> [,1]
#> [1,] -0.316230234
#> [2,] -0.578806219
#> [3,] -0.671616251
#> [4,] -0.246418533
#> [5,] -0.265362743
#> [6,] -0.186142444
#> [7,] -0.113103124
#> [8,] 0.534978693
#> [9,] 0.006797237
#> [10,] -0.791081934
# State after 50 iterations:
cboost_without_data$train(50)
table(cboost_without_data$getSelectedBaselearner())
#>
#> V12_spline V13_spline V21_spline V49_spline V9_spline
#> 27 1 12 1 9
cboost_without_data$predict(ndat)
#> [,1]
#> [1,] -0.147961460
#> [2,] -0.266152590
#> [3,] -0.473792288
#> [4,] -0.098510646
#> [5,] -0.007900711
#> [6,] 0.091077307
#> [7,] -0.021964105
#> [8,] 0.510638052
#> [9,] -0.044479267
#> [10,] -0.478230886
Advantages
Loading
Loading a model stored without data is much faster than loading the full model (the difference may not be that striking for smaller models):
system.time(Compboost$new(file = file))
#> user system elapsed
#> 0.135 0.000 0.136
system.time(Compboost$new(file = file_full))
#> user system elapsed
#> 0.195 0.000 0.195