Production mode • compboost

Storing the complete Compboost object requires saving a lot of data:

Data matrices of the raw data.
Transformed data matrices. Each base learner creates a design matrix with (potentially) multiple columns.

Hence, compboost allows storing the model without the data. Within this vignette, this is called production mode, since it is the more practical option when running the model in production.

Store model without data

To do so, just call:

dat = mlr3::tsk("sonar")$data()
cboost = boostSplines(dat, "Class", oob_fraction = 0.3)
#>   1/100   risk = 0.68  oob_risk = 0.69   time = 0   
#>   2/100   risk = 0.68  oob_risk = 0.69   time = 1347   
#>   4/100   risk = 0.67  oob_risk = 0.68   time = 3438   
#>   6/100   risk = 0.67  oob_risk = 0.67   time = 5624   
#>   8/100   risk = 0.66  oob_risk = 0.67   time = 55821   
#>  10/100   risk = 0.66  oob_risk = 0.66   time = 58734   
#>  12/100   risk = 0.65  oob_risk = 0.66   time = 60877   
#>  14/100   risk = 0.64  oob_risk = 0.66   time = 63297   
#>  16/100   risk = 0.64  oob_risk = 0.65   time = 65593   
#>  18/100   risk = 0.63  oob_risk = 0.65   time = 67773   
#>  20/100   risk = 0.63  oob_risk = 0.65   time = 70172   
#>  22/100   risk = 0.62  oob_risk = 0.64   time = 72525   
#>  24/100   risk = 0.62  oob_risk = 0.64   time = 74686   
#>  26/100   risk = 0.62  oob_risk = 0.64   time = 77008   
#>  28/100   risk = 0.61  oob_risk = 0.64   time = 79352   
#>  30/100   risk = 0.61  oob_risk = 0.63   time = 81503   
#>  32/100   risk = 0.6  oob_risk = 0.63   time = 83852   
#>  34/100   risk = 0.6  oob_risk = 0.63   time = 86210   
#>  36/100   risk = 0.6  oob_risk = 0.63   time = 88377   
#>  38/100   risk = 0.59  oob_risk = 0.63   time = 90541   
#>  40/100   risk = 0.59  oob_risk = 0.63   time = 92934   
#>  42/100   risk = 0.58  oob_risk = 0.62   time = 95322   
#>  44/100   risk = 0.58  oob_risk = 0.62   time = 97583   
#>  46/100   risk = 0.58  oob_risk = 0.62   time = 99907   
#>  48/100   risk = 0.57  oob_risk = 0.62   time = 102335   
#>  50/100   risk = 0.57  oob_risk = 0.62   time = 104637   
#>  52/100   risk = 0.57  oob_risk = 0.62   time = 106788   
#>  54/100   risk = 0.56  oob_risk = 0.62   time = 109006   
#>  56/100   risk = 0.56  oob_risk = 0.61   time = 111298   
#>  58/100   risk = 0.56  oob_risk = 0.61   time = 113548   
#>  60/100   risk = 0.55  oob_risk = 0.61   time = 115785   
#>  62/100   risk = 0.55  oob_risk = 0.61   time = 118536   
#>  64/100   risk = 0.55  oob_risk = 0.61   time = 120680   
#>  66/100   risk = 0.54  oob_risk = 0.61   time = 123637   
#>  68/100   risk = 0.54  oob_risk = 0.6   time = 126100   
#>  70/100   risk = 0.54  oob_risk = 0.6   time = 128294   
#>  72/100   risk = 0.54  oob_risk = 0.6   time = 130417   
#>  74/100   risk = 0.53  oob_risk = 0.6   time = 132645   
#>  76/100   risk = 0.53  oob_risk = 0.6   time = 134853   
#>  78/100   risk = 0.53  oob_risk = 0.6   time = 137113   
#>  80/100   risk = 0.52  oob_risk = 0.6   time = 139216   
#>  82/100   risk = 0.52  oob_risk = 0.6   time = 141446   
#>  84/100   risk = 0.52  oob_risk = 0.6   time = 143843   
#>  86/100   risk = 0.52  oob_risk = 0.59   time = 146000   
#>  88/100   risk = 0.51  oob_risk = 0.59   time = 148231   
#>  90/100   risk = 0.51  oob_risk = 0.59   time = 150600   
#>  92/100   risk = 0.51  oob_risk = 0.59   time = 152805   
#>  94/100   risk = 0.51  oob_risk = 0.59   time = 155026   
#>  96/100   risk = 0.5  oob_risk = 0.59   time = 157181   
#>  98/100   risk = 0.5  oob_risk = 0.59   time = 159356   
#> 100/100   risk = 0.5  oob_risk = 0.59   time = 161484   
#> 
#> 
#> Train 100 iterations in 0 Seconds.
#> Final risk based on the train set: 0.5

file = "cboost.json"
cboost$saveToJson(file, rm_data = TRUE)

cboost_without_data = Compboost$new(file = file)

# The data field now just contains a dummy:
cboost_without_data$data
#>   V1 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V2 V20 V21 V22 V23 V24 V25 V26 V27
#> 1  0   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0   0   0   0
#>   V28 V29 V3 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V4 V40 V41 V42 V43 V44 V45
#> 1   0   0  0   0   0   0   0   0   0   0   0   0   0  0   0   0   0   0   0   0
#>   V46 V47 V48 V49 V5 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V6 V60 V7 V8 V9
#> 1   0   0   0   0  0   0   0   0   0   0   0   0   0   0   0  0   0  0  0  0

Note: After storing and loading the object without data, it is not possible to use any functionality that requires the training data. For example, calling cboost_without_data$predict() without new data now throws an error:

cboost_without_data$predict()
#> Error in eval(expr, envir, enclos): Production mode is on, this does not allow prediction on training data and hence also blocks the continuation of the training. This is most likely because the training data was removed to either store memory or due to privacy reasons.

Functionality of a data free model

The most important functions are still usable:

Extracting feature importance.

vip = cboost_without_data$calculateFeatureImportance()

Predict on new data

ndat = dat[1:10, ]
cboost_without_data$predict(ndat)
#>               [,1]
#>  [1,] -0.316230234
#>  [2,] -0.578806219
#>  [3,] -0.671616251
#>  [4,] -0.246418533
#>  [5,] -0.265362743
#>  [6,] -0.186142444
#>  [7,] -0.113103124
#>  [8,]  0.534978693
#>  [9,]  0.006797237
#> [10,] -0.791081934

Visualize partial feature effects.

library(patchwork)

# Use most important base learner:
bln = vip$baselearner[1]
plotBaselearner(cboost_without_data, bln) +
plotPEUni(cboost_without_data, strsplit(bln, "_")[[1]][1])

Get logger data

head(cboost_without_data$getLoggerData())
#>   _iterations  oob_risk time baselearner train_risk
#> 1           0        NA   NA   intercept  0.6885426
#> 2           1 0.6898895    0  V12_spline  0.6848696
#> 3           2 0.6867225 1347  V12_spline  0.6812831
#> 4           3 0.6836441 2419  V12_spline  0.6777811
#> 5           4 0.6806519 3438  V12_spline  0.6743615
#> 6           5 0.6777437 4513  V12_spline  0.6710224

Setting the model to a previous iteration.

table(cboost_without_data$getSelectedBaselearner())
#> 
#> V12_spline V13_spline V21_spline V23_spline V36_spline V49_spline V52_spline 
#>         34          8         20          5          3         12          3 
#>  V9_spline 
#>         15
cboost_without_data$predict(ndat)
#>               [,1]
#>  [1,] -0.316230234
#>  [2,] -0.578806219
#>  [3,] -0.671616251
#>  [4,] -0.246418533
#>  [5,] -0.265362743
#>  [6,] -0.186142444
#>  [7,] -0.113103124
#>  [8,]  0.534978693
#>  [9,]  0.006797237
#> [10,] -0.791081934

# State after 50 iterations:
cboost_without_data$train(50)
table(cboost_without_data$getSelectedBaselearner())
#> 
#> V12_spline V13_spline V21_spline V49_spline  V9_spline 
#>         27          1         12          1          9
cboost_without_data$predict(ndat)
#>               [,1]
#>  [1,] -0.147961460
#>  [2,] -0.266152590
#>  [3,] -0.473792288
#>  [4,] -0.098510646
#>  [5,] -0.007900711
#>  [6,]  0.091077307
#>  [7,] -0.021964105
#>  [8,]  0.510638052
#>  [9,] -0.044479267
#> [10,] -0.478230886

Advantages

Size

The model and its JSON file are much smaller when the data is not stored.

file_full = "cboost_full.json"
cboost$saveToJson(file_full)

file.info(file)$size / 1024^2
#> [1] 1.310535
file.info(file_full)$size / 1024^2
#> [1] 3.352463

Loading

Loading a model that was stored without data is much faster (although the difference is less striking for small models):

system.time(Compboost$new(file = file))
#>    user  system elapsed 
#>   0.135   0.000   0.136
system.time(Compboost$new(file = file_full))
#>    user  system elapsed 
#>   0.195   0.000   0.195

Privacy

Raw data is not shared unintentionally with third parties. This is especially important for domains that work with sensitive data.