Centering a base learner by another one — BaselearnerCentered • compboost

This base learner subtracts the effect of one base learner from another one (usually both are defined on the same feature). After the subtraction, the centered base learner is no longer able to capture the effect of the other one. This becomes handy for decomposing effects into, e.g., a linear and a non-linear component in which the non-linear component is not capable of capturing the linear part and hence is selected after the linear effect is estimated.

Format

S4 object.

Fields

This class doesn't contain public fields.

Methods

$summarizeFactory(): () -> ()
$transformData(newdata): list(InMemoryData) -> matrix()
$getMeta(): () -> list()
$getRotation(): () -> matrix()

Inherited methods from Baselearner

$getData(): () -> matrix()
$getDF(): () -> integer()
$getPenalty(): () -> numeric()
$getPenaltyMat(): () -> matrix()
$getFeatureName(): () -> character()
$getModelName(): () -> character()
$getBaselearnerId(): () -> character()

Examples

# Sample data: 100 uniform draws on [0, 10] and a response composed of a
# sine wave, a linear trend, and Gaussian noise. No seed is set here, so
# the printed numbers below differ between runs:
x = runif(100, 0, 10)
y = 2 * sin(x) + 2 * x + rnorm(100, 0, 0.5)
dat = data.frame(x, y)

# S4 wrapper

# Create new data object, a matrix is required as input:
data_mat = cbind(x)
data_source = InMemoryData$new(data_mat, "x")

# Prerequisite: Create a linear and a spline base learner, both defined
# on the same data source:
bl_lin = BaselearnerPolynomial$new(data_source,
  list(degree = 1, intercept = TRUE))
bl_sp = BaselearnerPSpline$new(data_source,
  list(n_knots = 15, df = 5))

# Now, subtract the linear effect from the spline:
bl_ctr = BaselearnerCentered$new(bl_sp, bl_lin, "ctr")

# Note that the data matrix of this base learner has
# `nrow(bl_sp$getData()) - ncol(bl_lin$getData())` columns. `nrow()` is
# used for the spline because its data matrix holds the basis functions
# in rows (it is transposed before the rotation is applied below):
dim(bl_ctr$getData())
#> [1] 100  17
str(bl_ctr$getMeta())
#> List of 4
#>  $ df         : num [1, 1] 5
#>  $ penalty    : num [1, 1] 53.3
#>  $ penalty_mat: num [1:17, 1:17] 7.963 -3.366 1.699 0.388 0.031 ...
#>  $ rotation   : num [1:19, 1:17] -0.229 -0.409 0.82 -0.171 -0.143 ...

# The data matrix is created by rotating the (transposed) spline data
# matrix with the rotation matrix of the centered base learner:
all.equal(t(bl_sp$getData()) %*% bl_ctr$getRotation(), bl_ctr$getData())
#> [1] TRUE

# Transform "new data". Internally, the basis of the spline is built and
# then rotated by the rotation matrix to subtract the linear part.
# NOTE(review): `rnorm(5)` can produce values outside the range [0, 10]
# of the training feature; the identical rows in the output below
# presumably stem from such values -- confirm.
newdata = list(InMemoryData$new(cbind(rnorm(5)), "x"))
bl_ctr$transformData(newdata)
#> $design
#>             [,1]        [,2]       [,3]       [,4]       [,5]        [,6]
#> [1,]  0.45245382 -0.02044669 -0.1676660 -0.1632974 -0.1411660 -0.07238730
#> [2,] -0.17452332 -0.31839591 -0.2601975 -0.2471355 -0.2071931 -0.10173690
#> [3,]  0.03083665 -0.29021836 -0.2382564 -0.2227442 -0.1830025 -0.08716185
#> [4,] -0.17452332 -0.31839591 -0.2601975 -0.2471355 -0.2071931 -0.10173690
#> [5,] -0.17452332 -0.31839591 -0.2601975 -0.2471355 -0.2071931 -0.10173690
#>             [,7]        [,8]         [,9]      [,10]      [,11]      [,12]
#> [1,] -0.05628075 -0.04945426 -0.012491200 0.01188726 0.04122061 0.03383859
#> [2,] -0.06692038 -0.05063859  0.003403324 0.05818001 0.12044145 0.08405569
#> [3,] -0.04972386 -0.03159652  0.016010297 0.07575596 0.14223951 0.09481586
#> [4,] -0.06692038 -0.05063859  0.003403324 0.05818001 0.12044145 0.08405569
#> [5,] -0.06692038 -0.05063859  0.003403324 0.05818001 0.12044145 0.08405569
#>           [,13]      [,14]      [,15]      [,16]       [,17]
#> [1,] 0.05679459 0.08130003 0.08399657 0.07315466 0.009436883
#> [2,] 0.12287655 0.17067536 0.17015377 0.14617169 0.018774337
#> [3,] 0.13217194 0.18146835 0.17832783 0.15231835 0.019527924
#> [4,] 0.12287655 0.17067536 0.17015377 0.14617169 0.018774337
#> [5,] 0.12287655 0.17067536 0.17015377 0.14617169 0.018774337
#> 

# R6 wrapper

# Compboost has a wrapper called `$addComponents()` that automatically
# creates and adds the linear base learner and a centered base learner
# as above (the `...` args are passed to `BaselearnerPSpline$new()`).
# First, create the model object for the target `y`:
cboost = Compboost$new(dat, "y")

# Then add the linear and the centered spline component for feature `x`:
cboost$addComponents("x", n_knots = 10, df = 5, bin_root = 2)

# Note that we have used binning to save memory, hence the data matrix
# is reduced to 10 observations:
dim(cboost$baselearner_list$x_x_spline_centered$factory$getData())
#> [1] 10 12

# Train for 200 iterations (the second argument is presumably the trace
# frequency, with 0 suppressing per-iteration output -- confirm):
cboost$train(200, 0)
#> Train 200 iterations in 0 Seconds.
#> Final risk based on the train set: 0.33
#> 

library(ggplot2)

# Plot the estimated effect of `x` and overlay the observations with the
# model offset subtracted from `y`:
plotPEUni(cboost, "x") +
  geom_point(data = dat, aes(x = x, y = y - c(cboost$offset)), alpha = 0.2)