Skip to content

Commit 48d206d

Browse files
authored
Merge pull request #109 from mlr-org/mtry.ratio
Support mtry.ratio and sampsize.ratio
2 parents ce33c22 + 7c93fa7 commit 48d206d

20 files changed

+288
-92
lines changed

.ignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
man/
2+
attic/
3+
pkgdown/
4+
revdep/
5+
docs/

DESCRIPTION

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: mlr3extralearners
22
Title: Extra Learners For mlr3
3-
Version: 0.5.6
3+
Version: 0.5.7
44
Authors@R:
55
c(person(given = "Raphael",
66
family = "Sonabend",
@@ -44,7 +44,7 @@ Imports:
4444
data.table,
4545
methods,
4646
mlr3 (>= 0.6.0),
47-
mlr3misc,
47+
mlr3misc (>= 0.9.4),
4848
paradox,
4949
R6
5050
Suggests:
@@ -108,4 +108,4 @@ Config/testthat/edition: 3
108108
Encoding: UTF-8
109109
NeedsCompilation: no
110110
Roxygen: list(markdown = TRUE, r6 = TRUE)
111-
RoxygenNote: 7.1.1
111+
RoxygenNote: 7.1.2

NEWS.md

+7
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
1+
# mlr3extralearners 0.5.7
2+
3+
* Introduced new custom hyperparameters for `randomForestSRC::rfsrc()`,
4+
`partykit::cforest()` and `obliqueRSF::ORSF()` to conveniently tune
5+
hyperparameters whose upper limit depends on data dimensions.
6+
17
# mlr3extralearners 0.5.6
28

39
* Fix learners requiring distr6. distr6 1.6.0 now forced and param6 added to suggests
410

11+
512
# mlr3extralearners 0.5.5
613

714
* Bugfix `regr.gausspr`

R/bibentries.R

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
bibentries = c( # nolint start
2+
breiman_2001 = bibentry("article",
3+
title = "Random Forests",
4+
author = "Breiman, Leo",
5+
year = "2001",
6+
journal = "Machine Learning",
7+
volume = "45",
8+
number = "1",
9+
pages = "5--32",
10+
doi = "10.1023/A:1010933404324",
11+
issn = "1573-0565"
12+
),
13+
14+
ishwaran_2008 = bibentry("article",
15+
doi = "10.1214/08-aoas169",
16+
url = "https://doi.org/10.1214/08-aoas169",
17+
year = "2008",
18+
month = "9",
19+
publisher = "Institute of Mathematical Statistics",
20+
volume = "2",
21+
number = "3",
22+
author = "Hemant Ishwaran and Udaya B. Kogalur and Eugene H. Blackstone and Michael S. Lauer",
23+
title = "Random survival forests",
24+
journal = "The Annals of Applied Statistics"
25+
),
26+
27+
hothorn_2015 = bibentry("article",
28+
author = "Torsten Hothorn and Achim Zeileis",
29+
title = "partykit: A Modular Toolkit for Recursive Partytioning in R",
30+
journal = "Journal of Machine Learning Research",
31+
year = "2015",
32+
volume = "16",
33+
number = "118",
34+
pages = "3905-3909",
35+
url = "http://jmlr.org/papers/v16/hothorn15a.html"
36+
),
37+
38+
hothorn_2006 = bibentry("article",
39+
doi = "10.1198/106186006x133933",
40+
url = "https://doi.org/10.1198/106186006x133933",
41+
year = "2006",
42+
month = "9",
43+
publisher = "Informa {UK} Limited",
44+
volume = "15",
45+
number = "3",
46+
pages = "651--674",
47+
author = "Torsten Hothorn and Kurt Hornik and Achim Zeileis",
48+
title = "Unbiased Recursive Partitioning: A Conditional Inference Framework",
49+
journal = "Journal of Computational and Graphical Statistics"
50+
),
51+
52+
jaeger_2019 = bibentry("article",
53+
doi = "10.1214/19-aoas1261",
54+
year = "2019",
55+
month = "9",
56+
publisher = "Institute of Mathematical Statistics",
57+
volume = "13",
58+
number = "3",
59+
author = "Byron C. Jaeger and D. Leann Long and Dustin M. Long and Mario Sims and Jeff M. Szychowski and Yuan-I Min and Leslie A. Mcclure and George Howard and Noah Simon",
60+
title = "Oblique random survival forests",
61+
journal = "The Annals of Applied Statistics"
62+
)
63+
) # nolint end

R/helpers.R

+37
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,40 @@ pprob_to_matrix <- function(pp, task) {
5757
colnames(y) <- task$class_names
5858
y
5959
}
60+
61+
#' @title Convert a Ratio Hyperparameter
62+
#'
63+
#' @description
64+
#' Given the named list `pv` (values of a [ParamSet]), converts a possibly provided hyperparameter
65+
#' called `ratio` to an integer hyperparameter `target`.
66+
#' If both are found in `pv`, an exception is thrown.
67+
#'
68+
#' @param pv (named `list()`).
69+
#' @param target (`character(1)`)\cr
70+
#' Name of the integer hyperparameter.
71+
#' @param ratio (`character(1)`)\cr
72+
#' Name of the ratio hyperparameter.
73+
#' @param n (`integer(1)`)\cr
74+
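#' Total count that the ratio refers to, e.g. the number of features (for `mtry`) or the number of observations (for `sampsize`).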
75+
#'
76+
#' @return (named `list()`) with new hyperparameter settings.
77+
#' @noRd
78+
convert_ratio = function(pv, target, ratio, n) {
79+
switch(to_decimal(c(target, ratio) %in% names(pv)) + 1L,
80+
# neither `target` nor `ratio` is set: nothing to convert
81+
pv,
82+
83+
# only `ratio` is set: derive `target` from it and drop `ratio`
84+
{
85+
pv[[target]] = max(ceiling(pv[[ratio]] * n), 1)
86+
remove_named(pv, ratio)
87+
},
88+
89+
90+
# only `target` is set: keep it unchanged
91+
pv,
92+
93+
# both `target` and `ratio` are set: conflicting specification
94+
stopf("Hyperparameters '%s' and '%s' are mutually exclusive", target, ratio)
95+
)
96+
}

R/learner_obliqueRSF_surv_obliqueRSF.R

+8-5
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@
1212
#' - Actual default: `TRUE`
1313
#' - Adjusted default: `FALSE`
1414
#' - Reason for change: mlr3 already has its own verbose set to `TRUE` by default
15+
#' - `mtry`:
16+
#' - This hyperparameter can alternatively be set via the added hyperparameter `mtry_ratio`
17+
#' as `mtry = max(ceiling(mtry_ratio * n_features), 1)`.
18+
#' Note that `mtry` and `mtry_ratio` are mutually exclusive.
1519
#'
1620
#' @references
17-
#' Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min Y, Mcclure LA, Howard G, Simon N (2019).
18-
#' “Oblique random survival forests.” The Annals of Applied Statistics, 13(3), 1847–1883.
19-
#' ISSN 1932-6157, 1941-7330, doi: 10.1214/19-AOAS1261,
20-
#' https://projecteuclid.org/euclid.aoas/1571277776.
21+
#' `r format_bib("jaeger_2019")`
2122
#'
2223
#' @template seealso_learner
2324
#' @template example
@@ -42,6 +43,7 @@ LearnerSurvObliqueRSF = R6Class("LearnerSurvObliqueRSF",
4243
max_pval_to_split_node = p_dbl(lower = 0, upper = 1, default = 0.5,
4344
tags = "train"),
4445
mtry = p_int(lower = 1, tags = "train"),
46+
mtry_ratio = p_dbl(0, 1, tags = "train"),
4547
dfmax = p_int(lower = 1, tags = "train"),
4648
use.cv = p_lgl(default = FALSE, tags = "train"),
4749
verbose = p_lgl(default = TRUE, tags = "train"),
@@ -76,11 +78,12 @@ LearnerSurvObliqueRSF = R6Class("LearnerSurvObliqueRSF",
7678
private = list(
7779
.train = function(task) {
7880
pv = self$param_set$get_values(tags = "train")
81+
pv = convert_ratio(pv, "mtry", "mtry_ratio", length(task$feature_names))
7982
targets = task$target_names
8083

8184
mlr3misc::invoke(
8285
obliqueRSF::ORSF,
83-
data = as.data.frame(task$data()),
86+
data = data.table::setDF(task$data()),
8487
time = targets[1L],
8588
status = targets[2L],
8689
.args = pv

R/learner_partykit_classif_cforest.R

+9-9
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,14 @@
66
#' @templateVar id classif.cforest
77
#' @templateVar caller cforest
88
#'
9-
#' @references
10-
#' Hothorn T, Zeileis A (2015).
11-
#' “partykit: A Modular Toolkit for Recursive Partytioning in R.”
12-
#' Journal of Machine Learning Research, 16(118), 3905-3909.
13-
#' \url{http://jmlr.org/papers/v16/hothorn15a.html}
9+
#' @section Custom mlr3 defaults:
10+
#' - `mtry`:
11+
#' - This hyperparameter can alternatively be set via the added hyperparameter `mtryratio`
12+
#' as `mtry = max(ceiling(mtryratio * n_features), 1)`.
13+
#' Note that `mtry` and `mtryratio` are mutually exclusive.
1414
#'
15-
#' Hothorn T, Hornik K, Zeileis A (2006).
16-
#' “Unbiased Recursive Partitioning: A Conditional Inference Framework.”
17-
#' Journal of Computational and Graphical Statistics, 15(3), 651–674.
18-
#' \doi{10.1198/106186006x133933}
15+
#' @references
16+
#' `r format_bib(c("hothorn_2015", "hothorn_2006"))`
1917
#'
2018
#' @export
2119
#' @template seealso_learner
@@ -37,6 +35,7 @@ LearnerClassifCForest = R6Class("LearnerClassifCForest",
3735
tags = "train"),
3836
mtry = p_int(lower = 0L, special_vals = list(Inf),
3937
tags = "train"), # default actually "ceiling(sqrt(nvar))"
38+
mtryratio = p_dbl(lower = 0, upper = 1, tags = "train"),
4039
applyfun = p_uty(tags = c("train", "importance")),
4140
cores = p_int(default = NULL, special_vals = list(NULL),
4241
tags = c("train", "importance")),
@@ -167,6 +166,7 @@ LearnerClassifCForest = R6Class("LearnerClassifCForest",
167166
.train = function(task) {
168167

169168
pars = self$param_set$get_values(tags = "train")
169+
pars = convert_ratio(pars, "mtry", "mtryratio", length(task$feature_names))
170170
pars_control = pars[which(names(pars) %in%
171171
setdiff(methods::formalArgs(partykit::ctree_control),
172172
c("mtry", "applyfun", "cores")

R/learner_partykit_regr_cforest.R

+5-9
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,10 @@
66
#' @templateVar id regr.cforest
77
#' @templateVar caller cforest
88
#'
9-
#' @references
10-
#' Hothorn T, Zeileis A (2015).
11-
#' “partykit: A Modular Toolkit for Recursive Partytioning in R.”
12-
#' Journal of Machine Learning Research, 16(118), 3905-3909.
13-
#' \url{http://jmlr.org/papers/v16/hothorn15a.html}
9+
#' @inheritSection mlr_learners_classif.cforest Custom mlr3 defaults
1410
#'
15-
#' Hothorn T, Hornik K, Zeileis A (2006).
16-
#' “Unbiased Recursive Partitioning: A Conditional Inference Framework.”
17-
#' Journal of Computational and Graphical Statistics, 15(3), 651–674.
18-
#' \doi{10.1198/106186006x133933}
11+
#' @references
12+
#' `r format_bib(c("hothorn_2015", "hothorn_2006"))`
1913
#'
2014
#' @export
2115
#' @template seealso_learner
@@ -37,6 +31,7 @@ LearnerRegrCForest = R6Class("LearnerRegrCForest",
3731
tags = "train"),
3832
mtry = p_int(lower = 0L, special_vals = list(Inf),
3933
tags = "train"), # default actually "ceiling(sqrt(nvar))"
34+
mtryratio = p_dbl(lower = 0, upper = 1, tags = "train"),
4035
applyfun = p_uty(tags = c("train", "importance")),
4136
cores = p_int(default = NULL, special_vals = list(NULL),
4237
tags = c("train", "importance")),
@@ -163,6 +158,7 @@ LearnerRegrCForest = R6Class("LearnerRegrCForest",
163158
.train = function(task) {
164159

165160
pars = self$param_set$get_values(tags = "train")
161+
pars = convert_ratio(pars, "mtry", "mtryratio", length(task$feature_names))
166162
pars_control = pars[which(names(pars) %in%
167163
setdiff(methods::formalArgs(partykit::ctree_control),
168164
c("mtry", "applyfun", "cores")

R/learner_partykit_surv_cforest.R

+5-9
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,10 @@
66
#' @templateVar id surv.cforest
77
#' @templateVar caller cforest
88
#'
9-
#' @references
10-
#' Hothorn T, Zeileis A (2015).
11-
#' “partykit: A Modular Toolkit for Recursive Partytioning in R.”
12-
#' Journal of Machine Learning Research, 16(118), 3905-3909.
13-
#' \url{http://jmlr.org/papers/v16/hothorn15a.html}
9+
#' @inheritSection mlr_learners_classif.cforest Custom mlr3 defaults
1410
#'
15-
#' Hothorn T, Hornik K, Zeileis A (2006).
16-
#' “Unbiased Recursive Partitioning: A Conditional Inference Framework.”
17-
#' Journal of Computational and Graphical Statistics, 15(3), 651–674.
18-
#' \doi{10.1198/106186006x133933}
11+
#' @references
12+
#' `r format_bib(c("hothorn_2015", "hothorn_2006"))`
1913
#'
2014
#' @export
2115
#' @template seealso_learner
@@ -34,6 +28,7 @@ LearnerSurvCForest = R6Class("LearnerSurvCForest",
3428
tags = c("train", "perturb")),
3529
mtry = p_int(lower = 0L, special_vals = list(Inf),
3630
tags = "train"), # default actually "ceiling(sqrt(nvar))"
31+
mtryratio = p_dbl(lower = 0, upper = 1, tags = "train"),
3732
applyfun = p_uty(tags = c("train", "importance")),
3833
cores = p_int(default = NULL, special_vals = list(NULL),
3934
tags = c("train", "importance")),
@@ -127,6 +122,7 @@ LearnerSurvCForest = R6Class("LearnerSurvCForest",
127122
.train = function(task) {
128123

129124
pars = self$param_set$get_values(tags = "train")
125+
pars = convert_ratio(pars, "mtry", "mtryratio", length(task$feature_names))
130126

131127
if ("weights" %in% task$properties) {
132128
pars$weights = task$weights$weight

R/learner_randomForestSRC_classif_rfsrc.R

+13-2
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,17 @@
1111
#' - Actual default: Auto-detecting the number of cores
1212
#' - Adjusted default: 1
1313
#' - Reason for change: Threading conflicts with explicit parallelization via \CRANpkg{future}.
14+
#' - `mtry`:
15+
#' - This hyperparameter can alternatively be set via the added hyperparameter `mtry.ratio`
16+
#' as `mtry = max(ceiling(mtry.ratio * n_features), 1)`.
17+
#' Note that `mtry` and `mtry.ratio` are mutually exclusive.
18+
#' - `sampsize`:
19+
#' - This hyperparameter can alternatively be set via the added hyperparameter `sampsize.ratio`
20+
#' as `sampsize = max(ceiling(sampsize.ratio * n_obs), 1)`.
21+
#' Note that `sampsize` and `sampsize.ratio` are mutually exclusive.
1422
#'
1523
#' @references
16-
#' Breiman L (2001). “Random Forests.”
17-
#' Machine Learning, 45(1), 5–32. ISSN 1573-0565, doi: 10.1023/A:1010933404324.
24+
#' `r format_bib("breiman_2001")`
1825
#'
1926
#' @template seealso_learner
2027
#' @template example
@@ -29,6 +36,7 @@ LearnerClassifRandomForestSRC = R6Class("LearnerClassifRandomForestSRC",
2936
ps = ps(
3037
ntree = p_int(default = 1000, lower = 1L, tags = c("train", "predict")),
3138
mtry = p_int(lower = 1L, tags = "train"),
39+
mtry.ratio = p_dbl(lower = 0, upper = 1, tags = "train"),
3240
nodesize = p_int(default = 15L, lower = 1L, tags = "train"),
3341
nodedepth = p_int(lower = 1L, tags = "train"),
3442
splitrule = p_fct(
@@ -52,6 +60,7 @@ LearnerClassifRandomForestSRC = R6Class("LearnerClassifRandomForestSRC",
5260
samp = p_uty(tags = "train"),
5361
membership = p_lgl(default = FALSE, tags = c("train", "predict")),
5462
sampsize = p_uty(tags = "train"),
63+
sampsize.ratio = p_dbl(0, 1, tags = "train"),
5564
na.action = p_fct(
5665
default = "na.omit", levels = c("na.omit", "na.impute"),
5766
tags = c("train", "predict")),
@@ -140,6 +149,8 @@ LearnerClassifRandomForestSRC = R6Class("LearnerClassifRandomForestSRC",
140149
private = list(
141150
.train = function(task) {
142151
pv = self$param_set$get_values(tags = "train")
152+
pv = convert_ratio(pv, "mtry", "mtry.ratio", length(task$feature_names))
153+
pv = convert_ratio(pv, "sampsize", "sampsize.ratio", task$nrow)
143154
cores = pv$cores %??% 1L
144155

145156
if ("weights" %in% task$properties) {

R/learner_randomForestSRC_regr_rfsrc.R

+6-7
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,10 @@
66
#' @templateVar id regr.rfsrc
77
#' @templateVar caller rfsrc
88
#'
9-
#' @section Custom mlr3 defaults:
10-
#' - `cores`:
11-
#' - Actual default: Auto-detecting the number of cores
12-
#' - Adjusted default: 1
13-
#' - Reason for change: Threading conflicts with explicit parallelization via \CRANpkg{future}.
9+
#' @inheritSection mlr_learners_classif.rfsrc Custom mlr3 defaults
1410
#'
1511
#' @references
16-
#' Breiman L (2001). “Random Forests.”
17-
#' Machine Learning, 45(1), 5–32. ISSN 1573-0565, \doi{10.1023/A:1010933404324}
12+
#' `r format_bib("breiman_2001")`
1813
#'
1914
#' @template seealso_learner
2015
#' @template example
@@ -29,6 +24,7 @@ LearnerRegrRandomForestSRC = R6Class("LearnerRegrRandomForestSRC",
2924
ps = ps(
3025
ntree = p_int(default = 1000, lower = 1L, tags = c("train", "predict")),
3126
mtry = p_int(lower = 1L, tags = "train"),
27+
mtry.ratio = p_dbl(lower = 0, upper = 1, tags = "train"),
3228
nodesize = p_int(default = 15L, lower = 1L, tags = "train"),
3329
nodedepth = p_int(lower = 1L, tags = "train"),
3430
splitrule = p_fct(
@@ -52,6 +48,7 @@ LearnerRegrRandomForestSRC = R6Class("LearnerRegrRandomForestSRC",
5248
samp = p_uty(tags = "train"),
5349
membership = p_lgl(default = FALSE, tags = c("train", "predict")),
5450
sampsize = p_uty(tags = "train"),
51+
sampsize.ratio = p_dbl(0, 1, tags = "train"),
5552
na.action = p_fct(
5653
default = "na.omit", levels = c("na.omit", "na.impute"),
5754
tags = c("train", "predict")),
@@ -137,6 +134,8 @@ LearnerRegrRandomForestSRC = R6Class("LearnerRegrRandomForestSRC",
137134
private = list(
138135
.train = function(task) {
139136
pv = self$param_set$get_values(tags = "train")
137+
pv = convert_ratio(pv, "mtry", "mtry.ratio", length(task$feature_names))
138+
pv = convert_ratio(pv, "sampsize", "sampsize.ratio", task$nrow)
140139
cores = pv$cores %??% 1L
141140

142141
if ("weights" %in% task$properties) {

0 commit comments

Comments
 (0)