1
+ library(mlr )
2
+ library(OpenML )
3
# Get the tasks for the experiment.
tl = listOMLTasks()

# Filter the task listing down to small, complete data sets
# (4 to 14 features, fewer than 100 instances, no missing values).

# Classification tasks.
class.tasks = subset(tl, task_type == "Supervised Classification" &
  NumberOfFeatures < 15 & NumberOfFeatures > 3 &
  NumberOfInstances < 100 & NumberOfMissingValues == 0)
# Show which estimation procedures the selected tasks use.
class.tasks$estimation_procedure

# Binary classification: exactly two classes.
bin.ids = subset(class.tasks, NumberOfClasses == 2L)$task_id
# Multiclass classification: three or four classes.
multiclass.ids = subset(class.tasks,
  NumberOfClasses > 2L & NumberOfClasses < 5L)$task_id
# Regression tasks, same size/completeness filter as above.
regr.ids = subset(tl, task_type == "Supervised Regression" &
  NumberOfFeatures < 15 & NumberOfFeatures > 3 &
  NumberOfInstances < 100 & NumberOfMissingValues == 0)$task_id

# Candidate task ids, one entry per task type.
ids = list(bin = bin.ids, multiclass = multiclass.ids, regr = regr.ids)

# Draw `cases` task ids from each task type.
cases = 1L
set.seed(getOption("mlr.debug.seed"))
ids = lapply(ids, function(candidates) {
  if (length(candidates) < cases) {
    stop("Not enough candidate tasks to draw ", cases, " id(s).",
      call. = FALSE)
  }
  # Sample positions rather than values: sample(x, size) interprets a single
  # number x as 1:x, which would return an arbitrary id whenever exactly one
  # candidate task is left after filtering.
  candidates[sample.int(length(candidates), size = cases)]
})
24
# Get the task object for every sampled id (one list entry per task type).
tsks = lapply(ids, getOMLTask)

# Convert each OpenML task to mlr. From every conversion result keep the
# "mlr.task" element (the task) and the "mlr.rin" element (the resample
# instance) in two parallel lists that share the names of `ids`.
tsks.mlr.info = lapply(tsks, convertOMLTaskToMlr)
tsks.mlr = lapply(tsks.mlr.info, "[[", "mlr.task")
rins = lapply(tsks.mlr.info, "[[", "mlr.rin")
30
+
31
# Create the learners: one argument list for makeLearner() per task type.
# Both classification learners predict probabilities.
lrns = list(
  bin = list(cl = "classif.rpart", predict.type = "prob"),
  multiclass = list("classif.rpart", predict.type = "prob"),
  regr = list("regr.rpart")
)
# Each list element becomes the `args` of do.call(), i.e. makeLearner(<args>).
lrns = lapply(lrns, do.call, what = "makeLearner")

# Upload each learner and keep the returned implementation id.
impl.ids = sapply(lrns, uploadOMLFlow, simplify = FALSE)

# Run every task with its matching learner (paired by position) and upload
# the resulting runs together with the implementation ids.
set.seed(getOption("mlr.debug.seed"))
ran.tsks = mapply(runTaskMlr, tsks, lrns, SIMPLIFY = FALSE)
run.ids = mapply(uploadOMLRun, ran.tsks, impl.ids)

# Snapshot the workspace so the comparison below can be resumed later.
save.image("datasetup.RData")
44
##########################################################################
# Wait a bit before executing further: it takes some time for the
# measures to become available.
runs = sapply(run.ids, getOMLRun, simplify = FALSE, verbosity = 0L)

# Collect the calculated measures of every run: the "name" and "value"
# columns of its evaluations table. The result is an unnamed list in the
# order of `runs`; lapply() also copes with an empty `runs`, which the
# former `1:length(runs)` loop did not.
oml.measures = lapply(unname(runs), function(run) {
  run$output.data$evaluations[, c("name", "value")]
})
53
# The mlr measures to compute, one list per task type.
cv.measures = list(
  bin = list(acc, auc, tpr, f1, ppv),
  multiclass = list(acc, ber),
  regr = list(mae, rmse)
)

# Resample each learner on its task with the matching resample instance and
# measures (all four lists are paired by position).
cv.res = mapply(resample, lrns, tsks.mlr, rins, cv.measures,
  MoreArgs = list(show.info = FALSE), SIMPLIFY = FALSE)

# We need the mean weighted by fold size; per the original author's note the
# aggregate reported by mlr averages over folds regardless of their size.
# TODO: check whether mlr already provides a size-weighted aggregation.

# Number of test observations in each fold, per task type.
fold.sizes = lapply(rins, function(rin) {
  vapply(rin[["test.inds"]], length, integer(1L))
})

# Fold-size-weighted mean of every per-fold test measure. The first column
# of `measures.test` is dropped before averaging, exactly as before. Unlike
# the previous apply()-based version the result holds only the measures and
# no longer a spurious "fold.sizes" entry (the weights averaged with
# themselves). Map()/vapply() keep the return shape stable, whereas
# sapply()/mapply() with default simplification could collapse the per-task
# tables into a matrix when they happen to have equal dimensions.
mlr.measures = Map(function(res, sizes) {
  per.fold = res[["measures.test"]][-1L]
  vapply(per.fold, weighted.mean, numeric(1L), w = sizes)
}, cv.res, fold.sizes)
mlr.measures = lapply(mlr.measures, round, digits = 6L)
80
# Put the mlr and OpenML values side by side to compare them.
# The tables are built column-wise so that `mlr.val` stays numeric; filling
# rows with c(label, number, value) coerced the whole column to character.
# NOTE(review): the OpenML values are picked by row position in the
# evaluations table (column 2 is "value"); this assumes a fixed row order
# in the evaluations table -- confirm, or match on the "name" column instead.

# Binary classification (first task type).
measures.bin = data.frame(
  measure = c("accuracy", "auc", "recall", "fmeasure", "precision"),
  mlr.val = unname(mlr.measures[[1L]][c("acc", "auc", "tpr", "f1", "ppv")]),
  oml.val = as.character(oml.measures[[1L]][c(11L, 1L, 13L, 4L, 10L), 2L]),
  stringsAsFactors = FALSE
)

# Regression (third task type).
measures.regr = data.frame(
  measure = c("mae", "rmse"),
  mlr.val = unname(mlr.measures[[3L]][c("mae", "rmse")]),
  oml.val = as.character(oml.measures[[3L]][c(1L, 5L), 2L]),
  stringsAsFactors = FALSE
)

# See what we got.
print(measures.bin)
print(measures.regr)
104
+
105
# First observation:
# In OpenML, a measure that refers to a specific class is calculated once
# per class and the per-class values are then averaged using the class
# sizes as weights.
# Example: reproduce the recall value shown in measures.bin.
conf.mat = getConfMatrix(cv.res[[1L]]$pred)

# Only the cells of the first two rows and columns are used.
# Class weight = row total of each of the two classes.
class.weights = unname(conf.mat[1:2, 1L] + conf.mat[1:2, 2L])

# Per-class recall: diagonal cell divided by the class's row total.
per.class.recall = c(conf.mat[1L, 1L], conf.mat[2L, 2L]) / class.weights
recall1 = per.class.recall[1L]
recall2 = per.class.recall[2L]

# Class-size-weighted average of the two recalls.
mean.weighted.recall = weighted.mean(per.class.recall, w = class.weights)
mean.weighted.recall
0 commit comments