Skip to content

Commit b39f02d

Browse files
committed
Merge pull request #575 from florianfendt/openmlcomp
#372 openml mlr measures
2 parents 4b9b147 + a1cb95d commit b39f02d

File tree

1 file changed

+115
-0
lines changed

1 file changed

+115
-0
lines changed

todo-files/oml_mlr_measures.R

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
library(mlr)
2+
library(OpenML)
3+
#get the tasks for the experiment
4+
tl = listOMLTasks()
5+
# filter data sets and get appropriate data set IDs:
6+
# classification
7+
class.tasks = subset(tl, task_type == "Supervised Classification" & NumberOfFeatures < 15 &
8+
NumberOfFeatures > 3 & NumberOfInstances < 100 & NumberOfMissingValues == 0)
9+
class.tasks$estimation_procedure
10+
#binaryclass
11+
bin.ids = subset(class.tasks, NumberOfClasses == 2L)$task_id
12+
#multiclass
13+
multiclass.ids = subset(class.tasks, NumberOfClasses > 2L & NumberOfClasses < 5L)$task_id
14+
#regression
15+
regr.ids = subset(tl, task_type == "Supervised Regression" &
16+
NumberOfFeatures < 15 & NumberOfFeatures > 3 & NumberOfInstances < 100 &
17+
NumberOfMissingValues == 0)$task_id
18+
#a list of task indices for each type
19+
ids = list(bin = bin.ids, multiclass = multiclass.ids, regr = regr.ids)
20+
#draw from each task type
21+
cases = 1L
22+
set.seed(getOption("mlr.debug.seed"))
23+
# Draw `cases` task ids per task type.
# Index via sample.int() instead of calling sample() on the ids directly:
# sample(x, size) treats a length-one numeric x as "sample from 1:x", so a
# type with exactly one matching task would yield a random, unrelated id.
# For vectors of length > 1 this draws exactly what sample(x, size) would.
ids = lapply(ids, function(x) x[sample.int(length(x), size = cases)])
24+
#get the tasks
25+
tsks = lapply(ids, getOMLTask)
26+
#convert them to mlr and get the task desc and the resample instances
27+
tsks.mlr.info = sapply(tsks, convertOMLTaskToMlr, simplify = FALSE)
28+
tsks.mlr = sapply(tsks.mlr.info, "[[", "mlr.task", simplify = FALSE)
29+
rins = sapply(tsks.mlr.info, "[[", "mlr.rin", simplify = FALSE)
30+
31+
##create the learners
32+
lrns = list(bin = list(cl = "classif.rpart", predict.type = "prob"),
33+
multiclass = list("classif.rpart", predict.type = "prob"),
34+
regr = list("regr.rpart"))
35+
lrns = lapply(lrns, do.call, what = "makeLearner")
36+
##upload learner and get implementation id
37+
impl.ids = sapply(lrns, uploadOMLFlow, simplify = FALSE)
38+
##upload and get OML run
39+
set.seed(getOption("mlr.debug.seed"))
40+
#run the tasks
41+
ran.tsks = mapply(runTaskMlr, tsks, lrns, SIMPLIFY = FALSE)
42+
run.ids = mapply(uploadOMLRun, ran.tsks, impl.ids)
43+
save.image("datasetup.RData")
44+
#########################################################################
45+
#wait a bit before executing further, it takes some time
46+
#for the measures to be available
47+
runs = sapply(run.ids, getOMLRun, simplify = FALSE, verbosity = 0L)
48+
#get all the calculated measures and store in list:
49+
# One entry per run, holding the "name" and "value" columns of the
# evaluations returned by the server. The list is preallocated and iterated
# with seq_along() so that an empty `runs` yields an empty list instead of
# looping over c(1, 0) and failing on runs[[1]].
oml.measures = vector("list", length(runs))
for (i in seq_along(runs)) {
  oml.measures[[i]] = runs[[i]]$output.data$evaluations[, c("name", "value")]
}
53+
#we need lists for the mlr-measures we want
54+
cv.measures = list(bin = list(acc, auc, tpr, f1, ppv),
55+
multiclass = list(acc, ber),
56+
regr = list(mae, rmse))
57+
#lets resample
58+
cv.res = mapply(resample, lrns, tsks.mlr, rins, cv.measures,
59+
show.info = FALSE, SIMPLIFY = FALSE)
60+
#We need the weighted mean, mlr$aggr is averaged over folds regardless of size.
61+
#Bernd, what's the name of the function, that should be doing that for me?
62+
#I wasn't able to find it
63+
#So I need the number of obs in each fold
64+
fold.sizes = lapply(rins, "[[", "test.inds")
65+
fold.sizes = lapply(fold.sizes, FUN = function(x) {
66+
lapply(x, length)
67+
})
68+
fold.sizes = lapply(fold.sizes, unlist)
69+
#then calculate the measures
70+
# Per-fold test measures of every resample result, kept as a list.
# lapply() is used instead of sapply(): sapply() would collapse the data
# frames into a matrix of cells whenever all tasks happen to have the same
# number of measure columns.
mlr.measures = lapply(cv.res, "[[", "measures.test")
# Weighted mean of each measure over the folds, weighted by test-fold size.
# The first column is dropped positionally, as before (presumably the fold
# index "iter" of mlr's measures.test -- confirm). Only measure columns are
# averaged; the fold sizes are passed as weights rather than being appended
# as an extra column and averaged along with the measures.
mlr.measures = Map(function(x, w) {
  x = x[, -1L, drop = FALSE]
  vapply(x, weighted.mean, numeric(1L), w = w)
}, mlr.measures, fold.sizes)
79+
mlr.measures = lapply(mlr.measures, round, digits = 6L)
80+
#ok now lets put this together to compare stuff:
81+
#binary
82+
# Side-by-side table of mlr and OpenML values for the binary task.
# Built in a single data.frame() call so that mlr.val stays numeric: the
# previous row-wise fill used c(...) for rows 2-5, which coerced every entry
# (and with it the whole mlr.val column) to character.
# NOTE(review): the OpenML rows are picked by hard-coded position
# (11, 1, 13, 4, 10) in the evaluations table; this relies on the row order
# returned by the server -- verify against oml.measures[[1L]]$name.
measures.bin = data.frame(
  measure = c("accuracy", "auc", "recall", "fmeasure", "precision"),
  mlr.val = unname(mlr.measures[[1L]][c("acc", "auc", "tpr", "f1", "ppv")]),
  oml.val = as.character(oml.measures[[1L]][c(11L, 1L, 13L, 4L, 10L), 2L]),
  stringsAsFactors = FALSE)
94+
#regr
95+
# Side-by-side table of mlr and OpenML values for the regression task.
# Built in a single data.frame() call so that mlr.val stays numeric: the
# previous row-wise fill used c(...) for row 2, which coerced the mlr.val
# column to character.
# NOTE(review): the OpenML rows are picked by hard-coded position (1, 5) in
# the evaluations table; this relies on the row order returned by the
# server -- verify against oml.measures[[3L]]$name.
measures.regr = data.frame(
  measure = c("mae", "rmse"),
  mlr.val = unname(mlr.measures[[3L]][c("mae", "rmse")]),
  oml.val = as.character(oml.measures[[3L]][c(1L, 5L), 2L]),
  stringsAsFactors = FALSE)
101+
#ok, see what we got here
102+
measures.bin
103+
measures.regr
104+
105+
#Firstly:
106+
#In OpenML measures for a specific class are calculated for each class
107+
#and then averaged with classweights
108+
#For example: Reproduce recall in measures.bin:
109+
conf.mat = getConfMatrix(cv.res[[1L]]$pred)
110+
class.weights = c(conf.mat[1L, 1L] + conf.mat[1L, 2L],
111+
conf.mat[2L, 1L] + conf.mat[2L, 2L])
112+
recall1 = conf.mat[1L, 1L] / class.weights[1L]
113+
recall2 = conf.mat[2L, 2L] / class.weights[2L]
114+
mean.weighted.recall = weighted.mean(c(recall1, recall2), w = class.weights)
115+
mean.weighted.recall

0 commit comments

Comments
 (0)