999_run_randomforest_experiment.R
# Required packages
library(randomForest)
library(data.table)
library(parallel)

adjust_R2 = function(R2, n, k){
  # Convert R2 to adjusted R2
  # R2: R-squared accuracy of the model
  # n: Number of samples in the training set
  # k: Number of predictors in the model
  adjusted_R2 = 1 - (1 - R2)*(n - 1)/(n - k - 1)
  return(adjusted_R2)
}
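# Illustrative check with hypothetical numbers (not from the analysis):
# adjust_R2(0.75, n = 100, k = 10)
# #> [1] 0.7219101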
RF_CV = function(input, output, CV, train_fraction, n_tree){
  # Takes an input data.table/data.frame and runs a single random forest cross-validation run
  # CV: index of the cross-validation run; also used as the random seed
  print(CV)
  # Convert input and output to data.table / numeric vector
  input = as.data.table(input)
  output = as.numeric(output)
  # Split data into training and testing sets
  set.seed(CV)
  train_rows = sample(nrow(input), round(nrow(input)*train_fraction))
  test_rows = setdiff(seq(nrow(input)), train_rows)
  # Train the random forest, evaluating on the held-out test set
  rf = randomForest(x = input[train_rows], y = output[train_rows], keep.forest = T,
                    xtest = input[test_rows], ytest = output[test_rows], ntree = n_tree,
                    do.trace = F, importance = T)
  # Test-set R2: squared correlation between predictions and observed values
  test_R2 = (cor(rf$test$predicted, output[test_rows]))^2
  # Convert to adjusted R2
  adj_R2 = adjust_R2(test_R2, nrow(input), ncol(input))
  # Return adjusted R2 followed by the permutation importance (%IncMSE) of each predictor
  return(c(adj_R2, importance(rf, type = 1)))
}
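# Illustrative single run on stand-in data (the built-in mtcars set; hypothetical, not part
# of the original analysis). RF_CV is normally driven by run_RF_and_get_pred_importances below:
# RF_CV(mtcars[, -1], mtcars$mpg, CV = 1, train_fraction = 0.8, n_tree = 100)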
run_RF_and_get_pred_importances = function(input, output, n_cv, n_tree=250, cores=8, train_split=0.8){
  # Runs a random forest regression experiment using Monte Carlo cross-validation n_cv times.
  # Returns a data.table of test R2 values and predictor importances.
  # input: data.frame or data.table of predictors. Rows are observations, columns are predictors.
  # output: numeric vector of the response, with length equal to nrow(input).
  # n_cv: number of Monte Carlo cross-validation runs to perform.
  # n_tree: number of trees per forest.
  # cores: number of CPU cores to use for parallelization.
  # train_split: fraction of the data assigned to the training set in each run.
  # Returns a data.table where each row is a cross-validation run, the first column is the
  # adjusted test R2 for that run, and the remaining columns are the predictor importances.
  # Run a random forest for each CV run and bind the results into one table (one row per run);
  # each run prints its CV index as crude progress output.
  RF_results = as.data.table(do.call("rbind.data.frame",
                                     mclapply(seq(n_cv),
                                              function(i) RF_CV(input, output, i, train_split, n_tree),
                                              mc.cores = cores)))
  colnames(RF_results) = c("Test_adj_R2", colnames(input))
  return(RF_results)
}
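# Minimal usage sketch. Assumptions: the built-in mtcars data stands in for the real
# predictors/response, and the settings below are illustrative, not from the original analysis.
# Wrapped in if (FALSE) so sourcing this script still only defines the functions above.
if (FALSE) {
  results = run_RF_and_get_pred_importances(input = mtcars[, -1],   # predictors
                                            output = mtcars$mpg,    # response
                                            n_cv = 20,              # 20 Monte Carlo CV runs
                                            cores = 2,              # mclapply forks; use 1 on Windows
                                            train_split = 0.8)
  summary(results$Test_adj_R2)   # spread of adjusted test R2 across runs
  colMeans(results)[-1]          # average %IncMSE importance per predictor
}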