@@ -15,35 +15,39 @@
 
 package com.amazon.randomcutforest;
 
-import com.amazon.randomcutforest.testutils.ShingledMultiDimDataWithKeys;
+import java.util.Arrays;
+import java.util.concurrent.ForkJoinPool;
+
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 
-import java.util.Arrays;
-import java.util.concurrent.ForkJoinPool;
+import com.amazon.randomcutforest.testutils.ShingledMultiDimDataWithKeys;
 
 /**
- * The following "test" is intended to provide an approximate estimate of the improvement
- * from parallelization. At the outset, we remark that running the test from inside
- * an IDE/environment may reflect more of the environment. Issues such as warming are not
- * reflected in this test.
+ * The following "test" is intended to provide an approximate estimate of the
+ * improvement from parallelization. At the outset, we remark that running the
+ * test from inside an IDE/environment may reflect more of the environment.
+ * Issues such as warming are not reflected in this test.
  *
- * Users who wish to obtain more calibrated estimates should use a benchmark -- preferably
- * using their own "typical" data and their end to end setup. Performance of RCF is data dependent.
- * Such users may be invoking RCF functions differently from a standard "impute, score, update"
- * process recommended for streaming time series data.
+ * Users who wish to obtain more calibrated estimates should use a benchmark --
+ * preferably using their own "typical" data and their end-to-end setup.
+ * Performance of RCF is data dependent. Such users may be invoking RCF
+ * functions differently from a standard "impute, score, update" process
+ * recommended for streaming time series data.
  *
- * Moreover, in the context of a large number of models, the rate at which the models require
- * updates is also a factor and not controlled herein.
+ * Moreover, in the context of a large number of models, the rate at which the
+ * models require updates is also a factor and is not controlled herein.
  *
- * The two tests should produce near identical sum of scores, and (root) mean squared error of
- * the impute up to machine precision (since the order of the arithmetic operations would vary).
+ * The two tests should produce near-identical sums of scores and (root) mean
+ * squared errors of the impute, up to machine precision (since the order of
+ * the arithmetic operations would vary).
  *
- * To summarize the lessons, it appears that parallelism almost always helps (upto resource limitations).
- * If an user is considering a single model -- say from a console or dashboard, they should consider
- * having parallel threads enabled. For large number of models, it may be worthwhile
- * to also investigate different ways of achieving parallelism and not just attempt to
- * change the executor framework.
+ * To summarize the lessons, it appears that parallelism almost always helps
+ * (up to resource limitations). If a user is considering a single model --
+ * say from a console or dashboard -- they should consider having parallel
+ * threads enabled. For a large number of models, it may be worthwhile to also
+ * investigate different ways of achieving parallelism and not just attempt to
+ * change the executor framework.
  *
 */
 
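
The single-model advice in the comment above maps to two builder options that the sync test below exercises: `parallelExecutionEnabled` and `threadPoolSize`, which parallelize work within one forest. A minimal sketch of a lone forest scoring with parallel threads follows; the class name, parameter values, and synthetic signal are illustrative assumptions, not this test's configuration:

```java
import java.util.Random;

import com.amazon.randomcutforest.RandomCutForest;

public class SingleForestParallelSketch {
    public static void main(String[] args) {
        int shingleSize = 8;
        int numberOfAttributes = 1;
        // With internal shingling, dimensions = shingleSize * numberOfAttributes,
        // and getAnomalyScore()/update() take the raw (unshingled) point.
        RandomCutForest forest = RandomCutForest.builder().numberOfTrees(50)
                .dimensions(shingleSize * numberOfAttributes).shingleSize(shingleSize)
                .internalShinglingEnabled(true).parallelExecutionEnabled(true).threadPoolSize(4)
                .randomSeed(42).build();

        Random rng = new Random(0);
        for (int i = 0; i < 1000; i++) {
            // Noisy sine wave standing in for a streaming metric.
            double[] point = { Math.sin(i / 20.0) + 0.05 * rng.nextGaussian() };
            double score = forest.getAnomalyScore(point); // 0 until the sampler has seen enough points
            if (i % 100 == 0) {
                System.out.println("i = " + i + ", score = " + score);
            }
            forest.update(point);
        }
    }
}
```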
@@ -65,23 +69,23 @@ public class CPUTest {
 
     @Test
     public void profileTestSync() {
-        double[] mse = new double[numberOfForests];
-        int[] mseCount = new int[numberOfForests];
-        double[] score =new double[numberOfForests];
-
-        double[][] data = ShingledMultiDimDataWithKeys.getMultiDimData(DATA_SIZE, 60, 100, 5, 0, numberOfAttributes).data;
-
-        RandomCutForest[] forests = new RandomCutForest[numberOfForests];
-        for (int k = 0; k < numberOfForests; k++) {
-            forests[k] = RandomCutForest.builder().numberOfTrees(numberOfTrees).dimensions(dimensions).shingleSize(shingleSize)
-                    .boundingBoxCacheFraction(boundingBoxCacheFraction).randomSeed(99 + k).outputAfter(10)
-                    .parallelExecutionEnabled(true)
-                    .threadPoolSize(numberOfThreads)
+        double[] mse = new double[numberOfForests];
+        int[] mseCount = new int[numberOfForests];
+        double[] score = new double[numberOfForests];
+
+        double[][] data = ShingledMultiDimDataWithKeys.getMultiDimData(DATA_SIZE, 60, 100, 5, 0,
+                numberOfAttributes).data;
+
+        RandomCutForest[] forests = new RandomCutForest[numberOfForests];
+        for (int k = 0; k < numberOfForests; k++) {
+            forests[k] = RandomCutForest.builder().numberOfTrees(numberOfTrees).dimensions(dimensions)
+                    .shingleSize(shingleSize).boundingBoxCacheFraction(boundingBoxCacheFraction).randomSeed(99 + k)
+                    .outputAfter(10).parallelExecutionEnabled(true).threadPoolSize(numberOfThreads)
                     .internalShinglingEnabled(true).initialAcceptFraction(0.1).sampleSize(sampleSize).build();
         }
 
         for (int j = 0; j < data.length; j++) {
-            for (int k = 0;k < numberOfForests; k++) {
+            for (int k = 0; k < numberOfForests; k++) {
                 score[k] += forests[k].getAnomalyScore(data[j]);
                 if (j % 10 == 0 && j > 0) {
                     double[] result = forests[k].extrapolate(1);
@@ -97,7 +101,7 @@ public void profileTestSync() {
                 forests[k].update(data[j]);
             }
         }
-        for (int k = 0; k < numberOfForests;k++) {
+        for (int k = 0; k < numberOfForests; k++) {
             System.out.println(" Forest " + k);
             System.out.println(" MSE " + mse[k] / mseCount[k]);
             System.out.println(" scoresum " + score[k] / data.length);
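
The MSE accumulation between `extrapolate(1)` and `update` (old lines 88-96) sits outside the diff context above. A hedged reconstruction of the per-forest "impute, score, update" cycle the class comment describes -- this helper and the choice of `data[j]` as the comparison target are assumptions, not code from this commit:

```java
// Hypothetical helper in the spirit of profileTestSync: one forest's pass over
// the data. extrapolate(1) runs before update(data[j]), so it forecasts a
// point the forest has not ingested yet; its squared deviation from data[j]
// is the kind of error the test aggregates into mse/mseCount.
static double[] scoreImputeUpdate(RandomCutForest forest, double[][] data) {
    double scoreSum = 0;
    double squaredError = 0;
    int errorCount = 0;
    for (int j = 0; j < data.length; j++) {
        scoreSum += forest.getAnomalyScore(data[j]);
        if (j % 10 == 0 && j > 0) {
            double[] forecast = forest.extrapolate(1); // one input block ahead
            for (int a = 0; a < forecast.length; a++) {
                double error = forecast[a] - data[j][a];
                squaredError += error * error;
                errorCount++;
            }
        }
        forest.update(data[j]);
    }
    // { mean anomaly score, mean squared error of the impute }
    return new double[] { scoreSum / data.length, squaredError / errorCount };
}
```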
@@ -106,30 +110,30 @@ public void profileTestSync() {
 
     @Test
     public void profileTestASync() {
-        double[] mse = new double[numberOfForests];
-        int[] mseCount = new int[numberOfForests];
-        double[] score =new double[numberOfForests];
-
-        double[][] data = ShingledMultiDimDataWithKeys.getMultiDimData(DATA_SIZE, 60, 100, 5, 0, numberOfAttributes).data;
-
-        RandomCutForest[] forests = new RandomCutForest[numberOfForests];
-        for (int k = 0;k <numberOfForests; k++) {
-            forests[k] = RandomCutForest.builder().numberOfTrees(numberOfTrees).dimensions(dimensions).shingleSize(shingleSize)
-                    .boundingBoxCacheFraction(boundingBoxCacheFraction).randomSeed(99 +k).outputAfter(10)
-                    .parallelExecutionEnabled(false)
-                    .internalShinglingEnabled(true).initialAcceptFraction(0.1).sampleSize(sampleSize).build();
+        double[] mse = new double[numberOfForests];
+        int[] mseCount = new int[numberOfForests];
+        double[] score = new double[numberOfForests];
+
+        double[][] data = ShingledMultiDimDataWithKeys.getMultiDimData(DATA_SIZE, 60, 100, 5, 0,
+                numberOfAttributes).data;
+
+        RandomCutForest[] forests = new RandomCutForest[numberOfForests];
+        for (int k = 0; k < numberOfForests; k++) {
+            forests[k] = RandomCutForest.builder().numberOfTrees(numberOfTrees).dimensions(dimensions)
+                    .shingleSize(shingleSize).boundingBoxCacheFraction(boundingBoxCacheFraction).randomSeed(99 + k)
+                    .outputAfter(10).parallelExecutionEnabled(false).internalShinglingEnabled(true)
+                    .initialAcceptFraction(0.1).sampleSize(sampleSize).build();
         }
 
         ForkJoinPool forkJoinPool = new ForkJoinPool(numberOfThreads);
-        int[] indices = new int[numberOfForests];
-        for (int k = 0; k < numberOfForests;k++){
+        int[] indices = new int[numberOfForests];
+        for (int k = 0; k < numberOfForests; k++) {
             indices[k] = k;
         }
 
         for (int j = 0; j < data.length; j++) {
-            int finalJ =j;
-            forkJoinPool.submit(() ->
-                    Arrays.stream(indices).parallel().forEach(k -> {
+            int finalJ = j;
+            forkJoinPool.submit(() -> Arrays.stream(indices).parallel().forEach(k -> {
                 score[k] += forests[k].getAnomalyScore(data[finalJ]);
                 if (finalJ % 10 == 0 && finalJ > 0) {
                     double[] result = forests[k].extrapolate(1);
@@ -145,7 +149,7 @@ public void profileTestASync() {
                 forests[k].update(data[finalJ]);
             })).join();
         }
-        for (int k = 0; k < numberOfForests;k++) {
+        for (int k = 0; k < numberOfForests; k++) {
             System.out.println(" Forest " + k);
             System.out.println(" MSE " + mse[k] / mseCount[k]);
             System.out.println(" scoresum " + score[k] / data.length);
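
The async variant turns per-forest parallelism off (`parallelExecutionEnabled(false)`) and parallelizes across forests instead, relying on the fact that a parallel stream submitted from inside a `ForkJoinPool` task runs in that pool rather than in `ForkJoinPool.commonPool()`. That routing is a long-standing implementation detail of parallel streams, not a documented guarantee. A self-contained sketch of the idiom, with an illustrative pool size and loop body:

```java
import java.util.Arrays;
import java.util.concurrent.ForkJoinPool;

public class CustomPoolParallelStreamSketch {
    public static void main(String[] args) {
        // Dedicated pool: its parallelism, not the common pool's, bounds the workers.
        ForkJoinPool pool = new ForkJoinPool(4);

        int[] indices = new int[8];
        for (int k = 0; k < indices.length; k++) {
            indices[k] = k;
        }

        // submit(...) returns a ForkJoinTask; join() blocks until the whole
        // parallel forEach, running inside 'pool', has finished.
        pool.submit(() -> Arrays.stream(indices).parallel().forEach(
                k -> System.out.println("index " + k + " on " + Thread.currentThread().getName())))
                .join();

        pool.shutdown();
    }
}
```

Without the surrounding `submit(...).join()`, the same `parallel()` stream would run in the common pool and the `numberOfThreads` setting would have no effect on it.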