Skip to content

Commit 068d09d

Browse files
authored
ungrading thresholding and fixing RCFCaster errors for initial values (#385)
* ungrading thresholding and fixing RCFCaster errors for initial values * optimizations * tweaks and comments * changes * cleanup and optimizations * fixes based on comments * fixes and changes * off by one correction
1 parent bd3b056 commit 068d09d

29 files changed

+647
-546
lines changed

Java/core/src/main/java/com/amazon/randomcutforest/state/Version.java

+1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ public class Version {
2020
public static final String V2_1 = "2.1";
2121
public static final String V3_0 = "3.0";
2222
public static final String V3_5 = "3.5";
23+
public static final String V3_7 = "3.7";
2324
}

Java/examples/src/main/java/com/amazon/randomcutforest/examples/parkservices/LowNoisePeriodic.java

+24-15
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ public String description() {
4444
public void run() throws Exception {
4545
// Create and populate a random cut forest
4646

47-
int shingleSize = 4;
47+
int shingleSize = 8;
4848
int numberOfTrees = 50;
4949
int sampleSize = 256;
5050
int dataSize = 100000;
@@ -53,17 +53,21 @@ public void run() throws Exception {
5353
double[] reference = new double[] { 1.0f, 3.0f, 5.0f, 7.0f, 9.0f, 11.0f, 9.5f, 8.5f, 7.5f, 6.5f, 6.0f, 6.5f,
5454
7.0f, 7.5f, 9.5f, 11.0f, 12.5f, 10.5f, 8.5f, 7.0f, 5.0f, 3.0f, 2.0f, 1.0f };
5555

56-
// change this to control the percent deviation
57-
// NOTE that if the noise is smaller than 0.003 times the actual value then
58-
// it would be difficult to detect the anomalies unless the slope is 0
59-
60-
double noise = 2.0;
56+
// the noise should leave sufficient gap between the consecutive levels
57+
double noise = 0.25;
58+
// the noise will be amplified by something within [factorRange, 2*factorRange]
59+
// increase should lead to increased precision--recall; likewise decrease must
60+
// also
61+
// lead to decreased precision recall; if the factor = 1, then the anomalies are
62+
// information theoretically almost non-existent
63+
double anomalyFactor = 10;
6164

6265
double slope = 0.2 * sampleSize
6366
* (Arrays.stream(reference).max().getAsDouble() - Arrays.stream(reference).min().getAsDouble()) / 50000;
6467

65-
// to analyse without linear shift
66-
// slope = 0;
68+
// to analyse without linear shift; comment out the line below and change the
69+
// slope above as desired
70+
slope = 0;
6771

6872
double anomalyRate = 0.005;
6973
long seed = new Random().nextLong();
@@ -74,12 +78,14 @@ public void run() throws Exception {
7478
int correct = 0;
7579
int late = 0;
7680

77-
// change the transformation below to experiment
81+
// change the transformation below to experiment;
82+
// if slope != 0 then NONE will have poor result
83+
// both of the difference operations also introduce many errors
7884
TransformMethod method = TransformMethod.NORMALIZE;
7985

8086
int dimensions = shingleSize;
81-
ThresholdedRandomCutForest forest = ThresholdedRandomCutForest.builder().compact(true).dimensions(dimensions)
82-
.randomSeed(0).numberOfTrees(numberOfTrees).shingleSize(shingleSize).sampleSize(sampleSize)
87+
ThresholdedRandomCutForest forest = ThresholdedRandomCutForest.builder().dimensions(dimensions).randomSeed(0)
88+
.numberOfTrees(numberOfTrees).shingleSize(shingleSize).sampleSize(sampleSize)
8389
.internalShinglingEnabled(true).anomalyRate(0.01).forestMode(ForestMode.STANDARD).startNormalization(32)
8490
.transformMethod(method).outputAfter(32).initialAcceptFraction(0.125)
8591
// for 1D data weights should not alter results significantly (if in reasonable
@@ -99,7 +105,7 @@ public void run() throws Exception {
99105
// missed current value 3.0 (say X), intended 1.0 (equiv., X - noise), because
100106
// the shift up in the actual was not 2*noise
101107

102-
// forest.setIgnoreNearExpectedFromAbove( new double [] {2 * noise});
108+
// forest.setIgnoreNearExpectedFromAbove( new double [] {2*noise});
103109

104110
// or to suppress all anomalies that are shifted up from predicted
105111
// for any sequence; using Double.MAX_VALUE may cause overflow
@@ -110,7 +116,7 @@ public void run() throws Exception {
110116
// the shift down in the actual was not 2*noise, in effect we suppress all
111117
// anomalies
112118

113-
// forest.setIgnoreNearExpectedFromBelow(new double [] {2*noise});
119+
// forest.setIgnoreNearExpectedFromBelow(new double [] {noise*2});
114120

115121
// the following suppresses all anomalies that shifted down compared to
116122
// predicted
@@ -126,12 +132,15 @@ public void run() throws Exception {
126132
boolean anomaly = false;
127133

128134
double intendedValue = reference[(count + 4) % reference.length] + slope * count;
129-
// extremely periodic signal
135+
// extremely periodic signal -- note that there is no periodicity detection
130136
value[0] = intendedValue;
131137
if (rng.nextDouble() < anomalyRate && count > initialSegment) {
132-
value[0] += (rng.nextDouble() < 0.5) ? -noise : noise;
138+
double anomalyValue = noise * anomalyFactor * (1 + rng.nextDouble());
139+
value[0] += (rng.nextDouble() < 0.5) ? -anomalyValue : anomalyValue;
133140
anomaly = true;
134141
++numAnomalies;
142+
} else {
143+
value[0] += (2 * rng.nextDouble() - 1) * noise;
135144
}
136145

137146
AnomalyDescriptor result = forest.process(new double[] { value[0] }, 0);

Java/examples/src/main/java/com/amazon/randomcutforest/examples/parkservices/ThresholdedMultiDimensionalExample.java

+21-7
Original file line numberDiff line numberDiff line change
@@ -47,15 +47,15 @@ public String description() {
4747
public void run() throws Exception {
4848
// Create and populate a random cut forest
4949

50-
int shingleSize = 4;
50+
int shingleSize = 8;
5151
int numberOfTrees = 50;
5252
int sampleSize = 256;
5353
Precision precision = Precision.FLOAT_32;
5454
int dataSize = 4 * sampleSize;
5555

5656
// change this to try different number of attributes,
5757
// this parameter is not expected to be larger than 5 for this example
58-
int baseDimensions = 2;
58+
int baseDimensions = 3;
5959

6060
int dimensions = baseDimensions * shingleSize;
6161
ThresholdedRandomCutForest forest = ThresholdedRandomCutForest.builder().compact(true).dimensions(dimensions)
@@ -66,19 +66,33 @@ public void run() throws Exception {
6666

6767
long seed = new Random().nextLong();
6868
System.out.println("seed = " + seed);
69+
70+
// basic amplitude of the waves -- the parameter will be randomly scaled up
71+
// between 0-20 percent
72+
double amplitude = 100.0;
73+
74+
// the amplitude of random noise; it will be +ve/-ve uniformly at random
75+
double noise = 5.0;
76+
77+
// the following controls the ratio of anomaly magnitude to noise
78+
// notice amplitude/noise would determine signal-to-noise ratio
79+
double anomalyFactor = 5;
80+
81+
// the following determines if a random linear trend should be added
82+
boolean useSlope = false;
83+
6984
// change the last argument seed for a different run
7085
MultiDimDataWithKey dataWithKeys = ShingledMultiDimDataWithKeys.getMultiDimData(dataSize + shingleSize - 1, 50,
71-
100, 5, seed, baseDimensions);
86+
amplitude, noise, seed, baseDimensions, anomalyFactor, useSlope);
7287
int keyCounter = 0;
7388
int count = 0;
7489
for (double[] point : dataWithKeys.data) {
7590

7691
AnomalyDescriptor result = forest.process(point, 0L);
7792

78-
if (keyCounter < dataWithKeys.changeIndices.length
79-
&& count + shingleSize - 1 == dataWithKeys.changeIndices[keyCounter]) {
80-
System.out.println("timestamp " + (count + shingleSize - 1) + " CHANGE "
81-
+ Arrays.toString(dataWithKeys.changes[keyCounter]));
93+
if (keyCounter < dataWithKeys.changeIndices.length && count == dataWithKeys.changeIndices[keyCounter]) {
94+
System.out.println(
95+
"timestamp " + (count) + " CHANGE " + Arrays.toString(dataWithKeys.changes[keyCounter]));
8296
++keyCounter;
8397
}
8498

Java/parkservices/src/main/java/com/amazon/randomcutforest/parkservices/ErrorHandler.java

+23-15
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ public ErrorHandler(int errorHorizon, int forecastHorizon, int sequenceIndex, do
148148
pastForecasts[i] = new RangeVector(values, upper, lower);
149149
System.arraycopy(actualsFlattened, i * inputLength, actuals[i], 0, inputLength);
150150
}
151-
calibrate();
151+
calibrate(null);
152152
}
153153
}
154154

@@ -171,7 +171,7 @@ public void update(ForecastDescriptor descriptor, Calibration calibrationMethod)
171171
actuals[errorIndex][i] = (float) input[i];
172172
}
173173
++sequenceIndex;
174-
calibrate();
174+
calibrate(descriptor.deviations);
175175
if (calibrationMethod != Calibration.NONE) {
176176
if (calibrationMethod == Calibration.SIMPLE) {
177177
adjust(descriptor.timedForecast.rangeVector, errorDistribution);
@@ -260,8 +260,8 @@ public RangeVector computeErrorPercentile(double percentile, int newHorizon,
260260
double fracRank = percentile * len;
261261
Arrays.sort(copy);
262262
values[pos] = interpolatedMedian(copy);
263-
lower[pos] = interpolatedLowerRank(copy, fracRank);
264-
upper[pos] = interpolatedUpperRank(copy, len, fracRank);
263+
lower[pos] = interpolatedLowerRank(copy, fracRank, 0);
264+
upper[pos] = interpolatedUpperRank(copy, len, fracRank, 0);
265265
}
266266
}
267267
}
@@ -286,8 +286,11 @@ protected double[] getErrorVector(int len, int leadtime, int inputCoordinate, in
286286
* this method computes a lot of different quantities, some of which would be
287287
* useful in the future. In particular it splits the RMSE into positive and
288288
* negative contribution which is informative about directionality of error.
289+
*
290+
*
291+
* @param errorDeviations the weighted standard deviations seen so far
289292
*/
290-
protected void calibrate() {
293+
protected void calibrate(double[] errorDeviations) {
291294
int inputLength = actuals[0].length;
292295
int arrayLength = pastForecasts.length;
293296
int errorIndex = (sequenceIndex - 1 + arrayLength) % arrayLength;
@@ -327,16 +330,19 @@ protected void calibrate() {
327330
errorRMSE.high[pos] = (positiveCount > 0) ? Math.sqrt(positiveSqSum / positiveCount) : 0;
328331
errorRMSE.low[pos] = (positiveCount < len) ? -Math.sqrt(negativeSqSum / (len - positiveCount)) : 0;
329332
Arrays.sort(medianError, 0, len);
333+
// medianError array is now sorted
330334
errorDistribution.values[pos] = interpolatedMedian(medianError);
331-
errorDistribution.upper[pos] = interpolatedUpperRank(medianError, len, len * percentile);
332-
errorDistribution.lower[pos] = interpolatedLowerRank(medianError, len * percentile);
335+
double deviation = (errorDeviations == null) ? 0 : errorDeviations[j];
336+
errorDistribution.upper[pos] = interpolatedUpperRank(medianError, len, len * percentile, deviation);
337+
errorDistribution.lower[pos] = interpolatedLowerRank(medianError, len * percentile, deviation);
333338
intervalPrecision[pos] = intervalPrecision[pos] / len;
334339
} else {
335340
errorMean[pos] = 0;
336341
errorRMSE.high[pos] = errorRMSE.low[pos] = 0;
337342
errorDistribution.values[pos] = 0;
338-
errorDistribution.upper[pos] = Float.MAX_VALUE;
339-
errorDistribution.lower[pos] = -Float.MAX_VALUE;
343+
double deviation = (errorDeviations == null) ? 0 : errorDeviations[j];
344+
errorDistribution.upper[pos] = (float) (1.3 * deviation);
345+
errorDistribution.lower[pos] = -(float) (1.3 * deviation);
340346
adders.upper[pos] = adders.lower[pos] = adders.values[pos] = 0;
341347
intervalPrecision[pos] = 0;
342348
}
@@ -354,28 +360,30 @@ float interpolatedMedian(double[] array) {
354360
}
355361
}
356362

357-
float interpolatedLowerRank(double[] array, double fracRank) {
363+
float interpolatedLowerRank(double[] ascendingArray, double fracRank, double deviation) {
358364
if (fracRank < 1) {
359-
return -Float.MAX_VALUE;
365+
return (float) (-1.3 * deviation * (1 - fracRank) + fracRank * ascendingArray[0]);
360366
}
361367
int rank = (int) Math.floor(fracRank);
362368
if (!RCFCaster.USE_INTERPOLATION_IN_DISTRIBUTION) {
363369
// turn off interpolation
364370
fracRank = rank;
365371
}
366-
return (float) (array[rank - 1] + (fracRank - rank) * (array[rank] - array[rank - 1]));
372+
return (float) (ascendingArray[rank - 1]
373+
+ (fracRank - rank) * (ascendingArray[rank] - ascendingArray[rank - 1]));
367374
}
368375

369-
float interpolatedUpperRank(double[] array, int len, double fracRank) {
376+
float interpolatedUpperRank(double[] ascendingArray, int len, double fracRank, double deviation) {
370377
if (fracRank < 1) {
371-
return Float.MAX_VALUE;
378+
return (float) (1.3 * deviation * (1 - fracRank) + fracRank * ascendingArray[len - 1]);
372379
}
373380
int rank = (int) Math.floor(fracRank);
374381
if (!RCFCaster.USE_INTERPOLATION_IN_DISTRIBUTION) {
375382
// turn off interpolation
376383
fracRank = rank;
377384
}
378-
return (float) (array[len - rank] + (fracRank - rank) * (array[len - rank - 1] - array[len - rank]));
385+
return (float) (ascendingArray[len - rank]
386+
+ (fracRank - rank) * (ascendingArray[len - rank - 1] - ascendingArray[len - rank]));
379387
}
380388

381389
void adjust(RangeVector rangeVector, RangeVector other) {

0 commit comments

Comments
 (0)