Skip to content

Commit 9fbf32c

Browse files
committed
state machine, separate config index, suggest & validate
Also, save cold start results for run once visualization. Signed-off-by: Kaituo Li <kaituo@amazon.com>
1 parent 926b2e1 commit 9fbf32c

File tree

175 files changed

+2370
-989
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

175 files changed

+2370
-989
lines changed

build.gradle

+1
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ dependencies {
164164
testImplementation "org.opensearch:opensearch-core:${opensearch_version}"
165165
testRuntimeOnly("org.junit.platform:junit-platform-launcher:1.11.2")
166166
testCompileOnly 'junit:junit:4.13.2'
167+
testImplementation 'org.reflections:reflections:0.10.2'
167168
}
168169

169170
apply plugin: 'java'

src/main/java/org/opensearch/ad/ADEntityProfileRunner.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ public ADEntityProfileRunner(
4040
AnalysisType.AD,
4141
ADEntityProfileAction.INSTANCE,
4242
ADCommonName.ANOMALY_RESULT_INDEX_ALIAS,
43-
AnomalyResult.DETECTOR_ID_FIELD
43+
AnomalyResult.DETECTOR_ID_FIELD,
44+
ADCommonName.CONFIG_INDEX
4445
);
4546
}
4647
}

src/main/java/org/opensearch/ad/AnomalyDetectorProfileRunner.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import org.apache.logging.log4j.LogManager;
1515
import org.apache.logging.log4j.Logger;
16+
import org.opensearch.ad.constant.ADCommonName;
1617
import org.opensearch.ad.indices.ADIndex;
1718
import org.opensearch.ad.indices.ADIndexManagement;
1819
import org.opensearch.ad.model.ADTask;
@@ -71,7 +72,8 @@ public AnomalyDetectorProfileRunner(
7172
ProfileName.AD_TASK,
7273
ADProfileAction.INSTANCE,
7374
AnomalyDetector::parse,
74-
taskProfileRunner
75+
taskProfileRunner,
76+
ADCommonName.CONFIG_INDEX
7577
);
7678
}
7779

src/main/java/org/opensearch/ad/ExecuteADResultResponseRecorder.java

+8-5
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
package org.opensearch.ad;
1313

14+
import java.time.Clock;
1415
import java.time.Instant;
1516
import java.util.ArrayList;
1617
import java.util.Optional;
@@ -50,7 +51,7 @@ public ExecuteADResultResponseRecorder(
5051
ThreadPool threadPool,
5152
Client client,
5253
NodeStateManager nodeStateManager,
53-
ADTaskCacheManager taskCacheManager,
54+
Clock clock,
5455
int rcfMinSamples
5556
) {
5657
super(
@@ -62,7 +63,7 @@ public ExecuteADResultResponseRecorder(
6263
TimeSeriesAnalyticsPlugin.AD_THREAD_POOL_NAME,
6364
client,
6465
nodeStateManager,
65-
taskCacheManager,
66+
clock,
6667
rcfMinSamples,
6768
ADIndex.RESULT,
6869
AnalysisType.AD,
@@ -104,14 +105,15 @@ protected AnomalyResult createErrorResult(
104105
* Instead, we issue a profile request to poll each model node and get the maximum total updates among all models.
105106
* @param response response returned from executing AnomalyResultAction
106107
* @param configId config Id
108+
* @param clock Clock to get current time
107109
*/
108110
@Override
109-
protected void updateRealtimeTask(ResultResponse<AnomalyResult> response, String configId) {
111+
protected void updateRealtimeTask(ResultResponse<AnomalyResult> response, String configId, Clock clock) {
110112
if (response.isHC() != null && response.isHC()) {
111113
if (taskManager.skipUpdateRealtimeTask(configId, response.getError())) {
112114
return;
113115
}
114-
delayedUpdate(response, configId);
116+
delayedUpdate(response, configId, clock);
115117
} else {
116118
log
117119
.debug(
@@ -124,7 +126,8 @@ protected void updateRealtimeTask(ResultResponse<AnomalyResult> response, String
124126
null,
125127
response.getRcfTotalUpdates(),
126128
response.getConfigIntervalInMinutes(),
127-
response.getError()
129+
response.getError(),
130+
clock
128131
);
129132
}
130133
}

src/main/java/org/opensearch/ad/constant/ADCommonName.java

+3
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,12 @@ public class ADCommonName {
1919
public static final String CHECKPOINT_INDEX_NAME = ".opendistro-anomaly-checkpoints";
2020
// index name for anomaly detection state. Will store AD task in this index as well.
2121
public static final String DETECTION_STATE_INDEX = ".opendistro-anomaly-detection-state";
22+
// config index. We are reusing ad detector index.
23+
public static final String CONFIG_INDEX = ".opendistro-anomaly-detectors";
2224

2325
// The alias of the index in which to write AD result history
2426
public static final String ANOMALY_RESULT_INDEX_ALIAS = ".opendistro-anomaly-results";
27+
public static final String CONFIG_INDEX_NAME = "";
2528

2629
// ======================================
2730
// Anomaly Detector name for X-Opaque-Id header

src/main/java/org/opensearch/ad/indices/ADIndex.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ public enum ADIndex implements TimeSeriesIndex {
3030
true,
3131
ThrowingSupplierWrapper.throwingSupplierWrapper(ADIndexManagement::getResultMappings)
3232
),
33-
CONFIG(CommonName.CONFIG_INDEX, false, ThrowingSupplierWrapper.throwingSupplierWrapper(ADIndexManagement::getConfigMappings)),
33+
CONFIG(ADCommonName.CONFIG_INDEX, false, ThrowingSupplierWrapper.throwingSupplierWrapper(ADIndexManagement::getConfigMappings)),
3434
JOB(CommonName.JOB_INDEX, false, ThrowingSupplierWrapper.throwingSupplierWrapper(ADIndexManagement::getJobMappings)),
3535
CHECKPOINT(
3636
ADCommonName.CHECKPOINT_INDEX_NAME,
@@ -65,4 +65,9 @@ public boolean isAlias() {
6565
public String getMapping() {
6666
return mapping;
6767
}
68+
69+
@Override
70+
public boolean isConfigIndex() {
71+
return ADCommonName.CONFIG_INDEX.equals(getIndexName());
72+
}
6873
}

src/main/java/org/opensearch/ad/indices/ADIndexManagement.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ public ADIndexManagement(
9494
ADIndex.RESULT.getMapping(),
9595
xContentRegistry,
9696
AnomalyDetector::parse,
97-
ADCommonName.CUSTOM_RESULT_INDEX_PREFIX
97+
ADCommonName.CUSTOM_RESULT_INDEX_PREFIX,
98+
ADCommonName.CONFIG_INDEX
9899
);
99100

100101
this.indexStates = new EnumMap<ADIndex, IndexState>(ADIndex.class);

src/main/java/org/opensearch/ad/ml/ADColdStart.java

+73-9
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,21 @@
1313

1414
import java.time.Clock;
1515
import java.time.Duration;
16+
import java.time.Instant;
17+
import java.util.ArrayList;
1618
import java.util.List;
1719

20+
import org.apache.commons.lang3.tuple.Pair;
1821
import org.apache.logging.log4j.LogManager;
1922
import org.apache.logging.log4j.Logger;
2023
import org.opensearch.ad.indices.ADIndex;
2124
import org.opensearch.ad.indices.ADIndexManagement;
2225
import org.opensearch.ad.ml.IgnoreSimilarExtractor.ThresholdArrays;
2326
import org.opensearch.ad.model.AnomalyDetector;
27+
import org.opensearch.ad.model.AnomalyResult;
2428
import org.opensearch.ad.ratelimit.ADCheckpointWriteWorker;
29+
import org.opensearch.forecast.ml.RCFCasterResult;
30+
import org.opensearch.forecast.model.ForecastResult;
2531
import org.opensearch.threadpool.ThreadPool;
2632
import org.opensearch.timeseries.AnalysisType;
2733
import org.opensearch.timeseries.NodeStateManager;
@@ -34,18 +40,21 @@
3440
import org.opensearch.timeseries.model.Config;
3541
import org.opensearch.timeseries.ratelimit.RequestPriority;
3642
import org.opensearch.timeseries.settings.TimeSeriesSettings;
43+
import org.opensearch.timeseries.util.ModelUtil;
44+
import org.opensearch.timeseries.util.ParseUtils;
3745

3846
import com.amazon.randomcutforest.config.ForestMode;
3947
import com.amazon.randomcutforest.config.Precision;
4048
import com.amazon.randomcutforest.config.TransformMethod;
49+
import com.amazon.randomcutforest.parkservices.AnomalyDescriptor;
4150
import com.amazon.randomcutforest.parkservices.ThresholdedRandomCutForest;
4251

4352
/**
4453
* Training models for HCAD detectors
4554
*
4655
*/
4756
public class ADColdStart extends
48-
ModelColdStart<ThresholdedRandomCutForest, ADIndex, ADIndexManagement, ADCheckpointDao, ADCheckpointWriteWorker> {
57+
ModelColdStart<ThresholdedRandomCutForest, ADIndex, ADIndexManagement, ADCheckpointDao, ADCheckpointWriteWorker, AnomalyResult> {
4958
private static final Logger logger = LogManager.getLogger(ADColdStart.class);
5059

5160
/**
@@ -87,7 +96,8 @@ public ADColdStart(
8796
ADCheckpointWriteWorker checkpointWriteWorker,
8897
long rcfSeed,
8998
int maxRoundofColdStart,
90-
int coolDownMinutes
99+
int coolDownMinutes,
100+
int resultSchemaVersion
91101
) {
92102
super(
93103
modelTtl,
@@ -107,7 +117,8 @@ public ADColdStart(
107117
featureManager,
108118
maxRoundofColdStart,
109119
TimeSeriesAnalyticsPlugin.AD_THREAD_POOL_NAME,
110-
AnalysisType.AD
120+
AnalysisType.AD,
121+
resultSchemaVersion
111122
);
112123
}
113124

@@ -126,7 +137,8 @@ public ADColdStart(
126137
Duration modelTtl,
127138
ADCheckpointWriteWorker checkpointWriteQueue,
128139
int maxRoundofColdStart,
129-
int coolDownMinutes
140+
int coolDownMinutes,
141+
int resultSchemaVersion
130142
) {
131143
this(
132144
clock,
@@ -144,7 +156,8 @@ public ADColdStart(
144156
checkpointWriteQueue,
145157
-1,
146158
maxRoundofColdStart,
147-
coolDownMinutes
159+
coolDownMinutes,
160+
resultSchemaVersion
148161
);
149162
}
150163

@@ -158,7 +171,7 @@ public ADColdStart(
158171
* training data in result index so that the frontend can plot it.
159172
*/
160173
@Override
161-
protected List<Sample> trainModelFromDataSegments(
174+
protected List<AnomalyResult> trainModelFromDataSegments(
162175
List<Sample> pointSamples,
163176
ModelState<ThresholdedRandomCutForest> entityState,
164177
Config config,
@@ -185,6 +198,7 @@ protected List<Sample> trainModelFromDataSegments(
185198
.numberOfTrees(numberOfTrees)
186199
.timeDecay(config.getTimeDecay())
187200
.transformDecay(config.getTimeDecay())
201+
// allow enough samples before emitting scores to park service
188202
.outputAfter(Math.max(shingleSize, numMinSamples))
189203
.initialAcceptFraction(initialAcceptFraction)
190204
.parallelExecutionEnabled(false)
@@ -221,21 +235,71 @@ protected List<Sample> trainModelFromDataSegments(
221235
// use build instead of new TRCF(Builder) because build method did extra validation and initialization
222236
ThresholdedRandomCutForest trcf = rcfBuilder.build();
223237

238+
// Prepare for sequential processing
239+
double[][] sequentialData = new double[pointSamples.size()][];
240+
List<Pair<Instant, Instant>> sequentialTime = new ArrayList<>();
241+
242+
// Convert the list of Sample objects into a 2D array + a parallel list of time pairs
224243
for (int i = 0; i < pointSamples.size(); i++) {
225244
Sample dataSample = pointSamples.get(i);
226245
double[] dataValue = dataSample.getValueList();
227-
// We don't keep missing values during cold start as the actual data may not be reconstructed during the early stage.
228-
trcf.process(dataValue, dataSample.getDataEndTime().getEpochSecond());
246+
247+
sequentialData[i] = dataValue;
248+
// Store start and end times together
249+
sequentialTime.add(Pair.of(dataSample.getDataStartTime(), dataSample.getDataEndTime()));
250+
}
251+
252+
// Process data in one go
253+
List<AnomalyDescriptor> descriptors = trcf.processSequentially(sequentialData, x -> true);
254+
255+
// Check for size mismatch
256+
if (descriptors.size() != sequentialTime.size()) {
257+
logger.warn(
258+
"processSequentially returned a different size than expected: got [{}], expected [{}].",
259+
descriptors.size(),
260+
sequentialTime.size()
261+
);
262+
return null;
263+
}
264+
265+
// Build anomaly results from sequential descriptors
266+
List<AnomalyResult> results = new ArrayList<>();
267+
for (int i = 0; i < descriptors.size(); i++) {
268+
AnomalyDescriptor descriptor = descriptors.get(i);
269+
double[] dataValue = sequentialData[i];
270+
Pair<Instant, Instant> time = sequentialTime.get(i);
271+
272+
// Convert the descriptor into a thresholding result, or anomaly result
273+
ThresholdingResult thresholdingResult =
274+
ModelUtil.toResult(trcf.getForest(), descriptor, dataValue, false, config);
275+
276+
Instant now = Instant.now();
277+
results.addAll(
278+
thresholdingResult.toIndexableResults(
279+
config,
280+
time.getLeft(), // Data start time
281+
time.getRight(), // Data end time
282+
now,
283+
now,
284+
ParseUtils.getFeatureData(dataValue, config),
285+
entityState.getEntity(),
286+
resultMappingVersion,
287+
entityState.getModelId(),
288+
taskId,
289+
null
290+
)
291+
);
229292
}
230293

294+
231295
entityState.setModel(trcf);
232296

233297
entityState.setLastUsedTime(clock.instant());
234298

235299
// save to checkpoint
236300
checkpointWriteWorker.write(entityState, true, RequestPriority.MEDIUM);
237301

238-
return pointSamples;
302+
return results;
239303
}
240304

241305
public static void applyRule(ThresholdedRandomCutForest.Builder rcfBuilder, AnomalyDetector detector) {

0 commit comments

Comments
 (0)