Throw an exception when memory usage estimation endpoint encounters empty data frame. (#49143) (#49164)

przemekwitek · web-flow · commit 150db2b54405 · 2019-11-18T07:52:57.000+01:00
diff --git a/x-pack/plugin/ml/qa/ml-with-security/build.gradle b/x-pack/plugin/ml/qa/ml-with-security/build.gradle
@@ -92,6 +92,7 @@ integTest.runner {
     'ml/data_frame_analytics_crud/Test put classification given num_top_classes is greater than 1k',
     'ml/data_frame_analytics_crud/Test put classification given training_percent is less than one',
     'ml/data_frame_analytics_crud/Test put classification given training_percent is greater than hundred',
+    'ml/data_frame_analytics_memory_usage_estimation/Test memory usage estimation for empty data frame',
     'ml/evaluate_data_frame/Test given missing index',
     'ml/evaluate_data_frame/Test given index does not exist',
     'ml/evaluate_data_frame/Test given missing evaluation',
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java
@@ -238,11 +238,13 @@ private void getStartContext(String id, ActionListener<StartContext> finalListen
                             .collectDataSummaryAsync(ActionListener.wrap(
                                 dataSummary -> {
                                     if (dataSummary.rows == 0) {
-                                        finalListener.onFailure(new ElasticsearchStatusException(
-                                            "Unable to start {} as there are no analyzable data in source indices [{}].",
-                                            RestStatus.BAD_REQUEST,
-                                            id,
-                                            Strings.arrayToCommaDelimitedString(startContext.config.getSource().getIndex())
+                                        finalListener.onFailure(ExceptionsHelper.badRequestException(
+                                            "Unable to start {} as no documents in the source indices [{}] contained all the fields "
+                                                + "selected for analysis. If you are relying on automatic field selection then there are "
+                                                + "currently mapped fields that do not exist in any indexed documents, and you will have "
+                                                + "to switch to explicit field selection and include only fields that exist in indexed "
+                                                + "documents.",
+                                            id, Strings.arrayToCommaDelimitedString(startContext.config.getSource().getIndex())
                                         ));
                                     } else {
                                         finalListener.onResponse(startContext);
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/MemoryUsageEstimationProcessManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/MemoryUsageEstimationProcessManager.java
@@ -9,6 +9,7 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
@@ -57,10 +58,16 @@ private MemoryUsageEstimationResult runJob(String jobId,
                                                DataFrameDataExtractorFactory dataExtractorFactory) {
         DataFrameDataExtractor dataExtractor = dataExtractorFactory.newExtractor(false);
         DataFrameDataExtractor.DataSummary dataSummary = dataExtractor.collectDataSummary();
-        Set<String> categoricalFields = dataExtractor.getCategoricalFields(config.getAnalysis());
         if (dataSummary.rows == 0) {
-            return new MemoryUsageEstimationResult(ByteSizeValue.ZERO, ByteSizeValue.ZERO);
+            throw ExceptionsHelper.badRequestException(
+                "[{}] Unable to estimate memory usage as no documents in the source indices [{}] contained all the fields selected for "
+                    + "analysis. If you are relying on automatic field selection then there are currently mapped fields that do not exist "
+                    + "in any indexed documents, and you will have to switch to explicit field selection and include only fields that "
+                    + "exist in indexed documents.",
+                jobId,
+                Strings.arrayToCommaDelimitedString(config.getSource().getIndex()));
         }
+        Set<String> categoricalFields = dataExtractor.getCategoricalFields(config.getAnalysis());
         AnalyticsProcessConfig processConfig =
             new AnalyticsProcessConfig(
                 jobId,
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/MemoryUsageEstimationProcessManagerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/process/MemoryUsageEstimationProcessManagerTests.java
@@ -42,8 +42,6 @@ public class MemoryUsageEstimationProcessManagerTests extends ESTestCase {
     private static final String CONFIG_ID = "dummy";
     private static final int NUM_ROWS = 100;
     private static final int NUM_COLS = 4;
-    private static final MemoryUsageEstimationResult PROCESS_RESULT_ZERO =
-        new MemoryUsageEstimationResult(ByteSizeValue.ZERO, ByteSizeValue.ZERO);
     private static final MemoryUsageEstimationResult PROCESS_RESULT =
         new MemoryUsageEstimationResult(ByteSizeValue.parseBytesSizeValue("20kB", ""), ByteSizeValue.parseBytesSizeValue("10kB", ""));
 
@@ -85,9 +83,11 @@ public void testRunJob_EmptyDataFrame() {
 
         processManager.runJobAsync(TASK_ID, dataFrameAnalyticsConfig, dataExtractorFactory, listener);
 
-        verify(listener).onResponse(resultCaptor.capture());
-        MemoryUsageEstimationResult result = resultCaptor.getValue();
-        assertThat(result, equalTo(PROCESS_RESULT_ZERO));
+        verify(listener).onFailure(exceptionCaptor.capture());
+        ElasticsearchException exception = (ElasticsearchException) exceptionCaptor.getValue();
+        assertThat(exception.status(), equalTo(RestStatus.BAD_REQUEST));
+        assertThat(exception.getMessage(), containsString(TASK_ID));
+        assertThat(exception.getMessage(), containsString("Unable to estimate memory usage"));
 
         verifyNoMoreInteractions(process, listener);
     }
diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/data_frame_analytics_memory_usage_estimation.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/data_frame_analytics_memory_usage_estimation.yml
@@ -14,12 +14,27 @@ setup:
 ---
 "Test memory usage estimation for empty data frame":
   - do:
+      catch: /Unable to estimate memory usage as no documents in the source indices \[index-source\] contained all the fields selected for analysis/
+      ml.estimate_memory_usage:
+        body:
+          source: { index: "index-source" }
+          analysis: { outlier_detection: {} }
+
+  - do:
+      index:
+        index: index-source
+        refresh: true
+        body: { x: 1 }
+  - match: { result: "created" }
+
+  # Note that value for "y" is missing and outlier detection analysis does not support missing values.
+  # Hence, the data frame is still considered empty.
+  - do:
+      catch: /Unable to estimate memory usage as no documents in the source indices \[index-source\] contained all the fields selected for analysis/
       ml.estimate_memory_usage:
         body:
           source: { index: "index-source" }
           analysis: { outlier_detection: {} }
-  - match: { expected_memory_without_disk: "0" }
-  - match: { expected_memory_with_disk: "0" }
 
 ---
 "Test memory usage estimation for non-empty data frame":
diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml
@@ -86,7 +86,7 @@
           }
 
   - do:
-      catch: /Unable to start empty-with-compatible-fields as there are no analyzable data in source indices \[empty-index-with-compatible-fields\]/
+      catch: /Unable to start empty-with-compatible-fields as no documents in the source indices \[empty-index-with-compatible-fields\] contained all the fields selected for analysis/
       ml.start_data_frame_analytics:
         id: "empty-with-compatible-fields"
 ---

Original file line number	Diff line number	Diff line change
`@@ -86,7 +86,7 @@`
`86`	`86`	`}`
`87`	`87`
`88`	`88`	`- do:`
`89`		`- catch: /Unable to start empty-with-compatible-fields as there are no analyzable data in source indices \[empty-index-with-compatible-fields\]/`
	`89`	`+ catch: /Unable to start empty-with-compatible-fields as no documents in the source indices \[empty-index-with-compatible-fields\] contained all the fields selected for analysis/`
`90`	`90`	`ml.start_data_frame_analytics:`
`91`	`91`	`id: "empty-with-compatible-fields"`
`92`	`92`	`---`