facebookincubator · NEUpanning · Mar 6, 2025 · Mar 6, 2025 · Mar 6, 2025 · Mar 7, 2025
diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h
@@ -333,6 +333,12 @@ class QueryConfig {
   static constexpr const char* kSparkLegacyDateFormatter =
       "spark.legacy_date_formatter";
 
+  /// If true, statistical aggregate functions, including skewness and
+  /// kurtosis, return std::numeric_limits<double>::quiet_NaN() instead of
+  /// NULL when a division by zero occurs while computing the result.
+  static constexpr const char* kSparkLegacyStatisticalAggregate =
+      "spark.legacy_statistical_aggregate";
+
   /// The number of local parallel table writer operators per task.
   static constexpr const char* kTaskWriterCount = "task_writer_count";
 
@@ -831,6 +837,10 @@ class QueryConfig {
     return get<bool>(kSparkLegacyDateFormatter, false);
   }
 
+  bool sparkLegacyStatisticalAggregate() const {
+    return get<bool>(kSparkLegacyStatisticalAggregate, false);
+  }
+
   bool exprTrackCpuUsage() const {
     return get<bool>(kExprTrackCpuUsage, false);
   }

diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst
@@ -887,6 +887,11 @@ Spark-specific Configuration
        Joda date formatter performs strict checking of its input and uses different pattern string.
        For example, the 2015-07-22 10:00:00 timestamp cannot be parsed if pattern is yyyy-MM-dd because the parser does not consume whole input.
        Another example is that the 'W' pattern, which means week in month, is not supported. For more differences, see :issue:`10354`.
+   * - spark.legacy_statistical_aggregate
+     - bool
+     - false
+     - If true, statistical aggregate functions, including skewness and kurtosis, return std::numeric_limits<double>::quiet_NaN()
+       instead of NULL when a division by zero occurs while computing the result.
 
 Tracing
 --------

diff --git a/velox/functions/sparksql/aggregates/CentralMomentsAggregate.cpp b/velox/functions/sparksql/aggregates/CentralMomentsAggregate.cpp
@@ -15,40 +15,51 @@
  */
 
 #include "velox/functions/sparksql/aggregates/CentralMomentsAggregate.h"
+#include <limits>
 #include "velox/functions/lib/aggregates/CentralMomentsAggregatesBase.h"
 
 namespace facebook::velox::functions::aggregate::sparksql {
 
 namespace {
+template <bool nullOnDivideByZero>
 struct SkewnessResultAccessor {
   static bool hasResult(const CentralMomentsAccumulator& accumulator) {
-    return accumulator.count() >= 1 && accumulator.m2() != 0;
+    if constexpr (nullOnDivideByZero) {
+      return accumulator.count() >= 1 && accumulator.m2() != 0;
+    }
+    return accumulator.count() >= 1;
   }
 
   static double result(const CentralMomentsAccumulator& accumulator) {
+    if (accumulator.m2() == 0) {
+      return std::numeric_limits<double>::quiet_NaN();
+    }
     return std::sqrt(accumulator.count()) * accumulator.m3() /
         std::pow(accumulator.m2(), 1.5);
   }
 };
 
+template <bool nullOnDivideByZero>
 struct KurtosisResultAccessor {
   static bool hasResult(const CentralMomentsAccumulator& accumulator) {
-    return accumulator.count() >= 1 && accumulator.m2() != 0;
+    if constexpr (nullOnDivideByZero) {
+      return accumulator.count() >= 1 && accumulator.m2() != 0;
+    }
+    return accumulator.count() >= 1;
   }
 
   static double result(const CentralMomentsAccumulator& accumulator) {
+    if (accumulator.m2() == 0) {
+      return std::numeric_limits<double>::quiet_NaN();
+    }
     double count = accumulator.count();
     double m2 = accumulator.m2();
     double m4 = accumulator.m4();
     return count * m4 / (m2 * m2) - 3.0;
   }
 };
 
-template <typename TResultAccessor>
-exec::AggregateRegistrationResult registerCentralMoments(
-    const std::string& name,
-    bool withCompanionFunctions,
-    bool overwrite) {
+std::vector<std::shared_ptr<exec::AggregateFunctionSignature>> getSignatures() {
   std::vector<std::shared_ptr<exec::AggregateFunctionSignature>> signatures;
   std::vector<std::string> inputTypes = {
       "smallint", "integer", "bigint", "real", "double"};
@@ -60,6 +71,115 @@ exec::AggregateRegistrationResult registerCentralMoments(
             .argumentType(inputType)
             .build());
   }
+  return signatures;
+}
+
+exec::AggregateRegistrationResult registerSkewness(
+    const std::string& name,
+    bool withCompanionFunctions,
+    bool overwrite) {
+  std::vector<std::shared_ptr<exec::AggregateFunctionSignature>> signatures =
+      getSignatures();
+
+  return exec::registerAggregateFunction(
+      name,
+      std::move(signatures),
+      [name](
+          core::AggregationNode::Step step,
+          const std::vector<TypePtr>& argTypes,
+          const TypePtr& resultType,
+          const core::QueryConfig& config) -> std::unique_ptr<exec::Aggregate> {
+        VELOX_CHECK_LE(
+            argTypes.size(), 1, "{} takes at most one argument", name);
+        const auto& inputType = argTypes[0];
+        if (config.sparkLegacyStatisticalAggregate()) {
+          if (exec::isRawInput(step)) {
+            switch (inputType->kind()) {
+              case TypeKind::SMALLINT:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int16_t,
+                    SkewnessResultAccessor<false>>>(resultType);
+              case TypeKind::INTEGER:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int32_t,
+                    SkewnessResultAccessor<false>>>(resultType);
+              case TypeKind::BIGINT:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int64_t,
+                    SkewnessResultAccessor<false>>>(resultType);
+              case TypeKind::DOUBLE:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    double,
+                    SkewnessResultAccessor<false>>>(resultType);
+              case TypeKind::REAL:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    float,
+                    SkewnessResultAccessor<false>>>(resultType);
+              default:
+                VELOX_UNSUPPORTED(
+                    "Unsupported input type: {}. "
+                    "Expected SMALLINT, INTEGER, BIGINT, DOUBLE or REAL.",
+                    inputType->toString());
+            }
+          } else {
+            checkAccumulatorRowType(
+                inputType,
+                "Input type for final aggregation must be "
+                "(count:bigint, m1:double, m2:double, m3:double, m4:double) struct");
+            return std::make_unique<CentralMomentsAggregatesBase<
+                int64_t /*unused*/,
+                SkewnessResultAccessor<false>>>(resultType);
+          }
+        } else {
+          if (exec::isRawInput(step)) {
+            switch (inputType->kind()) {
+              case TypeKind::SMALLINT:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int16_t,
+                    SkewnessResultAccessor<true>>>(resultType);
+              case TypeKind::INTEGER:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int32_t,
+                    SkewnessResultAccessor<true>>>(resultType);
+              case TypeKind::BIGINT:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int64_t,
+                    SkewnessResultAccessor<true>>>(resultType);
+              case TypeKind::DOUBLE:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    double,
+                    SkewnessResultAccessor<true>>>(resultType);
+              case TypeKind::REAL:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    float,
+                    SkewnessResultAccessor<true>>>(resultType);
+              default:
+                VELOX_UNSUPPORTED(
+                    "Unsupported input type: {}. "
+                    "Expected SMALLINT, INTEGER, BIGINT, DOUBLE or REAL.",
+                    inputType->toString());
+            }
+          } else {
+            checkAccumulatorRowType(
+                inputType,
+                "Input type for final aggregation must be "
+                "(count:bigint, m1:double, m2:double, m3:double, m4:double) struct");
+            return std::make_unique<CentralMomentsAggregatesBase<
+                int64_t /*unused*/,
+                SkewnessResultAccessor<true>>>(resultType);
+          }
+        }
+      },
+      withCompanionFunctions,
+      overwrite);
+}
+
+exec::AggregateRegistrationResult registerKurtosis(
+    const std::string& name,
+    bool withCompanionFunctions,
+    bool overwrite) {
+  std::vector<std::shared_ptr<exec::AggregateFunctionSignature>> signatures =
+      getSignatures();
 
   return exec::registerAggregateFunction(
       name,
@@ -68,47 +188,86 @@ exec::AggregateRegistrationResult registerCentralMoments(
           core::AggregationNode::Step step,
           const std::vector<TypePtr>& argTypes,
           const TypePtr& resultType,
-          const core::QueryConfig& /*config*/)
-          -> std::unique_ptr<exec::Aggregate> {
+          const core::QueryConfig& config) -> std::unique_ptr<exec::Aggregate> {
         VELOX_CHECK_LE(
             argTypes.size(), 1, "{} takes at most one argument", name);
         const auto& inputType = argTypes[0];
-        if (exec::isRawInput(step)) {
-          switch (inputType->kind()) {
-            case TypeKind::SMALLINT:
-              return std::make_unique<
-                  CentralMomentsAggregatesBase<int16_t, TResultAccessor>>(
-                  resultType);
-            case TypeKind::INTEGER:
-              return std::make_unique<
-                  CentralMomentsAggregatesBase<int32_t, TResultAccessor>>(
-                  resultType);
-            case TypeKind::BIGINT:
-              return std::make_unique<
-                  CentralMomentsAggregatesBase<int64_t, TResultAccessor>>(
-                  resultType);
-            case TypeKind::DOUBLE:
-              return std::make_unique<
-                  CentralMomentsAggregatesBase<double, TResultAccessor>>(
-                  resultType);
-            case TypeKind::REAL:
-              return std::make_unique<
-                  CentralMomentsAggregatesBase<float, TResultAccessor>>(
-                  resultType);
-            default:
-              VELOX_UNSUPPORTED(
-                  "Unsupported input type: {}. "
-                  "Expected SMALLINT, INTEGER, BIGINT, DOUBLE or REAL.",
-                  inputType->toString());
+        if (config.sparkLegacyStatisticalAggregate()) {
+          if (exec::isRawInput(step)) {
+            switch (inputType->kind()) {
+              case TypeKind::SMALLINT:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int16_t,
+                    KurtosisResultAccessor<false>>>(resultType);
+              case TypeKind::INTEGER:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int32_t,
+                    KurtosisResultAccessor<false>>>(resultType);
+              case TypeKind::BIGINT:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int64_t,
+                    KurtosisResultAccessor<false>>>(resultType);
+              case TypeKind::DOUBLE:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    double,
+                    KurtosisResultAccessor<false>>>(resultType);
+              case TypeKind::REAL:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    float,
+                    KurtosisResultAccessor<false>>>(resultType);
+              default:
+                VELOX_UNSUPPORTED(
+                    "Unsupported input type: {}. "
+                    "Expected SMALLINT, INTEGER, BIGINT, DOUBLE or REAL.",
+                    inputType->toString());
+            }
+          } else {
+            checkAccumulatorRowType(
+                inputType,
+                "Input type for final aggregation must be "
+                "(count:bigint, m1:double, m2:double, m3:double, m4:double) struct");
+            return std::make_unique<CentralMomentsAggregatesBase<
+                int64_t /*unused*/,
+                KurtosisResultAccessor<false>>>(resultType);
           }
         } else {
-          checkAccumulatorRowType(
-              inputType,
-              "Input type for final aggregation must be "
-              "(count:bigint, m1:double, m2:double, m3:double, m4:double) struct");
-          return std::make_unique<CentralMomentsAggregatesBase<
-              int64_t /*unused*/,
-              TResultAccessor>>(resultType);
+          if (exec::isRawInput(step)) {
+            switch (inputType->kind()) {
+              case TypeKind::SMALLINT:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int16_t,
+                    KurtosisResultAccessor<true>>>(resultType);
+              case TypeKind::INTEGER:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int32_t,
+                    KurtosisResultAccessor<true>>>(resultType);
+              case TypeKind::BIGINT:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    int64_t,
+                    KurtosisResultAccessor<true>>>(resultType);
+              case TypeKind::DOUBLE:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    double,
+                    KurtosisResultAccessor<true>>>(resultType);
+              case TypeKind::REAL:
+                return std::make_unique<CentralMomentsAggregatesBase<
+                    float,
+                    KurtosisResultAccessor<true>>>(resultType);
+              default:
+                VELOX_UNSUPPORTED(
+                    "Unsupported input type: {}. "
+                    "Expected SMALLINT, INTEGER, BIGINT, DOUBLE or REAL.",
+                    inputType->toString());
+            }
+          } else {
+            checkAccumulatorRowType(
+                inputType,
+                "Input type for final aggregation must be "
+                "(count:bigint, m1:double, m2:double, m3:double, m4:double) struct");
+            return std::make_unique<CentralMomentsAggregatesBase<
+                int64_t /*unused*/,
+                KurtosisResultAccessor<true>>>(resultType);
+          }
         }
       },
       withCompanionFunctions,
@@ -120,10 +279,8 @@ void registerCentralMomentsAggregate(
     const std::string& prefix,
     bool withCompanionFunctions,
     bool overwrite) {
-  registerCentralMoments<SkewnessResultAccessor>(
-      prefix + "skewness", withCompanionFunctions, overwrite);
-  registerCentralMoments<KurtosisResultAccessor>(
-      prefix + "kurtosis", withCompanionFunctions, overwrite);
+  registerSkewness(prefix + "skewness", withCompanionFunctions, overwrite);
+  registerKurtosis(prefix + "kurtosis", withCompanionFunctions, overwrite);
 }
 
 } // namespace facebook::velox::functions::aggregate::sparksql
diff --git a/velox/functions/sparksql/aggregates/tests/CentralMomentsAggregationTest.cpp b/velox/functions/sparksql/aggregates/tests/CentralMomentsAggregationTest.cpp
@@ -40,6 +40,18 @@ class CentralMomentsAggregationTest : public AggregationTestBase {
     builder.singleAggregation({}, {fmt::format("spark_{}(c0)", agg)});
     AssertQueryBuilder(builder.planNode()).assertResults({expected});
   }
+
+  void testLegacyCenteralMomentsAggResult(
+      const std::string& agg,
+      const RowVectorPtr& input,
+      const RowVectorPtr& expected) {
+    PlanBuilder builder(pool());
+    builder.values({input});
+    builder.singleAggregation({}, {fmt::format("spark_{}(c0)", agg)});
+    AssertQueryBuilder(builder.planNode())
+        .config("spark.legacy_statistical_aggregate", "true")
+        .assertResults({expected});
+  }
 };
 
 TEST_F(CentralMomentsAggregationTest, skewnessHasResult) {
@@ -54,6 +66,19 @@ TEST_F(CentralMomentsAggregationTest, skewnessHasResult) {
   expected = makeRowVector({makeNullableFlatVector<double>(
       std::vector<std::optional<double>>{std::nullopt})});
   testCenteralMomentsAggResult(agg, input, expected);
+
+  // Output NULL when m2 equals 0.
+  input = makeRowVector({makeFlatVector<int32_t>({1, 1})});
+  expected = makeRowVector({makeNullableFlatVector<double>(
+      std::vector<std::optional<double>>{std::nullopt})});
+  testCenteralMomentsAggResult(agg, input, expected);
+
+  // Output NaN when m2 equals 0 for legacy aggregate.
+  input = makeRowVector({makeFlatVector<int32_t>({1, 1})});
+  expected = makeRowVector(
+      {makeNullableFlatVector<double>(std::vector<std::optional<double>>{
+          std::numeric_limits<double>::quiet_NaN()})});
+  testLegacyCenteralMomentsAggResult(agg, input, expected);
 }
 
 TEST_F(CentralMomentsAggregationTest, pearsonKurtosis) {
@@ -78,6 +103,13 @@ TEST_F(CentralMomentsAggregationTest, pearsonKurtosis) {
   expected = makeRowVector({makeNullableFlatVector<double>(
       std::vector<std::optional<double>>{std::nullopt})});
   testCenteralMomentsAggResult(agg, input, expected);
+
+  // Output NaN when m2 equals 0 for legacy aggregate.
+  input = makeRowVector({makeFlatVector<int32_t>({1, 1})});
+  expected = makeRowVector(
+      {makeNullableFlatVector<double>(std::vector<std::optional<double>>{
+          std::numeric_limits<double>::quiet_NaN()})});
+  testLegacyCenteralMomentsAggResult(agg, input, expected);
 }
 
 } // namespace