Fix NaN handling in Spark collect_set

rui-mo · rui-mo · commit 9c5f951db42a · 2025-02-14T16:38:23.000Z
diff --git a/velox/docs/functions/spark/aggregate.rst b/velox/docs/functions/spark/aggregate.rst
@@ -61,7 +61,7 @@ General Aggregate Functions
 .. spark:function:: collect_set(x) -> array<[same as x]>
 
     Returns an array consisting of all unique values from the input ``x`` elements excluding NULLs.
-    Returns empty array if input is empty or all NULL.
+    NaN values are treated as distinct from one another. Returns empty array if input is empty or all NULL.
 
     Example::
 
diff --git a/velox/exec/SetAccumulator.h b/velox/exec/SetAccumulator.h
@@ -523,4 +523,9 @@ template <typename T>
 using SetAccumulator =
     typename detail::SetAccumulatorTypeTraits<T>::AccumulatorType;
 
+/// Alias of detail::SetAccumulator<T> for floating point types that is not
+/// NaN-aware: NaNs are treated as distinct values.
+template <typename T>
+using FloatSetAccumulatorNaNUnaware = typename detail::SetAccumulator<T>;
+
 } // namespace facebook::velox::aggregate::prestosql
diff --git a/velox/functions/sparksql/aggregates/CollectSetAggregate.cpp b/velox/functions/sparksql/aggregates/CollectSetAggregate.cpp
@@ -24,6 +24,14 @@ namespace {
 template <typename T>
 using SparkSetAggAggregate = SetAggAggregate<T, true, false>;
 
+// NaN inputs are treated as distinct values.
+template <typename T>
+using FloatSetAggAggregateNaNUnaware = SetAggAggregate<
+    T,
+    true,
+    false,
+    velox::aggregate::prestosql::FloatSetAccumulatorNaNUnaware<T>>;
+
 } // namespace
 
 void registerCollectSetAggAggregate(
@@ -72,9 +80,11 @@ void registerCollectSetAggAggregate(
                 "Non-decimal use of HUGEINT is not supported");
             return std::make_unique<SparkSetAggAggregate<int128_t>>(resultType);
           case TypeKind::REAL:
-            return std::make_unique<SparkSetAggAggregate<float>>(resultType);
+            return std::make_unique<FloatSetAggAggregateNaNUnaware<float>>(
+                resultType);
           case TypeKind::DOUBLE:
-            return std::make_unique<SparkSetAggAggregate<double>>(resultType);
+            return std::make_unique<FloatSetAggAggregateNaNUnaware<double>>(
+                resultType);
           case TypeKind::TIMESTAMP:
             return std::make_unique<SparkSetAggAggregate<Timestamp>>(
                 resultType);
diff --git a/velox/functions/sparksql/aggregates/tests/CollectSetAggregateTest.cpp b/velox/functions/sparksql/aggregates/tests/CollectSetAggregateTest.cpp
@@ -70,6 +70,7 @@ TEST_F(CollectSetAggregateTest, global) {
   testAggregations(
       {data}, {}, {"collect_set(c0)"}, {"spark_array_sort(a0)"}, {expected});
 
+  // NaN inputs are treated as distinct values.
   data = makeRowVector({
       makeFlatVector<double>(
           {1,
@@ -80,7 +81,10 @@ TEST_F(CollectSetAggregateTest, global) {
 
   expected = makeRowVector({
       makeArrayVector<double>({
-          {1, std::numeric_limits<double>::quiet_NaN()},
+          {1,
+           std::numeric_limits<double>::quiet_NaN(),
+           std::nan("1"),
+           std::nan("2")},
       }),
   });