build: use breeze BlockRadixSort instead of cub::BlockRadixSort

David Reveman · David Reveman · commit 4bf8ffacbdc3 · 2025-03-05T14:56:57.000-05:00
Also switches to a warp-striped arrangement, which is more efficient
for radix sort and is required by breeze.
diff --git a/velox/experimental/wave/common/Block.cuh b/velox/experimental/wave/common/Block.cuh
@@ -18,11 +18,11 @@
 
 #include <breeze/functions/reduce.h>
 #include <breeze/functions/scan.h>
+#include <breeze/functions/sort.h>
 #include <breeze/functions/store.h>
 #include <breeze/platforms/platform.h>
 #include <breeze/utils/types.h>
 #include <breeze/platforms/cuda.cuh>
-#include <cub/block/block_radix_sort.cuh>
 #include "velox/experimental/wave/common/CudaUtil.cuh"
 
 /// Utilities for  booleans and indices and thread blocks.
@@ -137,8 +137,12 @@ template <
     int32_t kItemsPerThread,
     typename Key,
     typename Value>
-using RadixSort =
-    typename cub::BlockRadixSort<Key, kBlockSize, kItemsPerThread, Value>;
+using RadixSort = typename breeze::functions::BlockRadixSort<
+    CudaPlatform<kBlockSize, kWarpThreads>,
+    kItemsPerThread,
+    /*RADIX_BITS=*/8,
+    Key,
+    Value>;
 
 template <
     int32_t kBlockSize,
@@ -147,7 +151,7 @@ template <
     typename Value>
 inline int32_t __host__ __device__ blockSortSharedSize() {
   return sizeof(
-      typename RadixSort<kBlockSize, kItemsPerThread, Key, Value>::TempStorage);
+      typename RadixSort<kBlockSize, kItemsPerThread, Key, Value>::Scratch);
 }
 
 template <
@@ -165,7 +169,9 @@ void __device__ blockSort(
     char* smem) {
   using namespace breeze::functions;
   using namespace breeze::utils;
-  using Sort = cub::BlockRadixSort<Key, kBlockSize, kItemsPerThread, Value>;
+
+  CudaPlatform<kBlockSize, kWarpThreads> p;
+  using RadixSortT = RadixSort<kBlockSize, kItemsPerThread, Key, Value>;
 
   // Per-thread tile items
   Key keys[kItemsPerThread];
@@ -174,28 +180,37 @@ void __device__ blockSort(
   // Our current block's offset
   int blockOffset = 0;
 
-  // Load items into a blocked arrangement
+  constexpr int32_t kWarpItems = kWarpThreads * kItemsPerThread;
+  static_assert(
+      (kBlockSize % kWarpThreads) == 0,
+      "kBlockSize must be a multiple of kWarpThreads");
+
+  // Load items into a warp-striped arrangement
+  int32_t threadOffset = p.warp_idx() * kWarpItems + p.lane_idx();
   for (auto i = 0; i < kItemsPerThread; ++i) {
-    int32_t idx = blockOffset + i * kBlockSize + threadIdx.x;
+    int32_t idx = blockOffset + threadOffset + i * kWarpThreads;
     values[i] = valueGetter(idx);
     keys[i] = keyGetter(idx);
   }
 
   __syncthreads();
-  auto* temp_storage = reinterpret_cast<typename Sort::TempStorage*>(smem);
+  auto* temp_storage = reinterpret_cast<typename RadixSortT::Scratch*>(smem);
 
-  Sort(*temp_storage).SortBlockedToStriped(keys, values);
+  RadixSortT::Sort(
+      p,
+      make_slice<THREAD, WARP_STRIPED>(keys),
+      make_slice<THREAD, WARP_STRIPED>(values),
+      make_slice(temp_storage).template reinterpret<SHARED>());
 
-  // Store a striped arrangement of output across the thread block into a linear
-  // segment of items
-  CudaPlatform<kBlockSize, kWarpThreads> p;
+  // Store a warp-striped arrangement of output across the thread block into a
+  // linear segment of items
   BlockStore<kBlockSize, kItemsPerThread>(
       p,
-      make_slice<THREAD, STRIPED>(values),
+      make_slice<THREAD, WARP_STRIPED>(values),
       make_slice<GLOBAL>(valueOut + blockOffset));
   BlockStore<kBlockSize, kItemsPerThread>(
       p,
-      make_slice<THREAD, STRIPED>(keys),
+      make_slice<THREAD, WARP_STRIPED>(keys),
       make_slice<GLOBAL>(keyOut + blockOffset));
   __syncthreads();
 }
diff --git a/velox/experimental/wave/common/tests/BlockTest.cu b/velox/experimental/wave/common/tests/BlockTest.cu
@@ -187,10 +187,8 @@ void __global__ __launch_bounds__(1024)
     testSortNoShared(uint16_t** keys, uint16_t** values, char* smem) {
   auto keyBase = keys[blockIdx.x];
   auto valueBase = values[blockIdx.x];
-  char* tbTemp = smem +
-      blockIdx.x *
-          sizeof(typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::
-                     TempStorage);
+  char* tbTemp =
+      smem + blockIdx.x * blockSortSharedSize<256, 32, uint16_t, uint16_t>();
 
   blockSort<256, 32>(
       [&](auto i) { return keyBase[i]; },
@@ -202,16 +200,14 @@ void __global__ __launch_bounds__(1024)
 }
 
 int32_t BlockTestStream::sort16SharedSize() {
-  return sizeof(
-      typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::TempStorage);
+  return blockSortSharedSize<256, 32, uint16_t, uint16_t>();
 }
 
 void BlockTestStream::testSort16(
     int32_t numBlocks,
     uint16_t** keys,
     uint16_t** values) {
-  auto tempBytes = sizeof(
-      typename cub::BlockRadixSort<uint16_t, 256, 32, uint16_t>::TempStorage);
+  auto tempBytes = blockSortSharedSize<256, 32, uint16_t, uint16_t>();
 
   testSort<<<numBlocks, 256, tempBytes, stream_->stream>>>(keys, values);
 }