fbusato
diff --git a/‎c/parallel/include/cccl/c/unique_by_key.h
+66 b/‎c/parallel/include/cccl/c/unique_by_key.h
+66
diff --git a/‎c/parallel/src/kernels/iterators.cpp
+9-9 b/‎c/parallel/src/kernels/iterators.cpp
+9-9
diff --git a/‎c/parallel/src/merge_sort.cu
+5-9 b/‎c/parallel/src/merge_sort.cu
+5-9
diff --git a/‎c/parallel/src/scan.cu
+3-107 b/‎c/parallel/src/scan.cu
+3-107
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA Core Compute Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#ifndef CCCL_C_EXPERIMENTAL
+#  error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
+#endif // !CCCL_C_EXPERIMENTAL
+
+#include <cuda.h>
+
+#include <cccl/c/extern_c.h>
+#include <cccl/c/types.h>
+
+CCCL_C_EXTERN_C_BEGIN
+
+typedef struct cccl_device_unique_by_key_build_result_t // State shared by the unique_by_key build, run, and cleanup entry points declared in this header.
+{
+  int cc; // Compute capability value; presumably derived from the build call's cc_major/cc_minor — TODO confirm encoding.
+  void* cubin; // Untyped pointer to the compiled device image; ownership presumably held until cleanup — confirm in the .cu.
+  size_t cubin_size; // Size of `cubin`; presumably in bytes — confirm.
+  CUlibrary library; // CUDA driver library handle; presumably loaded from `cubin` — confirm.
+  CUkernel compact_init_kernel; // CUDA driver kernel handle; presumably the tile-state initialization kernel — confirm.
+  CUkernel sweep_kernel; // CUDA driver kernel handle; presumably the main selection kernel — confirm.
+  size_t description_bytes_per_tile; // NOTE(review): looks like per-tile descriptor sizing for the tile state — confirm.
+  size_t payload_bytes_per_tile; // NOTE(review): looks like per-tile payload sizing for the tile state — confirm.
+} cccl_device_unique_by_key_build_result_t;
+
+CCCL_C_API CUresult cccl_device_unique_by_key_build( // Build entry point; reports status as a CUDA driver CUresult and never throws (noexcept).
+  cccl_device_unique_by_key_build_result_t* build, // Passed by pointer; presumably the output populated by this call — confirm.
+  cccl_iterator_t d_keys_in, // Iterator descriptor for the input keys (per the name).
+  cccl_iterator_t d_values_in, // Iterator descriptor for the input values (per the name).
+  cccl_iterator_t d_keys_out, // Iterator descriptor for the output keys (per the name).
+  cccl_iterator_t d_values_out, // Iterator descriptor for the output values (per the name).
+  cccl_iterator_t d_num_selected_out, // Iterator descriptor for the selected-count output (per the name).
+  cccl_op_t op, // Operator descriptor; presumably the key-equality predicate — TODO confirm.
+  int cc_major, // Presumably the target compute capability major version — confirm.
+  int cc_minor, // Presumably the target compute capability minor version — confirm.
+  const char* cub_path, // NOTE(review): presumably a compiler argument locating CUB headers — confirm expected format.
+  const char* thrust_path, // NOTE(review): presumably a compiler argument locating Thrust headers — confirm expected format.
+  const char* libcudacxx_path, // NOTE(review): presumably a compiler argument locating libcudacxx headers — confirm expected format.
+  const char* ctk_path) noexcept; // NOTE(review): presumably a compiler argument locating CUDA Toolkit headers — confirm expected format.
+
+CCCL_C_API CUresult cccl_device_unique_by_key( // Run entry point; reports status as a CUDA driver CUresult and never throws (noexcept).
+  cccl_device_unique_by_key_build_result_t build, // Build result, taken by value (the build call takes it by pointer).
+  void* d_temp_storage, // Temporary storage pointer; presumably follows CUB's two-phase size-query convention — TODO confirm.
+  size_t* temp_storage_bytes, // In/out size of the temporary storage; presumably written when querying the required size — confirm.
+  cccl_iterator_t d_keys_in, // Iterator descriptor for the input keys (per the name).
+  cccl_iterator_t d_values_in, // Iterator descriptor for the input values (per the name).
+  cccl_iterator_t d_keys_out, // Iterator descriptor for the output keys (per the name).
+  cccl_iterator_t d_values_out, // Iterator descriptor for the output values (per the name).
+  cccl_iterator_t d_num_selected_out, // Iterator descriptor for the selected-count output (per the name).
+  cccl_op_t op, // Operator descriptor; presumably must match the one given at build time — confirm.
+  unsigned long long num_items, // Item count as an unsigned 64-bit-capable integer; presumably the number of input key/value pairs — confirm.
+  CUstream stream) noexcept; // CUDA driver stream handle; presumably the stream the work is launched on — confirm.
+
+CCCL_C_API CUresult cccl_device_unique_by_key_cleanup(cccl_device_unique_by_key_build_result_t* bld_ptr) noexcept; // Cleanup entry point; presumably releases resources held by *bld_ptr — TODO confirm which ones in the .cu.
+
+CCCL_C_EXTERN_C_END
@@ -97,28 +97,28 @@ std::string make_kernel_output_iterator(
   const std::string iter_def = std::format(R"XXX(
 extern "C" __device__ void DEREF(const void *self_ptr, VALUE_T x);
 extern "C" __device__ void ADVANCE(void *self_ptr, DIFF_T offset);
-struct __align__(OP_ALIGNMENT) output_iterator_state_t {{
+struct __align__(OP_ALIGNMENT) {0}_state_t {{
   char data[OP_SIZE];
 }};
-struct output_iterator_proxy_t {{
-  __device__ output_iterator_proxy_t operator=(VALUE_T x) {{
+struct {0}_proxy_t {{
+  __device__ {0}_proxy_t operator=(VALUE_T x) {{
     DEREF(&state, x);
     return *this;
   }}
-  output_iterator_state_t state;
+  {0}_state_t state;
 }};
 struct {0} {{
   using iterator_category = cuda::std::random_access_iterator_tag;
   using difference_type   = DIFF_T;
   using value_type        = void;
-  using pointer           = output_iterator_proxy_t*;
-  using reference         = output_iterator_proxy_t;
-  __device__ output_iterator_proxy_t operator*() const {{ return {{state}}; }}
+  using pointer           = {0}_proxy_t*;
+  using reference         = {0}_proxy_t;
+  __device__ {0}_proxy_t operator*() const {{ return {{state}}; }}
   __device__ {0}& operator+=(difference_type diff) {{
       ADVANCE(&state, diff);
       return *this;
   }}
-  __device__ output_iterator_proxy_t operator[](difference_type diff) const {{
+  __device__ {0}_proxy_t operator[](difference_type diff) const {{
     {0} result = *this;
     result += diff;
     return {{ result.state }};
@@ -128,7 +128,7 @@ struct {0} {{
     result += diff;
     return result;
   }}
-  output_iterator_state_t state;
+  {0}_state_t state;
 }};
 )XXX",
                                            iterator_name);
 
@@ -18,15 +18,16 @@
 #include "kernels/operators.h"
 #include "util/context.h"
 #include "util/indirect_arg.h"
+#include "util/tuning.h"
 #include "util/types.h"
 #include <cccl/c/merge_sort.h>
 #include <nvrtc/command_list.h>
 #include <nvrtc/ltoir_list_appender.h>
 
 struct op_wrapper;
 struct device_merge_sort_policy;
-using OffsetT = int64_t;
-static_assert(std::is_same_v<cub::detail::choose_signed_offset_t<OffsetT>, OffsetT>, "OffsetT must be int64");
+using OffsetT = unsigned long long;
+static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "OffsetT must be unsigned long long");
 
 struct input_keys_iterator_state_t;
 struct input_items_iterator_state_t;
@@ -116,11 +117,6 @@ std::string get_iterator_name(cccl_iterator_t iterator, merge_sort_iterator_t wh
   }
 }
 
-int nominal_4b_items_to_items(int nominal_4b_items_per_thread, int key_size)
-{
-  return std::min(nominal_4b_items_per_thread, std::max(1, nominal_4b_items_per_thread * 4 / key_size));
-}
-
 merge_sort_runtime_tuning_policy get_policy(int cc, int key_size)
 {
   merge_sort_tuning_t chain[] = {
@@ -292,7 +288,7 @@ CUresult cccl_device_merge_sort_build(
     const auto input_items_it_value_t  = cccl_type_enum_to_name(input_items_it.value_type.type);
     const auto output_keys_it_value_t  = cccl_type_enum_to_name(output_keys_it.value_type.type);
     const auto output_items_it_value_t = cccl_type_enum_to_name(output_items_it.value_type.type);
-    const auto offset_t                = cccl_type_enum_to_name(cccl_type_enum::CCCL_INT64);
+    const auto offset_t                = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64);
 
     const std::string input_keys_iterator_src = make_kernel_input_iterator(
       offset_t,
@@ -461,7 +457,7 @@ CUresult cccl_device_merge_sort(
       indirect_arg_t,
       indirect_arg_t,
       indirect_arg_t,
-      ::cuda::std::size_t,
+      OffsetT,
       indirect_arg_t,
       merge_sort::dynamic_merge_sort_policy_t<&merge_sort::get_policy>,
       merge_sort::merge_sort_kernel_source,
 
@@ -8,7 +8,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cub/agent/single_pass_scan_operators.cuh>
 #include <cub/detail/choose_offset.cuh>
 #include <cub/detail/launcher/cuda_driver.cuh>
 #include <cub/device/dispatch/dispatch_scan.cuh>
@@ -20,7 +19,6 @@
 #include <format>
 #include <iostream>
 #include <optional>
-#include <regex>
 #include <string>
 #include <type_traits>
 
@@ -30,6 +28,7 @@
 #include "util/context.h"
 #include "util/errors.h"
 #include "util/indirect_arg.h"
+#include "util/scan_tile_state.h"
 #include "util/types.h"
 #include <cccl/c/scan.h>
 #include <nvrtc.h>
@@ -172,74 +171,6 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu
     init_t); // 9
 }
 
-// TODO: NVRTC doesn't currently support extracting basic type
-// information (e.g., type sizes and alignments) from compiled
-// LTO-IR. So we separately compile a small PTX file that defines the
-// necessary types and constants and grep it for the required
-// information. If/when NVRTC adds these features, we can remove this
-// extra compilation step and get the information directly from the
-// LTO-IR.
-static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)";
-
-std::optional<size_t> find_size_t(char* ptx, std::string_view name)
-{
-  std::regex regex(std::format(ptx_u64_assignment_regex, name));
-  std::cmatch match;
-  if (std::regex_search(ptx, match, regex))
-  {
-    auto result = std::stoi(match[1].str());
-    return result;
-  }
-  return std::nullopt;
-}
-
-struct scan_tile_state
-{
-  // scan_tile_state implements the same (host) interface as cub::ScanTileStateT, except
-  // that it accepts the acummulator type as a runtime parameter rather than being
-  // templated on it.
-  //
-  // Both specializations ScanTileStateT<T, true> and ScanTileStateT<T, false> - where the
-  // bool parameter indicates whether `T` is primitive - are combined into a single type.
-
-  void* d_tile_status; // d_tile_descriptors
-  void* d_tile_partial;
-  void* d_tile_inclusive;
-
-  size_t description_bytes_per_tile;
-  size_t payload_bytes_per_tile;
-
-  scan_tile_state(size_t description_bytes_per_tile, size_t payload_bytes_per_tile)
-      : d_tile_status(nullptr)
-      , d_tile_partial(nullptr)
-      , d_tile_inclusive(nullptr)
-      , description_bytes_per_tile(description_bytes_per_tile)
-      , payload_bytes_per_tile(payload_bytes_per_tile)
-  {}
-
-  cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes)
-  {
-    void* allocations[3] = {};
-    auto status          = cub::detail::tile_state_init(
-      description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations);
-    if (status != cudaSuccess)
-    {
-      return status;
-    }
-    d_tile_status    = allocations[0];
-    d_tile_partial   = allocations[1];
-    d_tile_inclusive = allocations[2];
-    return cudaSuccess;
-  }
-
-  cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const
-  {
-    temp_storage_bytes =
-      cub::detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
-    return cudaSuccess;
-  }
-};
-
 template <auto* GetPolicy>
 struct dynamic_scan_policy_t
 {
@@ -392,43 +323,8 @@ struct device_scan_policy {{
     check(cuLibraryGetKernel(&build_ptr->init_kernel, build_ptr->library, init_kernel_lowered_name.c_str()));
     check(cuLibraryGetKernel(&build_ptr->scan_kernel, build_ptr->library, scan_kernel_lowered_name.c_str()));
 
-    constexpr size_t num_ptx_args      = 7;
-    const char* ptx_args[num_ptx_args] = {
-      arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
-    constexpr size_t num_ptx_lto_args       = 3;
-    const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"};
-
-    constexpr std::string_view ptx_src_template = R"XXX(
-#include <cub/agent/single_pass_scan_operators.cuh>
-#include <cub/util_type.cuh>
-struct __align__({1}) storage_t {{
-   char data[{0}];
-}};
-__device__ size_t description_bytes_per_tile = cub::ScanTileState<{2}>::description_bytes_per_tile;
-__device__ size_t payload_bytes_per_tile = cub::ScanTileState<{2}>::payload_bytes_per_tile;
-)XXX";
-
-    const std::string ptx_src = std::format(ptx_src_template, accum_t.size, accum_t.alignment, accum_cpp);
-    auto compile_result =
-      make_nvrtc_command_list()
-        .add_program(nvrtc_translation_unit{ptx_src.c_str(), "tile_state_info"})
-        .compile_program({ptx_args, num_ptx_args})
-        .cleanup_program()
-        .finalize_program(num_ptx_lto_args, ptx_lopts);
-    auto ptx_code = compile_result.data.get();
-
-    size_t description_bytes_per_tile;
-    size_t payload_bytes_per_tile;
-    auto maybe_description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile");
-    if (maybe_description_bytes_per_tile)
-    {
-      description_bytes_per_tile = maybe_description_bytes_per_tile.value();
-    }
-    else
-    {
-      throw std::runtime_error("Failed to find description_bytes_per_tile in PTX");
-    }
-    payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile").value_or(0);
+    auto [description_bytes_per_tile,
+          payload_bytes_per_tile] = get_tile_state_bytes_per_tile(accum_t, accum_cpp, args, num_args, arch);
 
     build_ptr->cc                         = cc;
     build_ptr->cubin                      = (void*) result.data.release();