Skip to content

Commit 84889c6

Browse files
authored
Add Unique by Key Implementation for c.parallel (NVIDIA#3947)
* Move nominal_4b_items_to_items to separate file for reuse * Use correct offset type for merge sort and unique by key
1 parent 80f98b7 commit 84889c6

15 files changed

+1255
-320
lines changed
+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA Core Compute Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#pragma once
12+
13+
#ifndef CCCL_C_EXPERIMENTAL
14+
# error "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this notice."
15+
#endif // !CCCL_C_EXPERIMENTAL
16+
17+
#include <cuda.h>
18+
19+
#include <cccl/c/extern_c.h>
20+
#include <cccl/c/types.h>
21+
22+
CCCL_C_EXTERN_C_BEGIN
23+
24+
typedef struct cccl_device_unique_by_key_build_result_t
25+
{
26+
int cc;
27+
void* cubin;
28+
size_t cubin_size;
29+
CUlibrary library;
30+
CUkernel compact_init_kernel;
31+
CUkernel sweep_kernel;
32+
size_t description_bytes_per_tile;
33+
size_t payload_bytes_per_tile;
34+
} cccl_device_unique_by_key_build_result_t;
35+
36+
CCCL_C_API CUresult cccl_device_unique_by_key_build(
37+
cccl_device_unique_by_key_build_result_t* build,
38+
cccl_iterator_t d_keys_in,
39+
cccl_iterator_t d_values_in,
40+
cccl_iterator_t d_keys_out,
41+
cccl_iterator_t d_values_out,
42+
cccl_iterator_t d_num_selected_out,
43+
cccl_op_t op,
44+
int cc_major,
45+
int cc_minor,
46+
const char* cub_path,
47+
const char* thrust_path,
48+
const char* libcudacxx_path,
49+
const char* ctk_path) noexcept;
50+
51+
CCCL_C_API CUresult cccl_device_unique_by_key(
52+
cccl_device_unique_by_key_build_result_t build,
53+
void* d_temp_storage,
54+
size_t* temp_storage_bytes,
55+
cccl_iterator_t d_keys_in,
56+
cccl_iterator_t d_values_in,
57+
cccl_iterator_t d_keys_out,
58+
cccl_iterator_t d_values_out,
59+
cccl_iterator_t d_num_selected_out,
60+
cccl_op_t op,
61+
unsigned long long num_items,
62+
CUstream stream) noexcept;
63+
64+
CCCL_C_API CUresult cccl_device_unique_by_key_cleanup(cccl_device_unique_by_key_build_result_t* bld_ptr) noexcept;
65+
66+
CCCL_C_EXTERN_C_END

c/parallel/src/kernels/iterators.cpp

+9-9
Original file line numberDiff line numberDiff line change
@@ -97,28 +97,28 @@ std::string make_kernel_output_iterator(
9797
const std::string iter_def = std::format(R"XXX(
9898
extern "C" __device__ void DEREF(const void *self_ptr, VALUE_T x);
9999
extern "C" __device__ void ADVANCE(void *self_ptr, DIFF_T offset);
100-
struct __align__(OP_ALIGNMENT) output_iterator_state_t {{
100+
struct __align__(OP_ALIGNMENT) {0}_state_t {{
101101
char data[OP_SIZE];
102102
}};
103-
struct output_iterator_proxy_t {{
104-
__device__ output_iterator_proxy_t operator=(VALUE_T x) {{
103+
struct {0}_proxy_t {{
104+
__device__ {0}_proxy_t operator=(VALUE_T x) {{
105105
DEREF(&state, x);
106106
return *this;
107107
}}
108-
output_iterator_state_t state;
108+
{0}_state_t state;
109109
}};
110110
struct {0} {{
111111
using iterator_category = cuda::std::random_access_iterator_tag;
112112
using difference_type = DIFF_T;
113113
using value_type = void;
114-
using pointer = output_iterator_proxy_t*;
115-
using reference = output_iterator_proxy_t;
116-
__device__ output_iterator_proxy_t operator*() const {{ return {{state}}; }}
114+
using pointer = {0}_proxy_t*;
115+
using reference = {0}_proxy_t;
116+
__device__ {0}_proxy_t operator*() const {{ return {{state}}; }}
117117
__device__ {0}& operator+=(difference_type diff) {{
118118
ADVANCE(&state, diff);
119119
return *this;
120120
}}
121-
__device__ output_iterator_proxy_t operator[](difference_type diff) const {{
121+
__device__ {0}_proxy_t operator[](difference_type diff) const {{
122122
{0} result = *this;
123123
result += diff;
124124
return {{ result.state }};
@@ -128,7 +128,7 @@ struct {0} {{
128128
result += diff;
129129
return result;
130130
}}
131-
output_iterator_state_t state;
131+
{0}_state_t state;
132132
}};
133133
)XXX",
134134
iterator_name);

c/parallel/src/merge_sort.cu

+5-9
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,16 @@
1818
#include "kernels/operators.h"
1919
#include "util/context.h"
2020
#include "util/indirect_arg.h"
21+
#include "util/tuning.h"
2122
#include "util/types.h"
2223
#include <cccl/c/merge_sort.h>
2324
#include <nvrtc/command_list.h>
2425
#include <nvrtc/ltoir_list_appender.h>
2526

2627
struct op_wrapper;
2728
struct device_merge_sort_policy;
28-
using OffsetT = int64_t;
29-
static_assert(std::is_same_v<cub::detail::choose_signed_offset_t<OffsetT>, OffsetT>, "OffsetT must be int64");
29+
using OffsetT = unsigned long long;
30+
static_assert(std::is_same_v<cub::detail::choose_offset_t<OffsetT>, OffsetT>, "OffsetT must be unsigned long long");
3031

3132
struct input_keys_iterator_state_t;
3233
struct input_items_iterator_state_t;
@@ -116,11 +117,6 @@ std::string get_iterator_name(cccl_iterator_t iterator, merge_sort_iterator_t wh
116117
}
117118
}
118119

119-
int nominal_4b_items_to_items(int nominal_4b_items_per_thread, int key_size)
120-
{
121-
return std::min(nominal_4b_items_per_thread, std::max(1, nominal_4b_items_per_thread * 4 / key_size));
122-
}
123-
124120
merge_sort_runtime_tuning_policy get_policy(int cc, int key_size)
125121
{
126122
merge_sort_tuning_t chain[] = {
@@ -292,7 +288,7 @@ CUresult cccl_device_merge_sort_build(
292288
const auto input_items_it_value_t = cccl_type_enum_to_name(input_items_it.value_type.type);
293289
const auto output_keys_it_value_t = cccl_type_enum_to_name(output_keys_it.value_type.type);
294290
const auto output_items_it_value_t = cccl_type_enum_to_name(output_items_it.value_type.type);
295-
const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_INT64);
291+
const auto offset_t = cccl_type_enum_to_name(cccl_type_enum::CCCL_UINT64);
296292

297293
const std::string input_keys_iterator_src = make_kernel_input_iterator(
298294
offset_t,
@@ -461,7 +457,7 @@ CUresult cccl_device_merge_sort(
461457
indirect_arg_t,
462458
indirect_arg_t,
463459
indirect_arg_t,
464-
::cuda::std::size_t,
460+
OffsetT,
465461
indirect_arg_t,
466462
merge_sort::dynamic_merge_sort_policy_t<&merge_sort::get_policy>,
467463
merge_sort::merge_sort_kernel_source,

c/parallel/src/scan.cu

+3-107
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
//
99
//===----------------------------------------------------------------------===//
1010

11-
#include <cub/agent/single_pass_scan_operators.cuh>
1211
#include <cub/detail/choose_offset.cuh>
1312
#include <cub/detail/launcher/cuda_driver.cuh>
1413
#include <cub/device/dispatch/dispatch_scan.cuh>
@@ -20,7 +19,6 @@
2019
#include <format>
2120
#include <iostream>
2221
#include <optional>
23-
#include <regex>
2422
#include <string>
2523
#include <type_traits>
2624

@@ -30,6 +28,7 @@
3028
#include "util/context.h"
3129
#include "util/errors.h"
3230
#include "util/indirect_arg.h"
31+
#include "util/scan_tile_state.h"
3332
#include "util/types.h"
3433
#include <cccl/c/scan.h>
3534
#include <nvrtc.h>
@@ -172,74 +171,6 @@ std::string get_scan_kernel_name(cccl_iterator_t input_it, cccl_iterator_t outpu
172171
init_t); // 9
173172
}
174173

175-
// TODO: NVRTC doesn't currently support extracting basic type
176-
// information (e.g., type sizes and alignments) from compiled
177-
// LTO-IR. So we separately compile a small PTX file that defines the
178-
// necessary types and constants and grep it for the required
179-
// information. If/when NVRTC adds these features, we can remove this
180-
// extra compilation step and get the information directly from the
181-
// LTO-IR.
182-
static constexpr auto ptx_u64_assignment_regex = R"(\.visible\s+\.global\s+\.align\s+\d+\s+\.u64\s+{}\s*=\s*(\d+);)";
183-
184-
std::optional<size_t> find_size_t(char* ptx, std::string_view name)
185-
{
186-
std::regex regex(std::format(ptx_u64_assignment_regex, name));
187-
std::cmatch match;
188-
if (std::regex_search(ptx, match, regex))
189-
{
190-
auto result = std::stoi(match[1].str());
191-
return result;
192-
}
193-
return std::nullopt;
194-
}
195-
196-
struct scan_tile_state
197-
{
198-
// scan_tile_state implements the same (host) interface as cub::ScanTileStateT, except
199-
// that it accepts the acummulator type as a runtime parameter rather than being
200-
// templated on it.
201-
//
202-
// Both specializations ScanTileStateT<T, true> and ScanTileStateT<T, false> - where the
203-
// bool parameter indicates whether `T` is primitive - are combined into a single type.
204-
205-
void* d_tile_status; // d_tile_descriptors
206-
void* d_tile_partial;
207-
void* d_tile_inclusive;
208-
209-
size_t description_bytes_per_tile;
210-
size_t payload_bytes_per_tile;
211-
212-
scan_tile_state(size_t description_bytes_per_tile, size_t payload_bytes_per_tile)
213-
: d_tile_status(nullptr)
214-
, d_tile_partial(nullptr)
215-
, d_tile_inclusive(nullptr)
216-
, description_bytes_per_tile(description_bytes_per_tile)
217-
, payload_bytes_per_tile(payload_bytes_per_tile)
218-
{}
219-
220-
cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes)
221-
{
222-
void* allocations[3] = {};
223-
auto status = cub::detail::tile_state_init(
224-
description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations);
225-
if (status != cudaSuccess)
226-
{
227-
return status;
228-
}
229-
d_tile_status = allocations[0];
230-
d_tile_partial = allocations[1];
231-
d_tile_inclusive = allocations[2];
232-
return cudaSuccess;
233-
}
234-
235-
cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes) const
236-
{
237-
temp_storage_bytes =
238-
cub::detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
239-
return cudaSuccess;
240-
}
241-
};
242-
243174
template <auto* GetPolicy>
244175
struct dynamic_scan_policy_t
245176
{
@@ -392,43 +323,8 @@ struct device_scan_policy {{
392323
check(cuLibraryGetKernel(&build_ptr->init_kernel, build_ptr->library, init_kernel_lowered_name.c_str()));
393324
check(cuLibraryGetKernel(&build_ptr->scan_kernel, build_ptr->library, scan_kernel_lowered_name.c_str()));
394325

395-
constexpr size_t num_ptx_args = 7;
396-
const char* ptx_args[num_ptx_args] = {
397-
arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
398-
constexpr size_t num_ptx_lto_args = 3;
399-
const char* ptx_lopts[num_ptx_lto_args] = {"-lto", arch.c_str(), "-ptx"};
400-
401-
constexpr std::string_view ptx_src_template = R"XXX(
402-
#include <cub/agent/single_pass_scan_operators.cuh>
403-
#include <cub/util_type.cuh>
404-
struct __align__({1}) storage_t {{
405-
char data[{0}];
406-
}};
407-
__device__ size_t description_bytes_per_tile = cub::ScanTileState<{2}>::description_bytes_per_tile;
408-
__device__ size_t payload_bytes_per_tile = cub::ScanTileState<{2}>::payload_bytes_per_tile;
409-
)XXX";
410-
411-
const std::string ptx_src = std::format(ptx_src_template, accum_t.size, accum_t.alignment, accum_cpp);
412-
auto compile_result =
413-
make_nvrtc_command_list()
414-
.add_program(nvrtc_translation_unit{ptx_src.c_str(), "tile_state_info"})
415-
.compile_program({ptx_args, num_ptx_args})
416-
.cleanup_program()
417-
.finalize_program(num_ptx_lto_args, ptx_lopts);
418-
auto ptx_code = compile_result.data.get();
419-
420-
size_t description_bytes_per_tile;
421-
size_t payload_bytes_per_tile;
422-
auto maybe_description_bytes_per_tile = scan::find_size_t(ptx_code, "description_bytes_per_tile");
423-
if (maybe_description_bytes_per_tile)
424-
{
425-
description_bytes_per_tile = maybe_description_bytes_per_tile.value();
426-
}
427-
else
428-
{
429-
throw std::runtime_error("Failed to find description_bytes_per_tile in PTX");
430-
}
431-
payload_bytes_per_tile = scan::find_size_t(ptx_code, "payload_bytes_per_tile").value_or(0);
326+
auto [description_bytes_per_tile,
327+
payload_bytes_per_tile] = get_tile_state_bytes_per_tile(accum_t, accum_cpp, args, num_args, arch);
432328

433329
build_ptr->cc = cc;
434330
build_ptr->cubin = (void*) result.data.release();

0 commit comments

Comments
 (0)