
Commit 16ac5be

[STF] Rename the redux access mode into relaxed (NVIDIA#2776)
* Rename the redux access mode into relaxed
* Change redux to relaxed in the documentation
* clang-format
1 parent: 4416e8a

19 files changed (+48, -47)

cudax/include/cuda/experimental/__stf/internal/acquire_release.cuh (+3, -3)

@@ -137,9 +137,9 @@ inline event_list task::acquire(backend_ctx_untyped& ctx)
     const data_place& dplace = it->get_dplace() == data_place::affine ? get_affine_data_place() : it->get_dplace();

     const instance_id_t instance_id =
-      mode == access_mode::redux ? d.find_unused_instance_id(dplace) : d.find_instance_id(dplace);
+      mode == access_mode::relaxed ? d.find_unused_instance_id(dplace) : d.find_instance_id(dplace);

-    if (mode == access_mode::redux)
+    if (mode == access_mode::relaxed)
     {
       d.get_data_instance(instance_id).set_redux_op(it->get_redux_op());
     }
@@ -187,7 +187,7 @@ inline event_list task::acquire(backend_ctx_untyped& ctx)
   {
     logical_data_untyped d = e.get_data();

-    if (e.get_access_mode() == access_mode::redux)
+    if (e.get_access_mode() == access_mode::relaxed)
     {
       // Save the last task accessing the instance in with a relaxed coherency mode
       d.get_data_instance(e.get_instance_id()).set_last_task_relaxed(*this);

cudax/include/cuda/experimental/__stf/internal/constants.cuh (+9, -9)

@@ -35,11 +35,11 @@ namespace cuda::experimental::stf
  */
 enum class access_mode : unsigned int
 {
-  none = 0,
-  read = 1,
-  write = 2,
-  rw = 3, // READ + WRITE
-  redux = 4, /* operator ? */
+  none    = 0,
+  read    = 1,
+  write   = 2,
+  rw      = 3, // READ + WRITE
+  relaxed = 4, /* operator ? */
 };

@@ -50,8 +50,8 @@ inline access_mode operator|(access_mode lhs, access_mode rhs)
 {
   assert(as_underlying(lhs) < 16);
   assert(as_underlying(rhs) < 16);
-  EXPECT(lhs != access_mode::redux);
-  EXPECT(rhs != access_mode::redux);
+  EXPECT(lhs != access_mode::relaxed);
+  EXPECT(rhs != access_mode::relaxed);
   return access_mode(as_underlying(lhs) | as_underlying(rhs));
 }

@@ -75,8 +75,8 @@ inline const char* access_mode_string(access_mode mode)
       return "rw";
     case access_mode::write:
       return "write";
-    case access_mode::redux:
-      return "redux"; // op ?
+    case access_mode::relaxed:
+      return "relaxed"; // op ?
     default:
       assert(false);
       abort();
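
For orientation, here is a minimal sketch of what the renamed enumerator means in practice. This fragment is illustrative rather than part of the commit; it assumes the public <cuda/experimental/stf.cuh> umbrella header and relies only on the operators shown in the hunks above:

  #include <cassert>
  #include <cuda/experimental/stf.cuh> // assumed public CUDASTF header

  using cuda::experimental::stf::access_mode;

  int main()
  {
    // read | write still folds into rw via the underlying bit pattern
    assert((access_mode::read | access_mode::write) == access_mode::rw);

    // access_mode::relaxed (formerly access_mode::redux) stands alone:
    // operator| EXPECT()s that neither operand is relaxed, because a
    // reduction-style access cannot be combined with other modes.
    return 0;
  }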

cudax/include/cuda/experimental/__stf/internal/logical_data.cuh (+13, -13)

@@ -242,7 +242,7 @@ public:

   reserved::logical_data_state state;

-  // For temporary or redux accesses, we need to be able to find an available entry
+  // For temporary or relaxed accesses, we need to be able to find an available entry
   ::std::vector<data_instance> used_instances;

   // A string useful for debugging purpose
@@ -917,7 +917,7 @@ public:
       break;
     }

-    case access_mode::redux:
+    case access_mode::relaxed:
       current_instance.set_msir(reserved::msir_state_id::reduction);
       break;
     default:
@@ -1238,9 +1238,9 @@ public:
     return task_dep_untyped(*this, access_mode::rw, mv(dp));
   }

-  task_dep_untyped redux(::std::shared_ptr<reduction_operator_base> op, data_place dp = data_place::affine)
+  task_dep_untyped relaxed(::std::shared_ptr<reduction_operator_base> op, data_place dp = data_place::affine)
   {
-    return task_dep_untyped(*this, access_mode::redux, mv(dp), op);
+    return task_dep_untyped(*this, access_mode::relaxed, mv(dp), op);
   }

   ///@}
@@ -1732,7 +1732,7 @@ inline void reserved::logical_data_untyped_impl::erase()
   assert(ref_id != instance_id_t::invalid);

   // Get the state in which we store previous writer, readers, ...
-  if (h_state.current_mode == access_mode::redux)
+  if (h_state.current_mode == access_mode::relaxed)
   {
     // Reconstruction of the data on the reference data place needed

@@ -1863,14 +1863,14 @@ inline event_list enforce_stf_deps_before(
   auto& dot = *bctx.get_dot();
   const bool dot_is_tracing = dot.is_tracing();

-  if (mode == access_mode::redux)
+  if (mode == access_mode::relaxed)
   {
     // A reduction only needs to wait for previous accesses on the data instance
-    ctx_.current_mode = access_mode::redux;
+    ctx_.current_mode = access_mode::relaxed;

     if (dot_is_tracing)
     {
-      // Add this task to the list of task accessing the logical data in redux mode
+      // Add this task to the list of task accessing the logical data in relaxed mode
       // We only store its id since this is used for dot
       ctx_.pending_redux_id.push_back(task.get_unique_id());
     }
@@ -1882,13 +1882,13 @@ inline event_list enforce_stf_deps_before(
   }

   // This is not a reduction, but perhaps we need to reconstruct the data first?
-  if (ctx_.current_mode == access_mode::redux)
+  if (ctx_.current_mode == access_mode::relaxed)
   {
     assert(eplace.has_value());
     if (dot_is_tracing)
     {
       // Add a dependency between previous tasks accessing the handle
-      // in redux mode, and this task which forces its
+      // in relaxed mode, and this task which forces its
       // reconstruction.
       for (const int redux_task_id : ctx_.pending_redux_id)
       {
@@ -1998,7 +1998,7 @@ inline event_list enforce_stf_deps_before(
 template <typename task_type>
 inline void enforce_stf_deps_after(logical_data_untyped& handle, const task_type& task, const access_mode mode)
 {
-  if (mode == access_mode::redux)
+  if (mode == access_mode::relaxed)
   {
     // no further action is required
     return;
@@ -2295,9 +2295,9 @@ public:
   }

   template <typename... Pack>
-  task_dep<T> redux(Pack&&... pack)
+  task_dep<T> relaxed(Pack&&... pack)
   {
-    return task_dep<T>(*this, access_mode::redux, ::std::forward<Pack>(pack)...);
+    return task_dep<T>(*this, access_mode::relaxed, ::std::forward<Pack>(pack)...);
   }
   ///@}
 };
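
The typed relaxed() overload above is what the tests later in this commit call. As a usage sketch, adapted from cudax/test/stf/reductions/redux_test.cu below (assuming a CUDASTF context ctx and the test's add kernel):

  int a = 17;
  auto handle = ctx.logical_data(make_slice(&a, 1));
  auto redux_op = std::make_shared<slice_reduction_op_sum<int>>();

  // relaxed() takes the same reduction operator the former redux() did;
  // each relaxed task accumulates into a private instance that is later
  // reconstructed with the operator.
  ctx.task(handle.relaxed(redux_op))->*[](cudaStream_t stream, auto s) {
    add<<<1, 1, 0, stream>>>(s.data_handle(), 42);
  };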

cudax/include/cuda/experimental/__stf/internal/task.cuh (+1, -1)

@@ -448,7 +448,7 @@ void dep_allocate(
   }

   // After allocating a reduction instance, we need to initialize it
-  if (mode == access_mode::redux)
+  if (mode == access_mode::relaxed)
   {
     assert(eplace.has_value());
     // We have just allocated a new piece of data to perform

cudax/test/stf/examples/07-cholesky-redux.cu (+1, -1)

@@ -263,7 +263,7 @@ void DGEMM(

   // If beta == 1.0 (we assume this is exactly 1.0), then this operation is
   // an accumulation with the add operator
-  auto dep_c = (beta == 1.0) ? C.handle(C_row, C_col).redux(redux_op) : C.handle(C_row, C_col).rw();
+  auto dep_c = (beta == 1.0) ? C.handle(C_row, C_col).relaxed(redux_op) : C.handle(C_row, C_col).rw();
   auto t = ctx.task(exec_place::device(A.get_preferred_devid(C_row, C_col)),
                     A.handle(A_row, A_col).read(),
                     B.handle(B_row, B_col).read(),

cudax/test/stf/hashtable/fusion_reduction.cu (+1, -1)

@@ -98,7 +98,7 @@ int main()

   for (size_t dev_id = 0; dev_id < 4; dev_id++)
   {
-    ctx.task(h_handle.redux(fusion_op))->*[&](auto stream, auto h) {
+    ctx.task(h_handle.relaxed(fusion_op))->*[&](auto stream, auto h) {
       EXPECT(h.get_capacity() == 2048);
       fill_table<<<32, 32, 0, stream>>>(dev_id, 10, h);
     };

cudax/test/stf/reductions/many_inc.cu (+1, -1)

@@ -35,7 +35,7 @@ int main()
   for (int i = 0; i < K; i++)
   {
     // Increment the variable by 1
-    ctx.task(exec_place::device(i % ndevs), handle.redux(redux_op))->*[](auto stream, auto s) {
+    ctx.task(exec_place::device(i % ndevs), handle.relaxed(redux_op))->*[](auto stream, auto s) {
       add<<<1, 1, 0, stream>>>(s.data_handle());
     };
   }

cudax/test/stf/reductions/redux_test.cu (+1, -1)

@@ -25,7 +25,7 @@ int main()
   int a = 17;
   auto handle = ctx.logical_data(make_slice(&a, 1));
   auto redux_op = std::make_shared<slice_reduction_op_sum<int>>();
-  ctx.task(handle.redux(redux_op))->*[](auto stream, auto s) {
+  ctx.task(handle.relaxed(redux_op))->*[](auto stream, auto s) {
     add<<<1, 1, 0, stream>>>(s.data_handle(), 42);
   };


cudax/test/stf/reductions/redux_test2.cu (+1, -1)

@@ -58,7 +58,7 @@ int main()
   };

   // REDUX dev1 (18 + 42)
-  ctx.task(exec_place::device(1), handle.redux(redux_op))->*[](auto stream, auto s) {
+  ctx.task(exec_place::device(1), handle.relaxed(redux_op))->*[](auto stream, auto s) {
     add<<<1, 1, 0, stream>>>(s.data_handle(), 42);
   };


cudax/test/stf/reductions/slice2d_reduction.cu (+1, -1)

@@ -38,7 +38,7 @@ int main()

   auto redux_op = std::make_shared<slice_reduction_op_sum<int, 2>>();

-  ctx.task(handle.redux(redux_op))->*[](auto stream, auto s) {
+  ctx.task(handle.relaxed(redux_op))->*[](auto stream, auto s) {
     add<<<32, 32, 0, stream>>>(s, 42);
   };


cudax/test/stf/reductions/slice_custom_op.cu (+2, -2)

@@ -51,12 +51,12 @@ int main()
   auto op = std::make_shared<slice_reduction_op<bool, 1, OR_op>>();

   // C |= A
-  ctx.task(lC.redux(op), lA.read())->*[](auto stream, auto sC, auto sA) {
+  ctx.task(lC.relaxed(op), lA.read())->*[](auto stream, auto sC, auto sA) {
     cudaMemcpyAsync(sC.data_handle(), sA.data_handle(), sA.extent(0) * sizeof(bool), cudaMemcpyDeviceToDevice, stream);
   };

   // C |= B
-  ctx.task(lC.redux(op), lB.read())->*[](auto stream, auto sC, auto sB) {
+  ctx.task(lC.relaxed(op), lB.read())->*[](auto stream, auto sC, auto sB) {
     cudaMemcpyAsync(sC.data_handle(), sB.data_handle(), sB.extent(0) * sizeof(bool), cudaMemcpyDeviceToDevice, stream);
   };


cudax/test/stf/reductions/successive_reductions.cu (+1, -1)

@@ -50,7 +50,7 @@ int main()
   // We add i (total = N(N-1)/2 + initial_value)
   for (int i = 0; i < N; i++)
   {
-    ctx.task(var_handle.redux(redux_op))->*[=](cudaStream_t stream, auto d_var) {
+    ctx.task(var_handle.relaxed(redux_op))->*[=](cudaStream_t stream, auto d_var) {
       add_val<<<1, 1, 0, stream>>>(d_var.data_handle(), i);
       cuda_safe_call(cudaGetLastError());
     };

cudax/test/stf/reductions/successive_reductions_pfor.cu (+1, -1)

@@ -29,7 +29,7 @@ int main()
   // We add i (total = N(N-1)/2 + initial_value)
   for (int i = 0; i < N; i++)
   {
-    ctx.parallel_for(var_handle.shape(), var_handle.redux(op))->*[=] _CCCL_DEVICE(size_t ind, auto d_var) {
+    ctx.parallel_for(var_handle.shape(), var_handle.relaxed(op))->*[=] _CCCL_DEVICE(size_t ind, auto d_var) {
       atomicAdd(d_var.data_handle(), i);
     };
   }

cudax/test/stf/reductions/sum.cu (+1, -1)

@@ -92,7 +92,7 @@ int main()
   // We add i (total = N(N-1)/2 + initial_value)
   for (int i = 0; i < N; i++)
   {
-    ctx.task(var_handle.redux(redux_op))->*[&](cudaStream_t stream, auto d_var) {
+    ctx.task(var_handle.relaxed(redux_op))->*[&](cudaStream_t stream, auto d_var) {
       add_val<<<1, 1, 0, stream>>>(d_var.data_handle(), i);
     };
   }

cudax/test/stf/reductions/sum_array.cu (+4, -3)

@@ -98,9 +98,10 @@ int main()

   for (int i = 0; i < N; i++)
   {
-    ctx.task(var_handle.redux(redux_op), array_handles[i].read())->*[](cudaStream_t stream, auto d_var, auto d_array_i) {
-      add<<<1, 1, 0, stream>>>(d_array_i.data_handle(), d_var.data_handle());
-    };
+    ctx.task(var_handle.relaxed(redux_op), array_handles[i].read())
+        ->*[](cudaStream_t stream, auto d_var, auto d_array_i) {
+              add<<<1, 1, 0, stream>>>(d_array_i.data_handle(), d_var.data_handle());
+            };
   }

   // Force the reconstruction of data on the device, so that no transfers are

cudax/test/stf/reductions/sum_multiple_places.cu (+2, -2)

@@ -41,13 +41,13 @@ int main()
   // device
   for (int d = 0; d < ndevs; d++)
   {
-    ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[=](cudaStream_t s, auto var) {
+    ctx.task(exec_place::device(d), var_handle.relaxed(redux_op))->*[=](cudaStream_t s, auto var) {
       add_val<int><<<1, 1, 0, s>>>(var, i);
     };
   }

   // host
-  ctx.host_launch(var_handle.redux(redux_op))->*[=](auto var) {
+  ctx.host_launch(var_handle.relaxed(redux_op))->*[=](auto var) {
     var(0) += i;
   };
 }

cudax/test/stf/reductions/sum_multiple_places_no_refvalue.cu (+2, -2)

@@ -94,13 +94,13 @@ int main()
   // device
   for (int d = 0; d < ndevs; d++)
   {
-    ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[&](cudaStream_t s, auto var) {
+    ctx.task(exec_place::device(d), var_handle.relaxed(redux_op))->*[&](cudaStream_t s, auto var) {
       add_val<int><<<1, 1, 0, s>>>(var.data_handle(), i);
     };
   }

   // host
-  ctx.task(exec_place::host, var_handle.redux(redux_op))->*[&](cudaStream_t s, auto var) {
+  ctx.task(exec_place::host, var_handle.relaxed(redux_op))->*[&](cudaStream_t s, auto var) {
     cuda_safe_call(cudaStreamSynchronize(s));
     *var.data_handle() += i;
   };

cudax/test/stf/reductions/write_back_after_redux.cu (+2, -2)

@@ -49,13 +49,13 @@ int main()
   // device
   for (int d = 0; d < ndevs; d++)
   {
-    ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[=](cudaStream_t s, auto var) {
+    ctx.task(exec_place::device(d), var_handle.relaxed(redux_op))->*[=](cudaStream_t s, auto var) {
       add_val<int><<<1, 1, 0, s>>>(var, i);
     };
   }

   // host
-  ctx.host_launch(var_handle.redux(redux_op))->*[=](auto var) {
+  ctx.host_launch(var_handle.relaxed(redux_op))->*[=](auto var) {
     var(0) += i;
   };
 }

docs/cudax/stf.rst (+1, -1)

@@ -558,7 +558,7 @@ write-only access (using the ``write()`` member of ``lX``). A write-only
 access will indeed allocate ``lX`` at the appropriate location, but it
 will not try to load a valid copy of it prior to executing the task.

-Using other access modes such as ``read()``, ``redux()`` or ``rw()``
+Using other access modes such as ``read()``, ``relaxed()`` or ``rw()``
 that attempt to provide a valid instance will result in an error.

 Similarly, it is possible to define a logical data from a slice shapes
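
To make that documentation paragraph concrete, a minimal sketch (the names lX and N, and the shape_of<slice<double>> spelling, are assumptions based on the surrounding docs, not text from this commit):

  auto lX = ctx.logical_data(shape_of<slice<double>>(N));

  // OK: write() merely allocates an instance; no valid copy is fetched.
  ctx.parallel_for(lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t i, auto x) {
    x(i) = 0.0;
  };

  // By contrast, read(), rw() or relaxed() as the first access would fail,
  // since there is no valid instance to provide yet.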
