
Commit 16ac5be

[STF] Rename the redux access mode into relaxed (NVIDIA#2776)
* Rename the redux access mode into relaxed
* Change redux to relaxed in the documentation
* clang-format
1 parent: 4416e8a

19 files changed (+48, -47)

cudax/include/cuda/experimental/__stf/internal/acquire_release.cuh (+3, -3)

@@ -137,9 +137,9 @@ inline event_list task::acquire(backend_ctx_untyped& ctx)
     const data_place& dplace = it->get_dplace() == data_place::affine ? get_affine_data_place() : it->get_dplace();

     const instance_id_t instance_id =
-      mode == access_mode::redux ? d.find_unused_instance_id(dplace) : d.find_instance_id(dplace);
+      mode == access_mode::relaxed ? d.find_unused_instance_id(dplace) : d.find_instance_id(dplace);

-    if (mode == access_mode::redux)
+    if (mode == access_mode::relaxed)
     {
       d.get_data_instance(instance_id).set_redux_op(it->get_redux_op());
     }
@@ -187,7 +187,7 @@ inline event_list task::acquire(backend_ctx_untyped& ctx)
   {
     logical_data_untyped d = e.get_data();

-    if (e.get_access_mode() == access_mode::redux)
+    if (e.get_access_mode() == access_mode::relaxed)
     {
       // Save the last task accessing the instance in with a relaxed coherency mode
       d.get_data_instance(e.get_instance_id()).set_last_task_relaxed(*this);

cudax/include/cuda/experimental/__stf/internal/constants.cuh (+9, -9)

@@ -35,11 +35,11 @@ namespace cuda::experimental::stf
  */
 enum class access_mode : unsigned int
 {
-  none = 0,
-  read = 1,
-  write = 2,
-  rw = 3, // READ + WRITE
-  redux = 4, /* operator ? */
+  none    = 0,
+  read    = 1,
+  write   = 2,
+  rw      = 3, // READ + WRITE
+  relaxed = 4, /* operator ? */
 };

@@ -50,8 +50,8 @@ inline access_mode operator|(access_mode lhs, access_mode rhs)
 {
   assert(as_underlying(lhs) < 16);
   assert(as_underlying(rhs) < 16);
-  EXPECT(lhs != access_mode::redux);
-  EXPECT(rhs != access_mode::redux);
+  EXPECT(lhs != access_mode::relaxed);
+  EXPECT(rhs != access_mode::relaxed);
   return access_mode(as_underlying(lhs) | as_underlying(rhs));
 }

@@ -75,8 +75,8 @@ inline const char* access_mode_string(access_mode mode)
       return "rw";
     case access_mode::write:
       return "write";
-    case access_mode::redux:
-      return "redux"; // op ?
+    case access_mode::relaxed:
+      return "relaxed"; // op ?
     default:
       assert(false);
       abort();
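
For orientation, here is a minimal sketch of what the renamed enumerator means in practice. This fragment is illustrative rather than part of the commit; it assumes the public <cuda/experimental/stf.cuh> umbrella header and relies only on the operators shown in the hunks above:

  #include <cassert>
  #include <cuda/experimental/stf.cuh> // assumed public CUDASTF header

  using cuda::experimental::stf::access_mode;

  int main()
  {
    // read | write still folds into rw via the underlying bit pattern
    assert((access_mode::read | access_mode::write) == access_mode::rw);

    // access_mode::relaxed (formerly access_mode::redux) stands alone:
    // operator| EXPECT()s that neither operand is relaxed, because a
    // reduction-style access cannot be combined with other modes.
    return 0;
  }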

cudax/include/cuda/experimental/__stf/internal/logical_data.cuh (+13, -13)

@@ -242,7 +242,7 @@ public:

   reserved::logical_data_state state;

-  // For temporary or redux accesses, we need to be able to find an available entry
+  // For temporary or relaxed accesses, we need to be able to find an available entry
   ::std::vector<data_instance> used_instances;

   // A string useful for debugging purpose
@@ -917,7 +917,7 @@ public:
       break;
     }

-    case access_mode::redux:
+    case access_mode::relaxed:
       current_instance.set_msir(reserved::msir_state_id::reduction);
       break;
     default:
@@ -1238,9 +1238,9 @@ public:
     return task_dep_untyped(*this, access_mode::rw, mv(dp));
   }

-  task_dep_untyped redux(::std::shared_ptr<reduction_operator_base> op, data_place dp = data_place::affine)
+  task_dep_untyped relaxed(::std::shared_ptr<reduction_operator_base> op, data_place dp = data_place::affine)
   {
-    return task_dep_untyped(*this, access_mode::redux, mv(dp), op);
+    return task_dep_untyped(*this, access_mode::relaxed, mv(dp), op);
   }

   ///@}
@@ -1732,7 +1732,7 @@ inline void reserved::logical_data_untyped_impl::erase()
   assert(ref_id != instance_id_t::invalid);

   // Get the state in which we store previous writer, readers, ...
-  if (h_state.current_mode == access_mode::redux)
+  if (h_state.current_mode == access_mode::relaxed)
   {
     // Reconstruction of the data on the reference data place needed

@@ -1863,14 +1863,14 @@ inline event_list enforce_stf_deps_before(
   auto& dot = *bctx.get_dot();
   const bool dot_is_tracing = dot.is_tracing();

-  if (mode == access_mode::redux)
+  if (mode == access_mode::relaxed)
   {
     // A reduction only needs to wait for previous accesses on the data instance
-    ctx_.current_mode = access_mode::redux;
+    ctx_.current_mode = access_mode::relaxed;

     if (dot_is_tracing)
     {
-      // Add this task to the list of task accessing the logical data in redux mode
+      // Add this task to the list of task accessing the logical data in relaxed mode
       // We only store its id since this is used for dot
       ctx_.pending_redux_id.push_back(task.get_unique_id());
     }
@@ -1882,13 +1882,13 @@ inline event_list enforce_stf_deps_before(
   }

   // This is not a reduction, but perhaps we need to reconstruct the data first?
-  if (ctx_.current_mode == access_mode::redux)
+  if (ctx_.current_mode == access_mode::relaxed)
   {
     assert(eplace.has_value());
     if (dot_is_tracing)
     {
       // Add a dependency between previous tasks accessing the handle
-      // in redux mode, and this task which forces its
+      // in relaxed mode, and this task which forces its
       // reconstruction.
       for (const int redux_task_id : ctx_.pending_redux_id)
       {
@@ -1998,7 +1998,7 @@ inline event_list enforce_stf_deps_before(
 template <typename task_type>
 inline void enforce_stf_deps_after(logical_data_untyped& handle, const task_type& task, const access_mode mode)
 {
-  if (mode == access_mode::redux)
+  if (mode == access_mode::relaxed)
   {
     // no further action is required
     return;
@@ -2295,9 +2295,9 @@ public:
   }

   template <typename... Pack>
-  task_dep<T> redux(Pack&&... pack)
+  task_dep<T> relaxed(Pack&&... pack)
   {
-    return task_dep<T>(*this, access_mode::redux, ::std::forward<Pack>(pack)...);
+    return task_dep<T>(*this, access_mode::relaxed, ::std::forward<Pack>(pack)...);
   }
   ///@}
 };
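
The typed relaxed() overload above is what the tests later in this commit call. As a usage sketch, adapted from cudax/test/stf/reductions/redux_test.cu below (assuming a CUDASTF context ctx and the test's add kernel):

  int a = 17;
  auto handle = ctx.logical_data(make_slice(&a, 1));
  auto redux_op = std::make_shared<slice_reduction_op_sum<int>>();

  // relaxed() takes the same reduction operator the former redux() did;
  // each relaxed task accumulates into a private instance that is later
  // reconstructed with the operator.
  ctx.task(handle.relaxed(redux_op))->*[](cudaStream_t stream, auto s) {
    add<<<1, 1, 0, stream>>>(s.data_handle(), 42);
  };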

cudax/include/cuda/experimental/__stf/internal/task.cuh (+1, -1)

@@ -448,7 +448,7 @@ void dep_allocate(
   }

   // After allocating a reduction instance, we need to initialize it
-  if (mode == access_mode::redux)
+  if (mode == access_mode::relaxed)
   {
     assert(eplace.has_value());
     // We have just allocated a new piece of data to perform

cudax/test/stf/examples/07-cholesky-redux.cu (+1, -1)

@@ -263,7 +263,7 @@ void DGEMM(

   // If beta == 1.0 (we assume this is exactly 1.0), then this operation is
   // an accumulation with the add operator
-  auto dep_c = (beta == 1.0) ? C.handle(C_row, C_col).redux(redux_op) : C.handle(C_row, C_col).rw();
+  auto dep_c = (beta == 1.0) ? C.handle(C_row, C_col).relaxed(redux_op) : C.handle(C_row, C_col).rw();
   auto t = ctx.task(exec_place::device(A.get_preferred_devid(C_row, C_col)),
                     A.handle(A_row, A_col).read(),
                     B.handle(B_row, B_col).read(),

cudax/test/stf/hashtable/fusion_reduction.cu (+1, -1)

@@ -98,7 +98,7 @@ int main()

   for (size_t dev_id = 0; dev_id < 4; dev_id++)
   {
-    ctx.task(h_handle.redux(fusion_op))->*[&](auto stream, auto h) {
+    ctx.task(h_handle.relaxed(fusion_op))->*[&](auto stream, auto h) {
       EXPECT(h.get_capacity() == 2048);
       fill_table<<<32, 32, 0, stream>>>(dev_id, 10, h);
     };

cudax/test/stf/reductions/many_inc.cu (+1, -1)

@@ -35,7 +35,7 @@ int main()
   for (int i = 0; i < K; i++)
   {
     // Increment the variable by 1
-    ctx.task(exec_place::device(i % ndevs), handle.redux(redux_op))->*[](auto stream, auto s) {
+    ctx.task(exec_place::device(i % ndevs), handle.relaxed(redux_op))->*[](auto stream, auto s) {
       add<<<1, 1, 0, stream>>>(s.data_handle());
     };
   }

cudax/test/stf/reductions/redux_test.cu (+1, -1)

@@ -25,7 +25,7 @@ int main()
   int a = 17;
   auto handle = ctx.logical_data(make_slice(&a, 1));
   auto redux_op = std::make_shared<slice_reduction_op_sum<int>>();
-  ctx.task(handle.redux(redux_op))->*[](auto stream, auto s) {
+  ctx.task(handle.relaxed(redux_op))->*[](auto stream, auto s) {
     add<<<1, 1, 0, stream>>>(s.data_handle(), 42);
   };


cudax/test/stf/reductions/redux_test2.cu (+1, -1)

@@ -58,7 +58,7 @@ int main()
   };

   // REDUX dev1 (18 + 42)
-  ctx.task(exec_place::device(1), handle.redux(redux_op))->*[](auto stream, auto s) {
+  ctx.task(exec_place::device(1), handle.relaxed(redux_op))->*[](auto stream, auto s) {
     add<<<1, 1, 0, stream>>>(s.data_handle(), 42);
   };


cudax/test/stf/reductions/slice2d_reduction.cu (+1, -1)

@@ -38,7 +38,7 @@ int main()

   auto redux_op = std::make_shared<slice_reduction_op_sum<int, 2>>();

-  ctx.task(handle.redux(redux_op))->*[](auto stream, auto s) {
+  ctx.task(handle.relaxed(redux_op))->*[](auto stream, auto s) {
     add<<<32, 32, 0, stream>>>(s, 42);
   };


cudax/test/stf/reductions/slice_custom_op.cu (+2, -2)

@@ -51,12 +51,12 @@ int main()
   auto op = std::make_shared<slice_reduction_op<bool, 1, OR_op>>();

   // C |= A
-  ctx.task(lC.redux(op), lA.read())->*[](auto stream, auto sC, auto sA) {
+  ctx.task(lC.relaxed(op), lA.read())->*[](auto stream, auto sC, auto sA) {
     cudaMemcpyAsync(sC.data_handle(), sA.data_handle(), sA.extent(0) * sizeof(bool), cudaMemcpyDeviceToDevice, stream);
   };

   // C |= B
-  ctx.task(lC.redux(op), lB.read())->*[](auto stream, auto sC, auto sB) {
+  ctx.task(lC.relaxed(op), lB.read())->*[](auto stream, auto sC, auto sB) {
     cudaMemcpyAsync(sC.data_handle(), sB.data_handle(), sB.extent(0) * sizeof(bool), cudaMemcpyDeviceToDevice, stream);
   };


cudax/test/stf/reductions/successive_reductions.cu (+1, -1)

@@ -50,7 +50,7 @@ int main()
   // We add i (total = N(N-1)/2 + initial_value)
   for (int i = 0; i < N; i++)
   {
-    ctx.task(var_handle.redux(redux_op))->*[=](cudaStream_t stream, auto d_var) {
+    ctx.task(var_handle.relaxed(redux_op))->*[=](cudaStream_t stream, auto d_var) {
       add_val<<<1, 1, 0, stream>>>(d_var.data_handle(), i);
       cuda_safe_call(cudaGetLastError());
     };

cudax/test/stf/reductions/successive_reductions_pfor.cu (+1, -1)

@@ -29,7 +29,7 @@ int main()
   // We add i (total = N(N-1)/2 + initial_value)
   for (int i = 0; i < N; i++)
   {
-    ctx.parallel_for(var_handle.shape(), var_handle.redux(op))->*[=] _CCCL_DEVICE(size_t ind, auto d_var) {
+    ctx.parallel_for(var_handle.shape(), var_handle.relaxed(op))->*[=] _CCCL_DEVICE(size_t ind, auto d_var) {
       atomicAdd(d_var.data_handle(), i);
     };
   }

cudax/test/stf/reductions/sum.cu (+1, -1)

@@ -92,7 +92,7 @@ int main()
   // We add i (total = N(N-1)/2 + initial_value)
   for (int i = 0; i < N; i++)
   {
-    ctx.task(var_handle.redux(redux_op))->*[&](cudaStream_t stream, auto d_var) {
+    ctx.task(var_handle.relaxed(redux_op))->*[&](cudaStream_t stream, auto d_var) {
       add_val<<<1, 1, 0, stream>>>(d_var.data_handle(), i);
     };
   }

cudax/test/stf/reductions/sum_array.cu (+4, -3)

@@ -98,9 +98,10 @@ int main()

   for (int i = 0; i < N; i++)
   {
-    ctx.task(var_handle.redux(redux_op), array_handles[i].read())->*[](cudaStream_t stream, auto d_var, auto d_array_i) {
-      add<<<1, 1, 0, stream>>>(d_array_i.data_handle(), d_var.data_handle());
-    };
+    ctx.task(var_handle.relaxed(redux_op), array_handles[i].read())
+        ->*[](cudaStream_t stream, auto d_var, auto d_array_i) {
+              add<<<1, 1, 0, stream>>>(d_array_i.data_handle(), d_var.data_handle());
+            };
   }

   // Force the reconstruction of data on the device, so that no transfers are

cudax/test/stf/reductions/sum_multiple_places.cu (+2, -2)

@@ -41,13 +41,13 @@ int main()
   // device
   for (int d = 0; d < ndevs; d++)
   {
-    ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[=](cudaStream_t s, auto var) {
+    ctx.task(exec_place::device(d), var_handle.relaxed(redux_op))->*[=](cudaStream_t s, auto var) {
       add_val<int><<<1, 1, 0, s>>>(var, i);
     };
   }

   // host
-  ctx.host_launch(var_handle.redux(redux_op))->*[=](auto var) {
+  ctx.host_launch(var_handle.relaxed(redux_op))->*[=](auto var) {
     var(0) += i;
   };
 }

cudax/test/stf/reductions/sum_multiple_places_no_refvalue.cu (+2, -2)

@@ -94,13 +94,13 @@ int main()
   // device
   for (int d = 0; d < ndevs; d++)
   {
-    ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[&](cudaStream_t s, auto var) {
+    ctx.task(exec_place::device(d), var_handle.relaxed(redux_op))->*[&](cudaStream_t s, auto var) {
       add_val<int><<<1, 1, 0, s>>>(var.data_handle(), i);
     };
   }

   // host
-  ctx.task(exec_place::host, var_handle.redux(redux_op))->*[&](cudaStream_t s, auto var) {
+  ctx.task(exec_place::host, var_handle.relaxed(redux_op))->*[&](cudaStream_t s, auto var) {
     cuda_safe_call(cudaStreamSynchronize(s));
     *var.data_handle() += i;
   };

cudax/test/stf/reductions/write_back_after_redux.cu (+2, -2)

@@ -49,13 +49,13 @@ int main()
   // device
   for (int d = 0; d < ndevs; d++)
   {
-    ctx.task(exec_place::device(d), var_handle.redux(redux_op))->*[=](cudaStream_t s, auto var) {
+    ctx.task(exec_place::device(d), var_handle.relaxed(redux_op))->*[=](cudaStream_t s, auto var) {
       add_val<int><<<1, 1, 0, s>>>(var, i);
     };
   }

   // host
-  ctx.host_launch(var_handle.redux(redux_op))->*[=](auto var) {
+  ctx.host_launch(var_handle.relaxed(redux_op))->*[=](auto var) {
     var(0) += i;
   };
 }

docs/cudax/stf.rst (+1, -1)

@@ -558,7 +558,7 @@ write-only access (using the ``write()`` member of ``lX``). A write-only
 access will indeed allocate ``lX`` at the appropriate location, but it
 will not try to load a valid copy of it prior to executing the task.

-Using other access modes such as ``read()``, ``redux()`` or ``rw()``
+Using other access modes such as ``read()``, ``relaxed()`` or ``rw()``
 that attempt to provide a valid instance will result in an error.

 Similarly, it is possible to define a logical data from a slice shapes
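
To make that documentation paragraph concrete, a minimal sketch (the names lX and N, and the shape_of<slice<double>> spelling, are assumptions based on the surrounding docs, not text from this commit):

  auto lX = ctx.logical_data(shape_of<slice<double>>(N));

  // OK: write() merely allocates an instance; no valid copy is fetched.
  ctx.parallel_for(lX.shape(), lX.write())->*[] _CCCL_DEVICE(size_t i, auto x) {
    x(i) = 0.0;
  };

  // By contrast, read(), rw() or relaxed() as the first access would fail,
  // since there is no valid instance to provide yet.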
